diff --git a/lib/guessit2/__init__.py b/lib/guessit2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8b5c841b8fa2d4259d799ba41467853fe7cdcc87 --- /dev/null +++ b/lib/guessit2/__init__.py @@ -0,0 +1,8 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Extracts as much information as possible from a video file. +""" +from .api import guessit, GuessItApi + +from .__version__ import __version__ diff --git a/lib/guessit2/__main__.py b/lib/guessit2/__main__.py new file mode 100644 index 0000000000000000000000000000000000000000..0ba11b422f431b65612d1d30c2f2a94a647afdf9 --- /dev/null +++ b/lib/guessit2/__main__.py @@ -0,0 +1,158 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Entry point module +""" +# pragma: no cover +from __future__ import print_function, unicode_literals + +import os +import logging +import json +import sys +from io import open #pylint:disable=redefined-builtin + +import six +from guessit.jsonutils import GuessitEncoder + +from guessit.__version__ import __version__ +from guessit.options import argument_parser +from guessit import api + + +def guess_filename(filename, options): + """ + Guess a single filename using given options + """ + if not options.yaml and not options.json and not options.show_property: + print('For:', filename) + + cmd_options = vars(options) + cmd_options['implicit'] = True # Force implicit option in CLI + + guess = api.guessit(filename, vars(options)) + + if options.show_property: + print(guess.get(options.show_property, '')) + return + + if options.json: + print(json.dumps(guess, cls=GuessitEncoder, ensure_ascii=False)) + elif options.yaml: + import yaml + from guessit import yamlutils + + ystr = yaml.dump({filename: dict(guess)}, Dumper=yamlutils.CustomDumper, default_flow_style=False, + allow_unicode=True) + i = 0 + for yline in ystr.splitlines(): + if i == 0: + print("? 
" + yline[:-1]) + elif i == 1: + print(":" + yline[1:]) + else: + print(yline) + i += 1 + else: + print('GuessIt found:', json.dumps(guess, cls=GuessitEncoder, indent=4, ensure_ascii=False)) + + +def display_properties(options): + """ + Display properties + """ + properties = api.properties(options) + + if options.json: + if options.values: + print(json.dumps(properties, cls=GuessitEncoder, ensure_ascii=False)) + else: + print(json.dumps(list(properties.keys()), cls=GuessitEncoder, ensure_ascii=False)) + elif options.yaml: + import yaml + from guessit import yamlutils + if options.values: + print(yaml.dump(properties, Dumper=yamlutils.CustomDumper, default_flow_style=False, allow_unicode=True)) + else: + print(yaml.dump(list(properties.keys()), Dumper=yamlutils.CustomDumper, default_flow_style=False, + allow_unicode=True)) + else: + print('GuessIt properties:') + + properties_list = list(sorted(properties.keys())) + for property_name in properties_list: + property_values = properties.get(property_name) + print(2 * ' ' + '[+] %s' % (property_name,)) + if property_values and options.values: + for property_value in property_values: + print(4 * ' ' + '[!] 
%s' % (property_value,)) + + +def main(args=None): # pylint:disable=too-many-branches + """ + Main function for entry point + """ + if six.PY2 and os.name == 'nt': # pragma: no cover + # see http://bugs.python.org/issue2128 + import locale + + for i, j in enumerate(sys.argv): + sys.argv[i] = j.decode(locale.getpreferredencoding()) + + if args is None: # pragma: no cover + options = argument_parser.parse_args() + else: + options = argument_parser.parse_args(args) + if options.verbose: + logging.basicConfig(stream=sys.stdout, format='%(message)s') + logging.getLogger().setLevel(logging.DEBUG) + + help_required = True + + if options.version: + print('+-------------------------------------------------------+') + print('+ GuessIt ' + __version__ + (28 - len(__version__)) * ' ' + '+') + print('+-------------------------------------------------------+') + print('| Please report any bug or feature request at |') + print('| https://github.com/wackou/guessit/issues. |') + print('+-------------------------------------------------------+') + help_required = False + + if options.yaml: + try: + import yaml # pylint:disable=unused-variable + except ImportError: # pragma: no cover + options.yaml = False + print('PyYAML is not installed. 
\'--yaml\' option will be ignored ...', file=sys.stderr) + + if options.properties or options.values: + display_properties(options) + help_required = False + + filenames = [] + if options.filename: + for filename in options.filename: + if not isinstance(filename, six.text_type): # pragma: no cover + encoding = sys.getfilesystemencoding() + filename = filename.decode(encoding) + filenames.append(filename) + if options.input_file: + input_file = open(options.input_file, 'r', encoding='utf-8') + try: + filenames.extend([line.strip() for line in input_file.readlines()]) + finally: + input_file.close() + + filenames = list(filter(lambda f: f, filenames)) + + if filenames: + for filename in filenames: + help_required = False + guess_filename(filename, options) + + if help_required: # pragma: no cover + argument_parser.print_help() + + +if __name__ == '__main__': # pragma: no cover + main() diff --git a/lib/guessit2/__version__.py b/lib/guessit2/__version__.py new file mode 100644 index 0000000000000000000000000000000000000000..c6db86db2b87155d0dafedbb894862c93b88b578 --- /dev/null +++ b/lib/guessit2/__version__.py @@ -0,0 +1,7 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Version module +""" +# pragma: no cover +__version__ = '2.0rc5.dev0' diff --git a/lib/guessit2/api.py b/lib/guessit2/api.py new file mode 100644 index 0000000000000000000000000000000000000000..3a4959ad1e4c4cfa61f89278074db790cbb24f76 --- /dev/null +++ b/lib/guessit2/api.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +API functions that can be used by external software +""" +from __future__ import unicode_literals +try: + from collections import OrderedDict +except ImportError: # pragma: no-cover + from ordereddict import OrderedDict # pylint:disable=import-error + +import six + +from rebulk.introspector import introspect + +from .rules import rebulk_builder +from .options import parse_options + + +def guessit(string, options=None): + """ + Retrieves all matches from 
string as a dict + :param string: the filename or release name + :type string: str + :param options: the filename or release name + :type options: str|dict + :return: + :rtype: + """ + return default_api.guessit(string, options) + + +def properties(options=None): + """ + Retrieves all properties with possible values that can be guessed + :param options: + :type options: + :return: + :rtype: + """ + return default_api.properties(options) + + +class GuessItApi(object): + """ + An api class that can be configured with custom Rebulk configuration. + """ + + def __init__(self, rebulk): + """ + :param rebulk: Rebulk instance to use. + :type rebulk: Rebulk + :return: + :rtype: + """ + self.rebulk = rebulk + + def guessit(self, string, options=None): + """ + Retrieves all matches from string as a dict + :param string: the filename or release name + :type string: str + :param options: the filename or release name + :type options: str|dict + :return: + :rtype: + """ + if not isinstance(string, six.text_type): + raise TypeError("guessit input must be %s." % six.text_type.__name__) + options = parse_options(options) + return self.rebulk.matches(string, options).to_dict(options.get('advanced', False), + options.get('implicit', False)) + + def properties(self, options=None): + """ + Grab properties and values that can be generated. 
+ :param options: + :type options: + :return: + :rtype: + """ + unordered = introspect(self.rebulk, options).properties + ordered = OrderedDict() + for k in sorted(unordered.keys(), key=six.text_type): + ordered[k] = list(sorted(unordered[k], key=six.text_type)) + return ordered + + +default_api = GuessItApi(rebulk_builder()) diff --git a/lib/guessit2/backports.py b/lib/guessit2/backports.py new file mode 100644 index 0000000000000000000000000000000000000000..3e94e27ad7438fc7c7fd43b8ac370e30b7d22d72 --- /dev/null +++ b/lib/guessit2/backports.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Backports +""" +# pragma: no-cover +# pylint: disabled + +def cmp_to_key(mycmp): + """functools.cmp_to_key backport""" + class KeyClass(object): + """Key class""" + def __init__(self, obj, *args): # pylint: disable=unused-argument + self.obj = obj + def __lt__(self, other): + return mycmp(self.obj, other.obj) < 0 + def __gt__(self, other): + return mycmp(self.obj, other.obj) > 0 + def __eq__(self, other): + return mycmp(self.obj, other.obj) == 0 + def __le__(self, other): + return mycmp(self.obj, other.obj) <= 0 + def __ge__(self, other): + return mycmp(self.obj, other.obj) >= 0 + def __ne__(self, other): + return mycmp(self.obj, other.obj) != 0 + return KeyClass diff --git a/lib/guessit2/jsonutils.py b/lib/guessit2/jsonutils.py new file mode 100644 index 0000000000000000000000000000000000000000..54bf794316c8180705fb1b4686a191cf6ffc39dc --- /dev/null +++ b/lib/guessit2/jsonutils.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +JSON Utils +""" +import json +try: + from collections import OrderedDict +except ImportError: # pragma: no-cover + from ordereddict import OrderedDict # pylint:disable=import-error + +from rebulk.match import Match + + +class GuessitEncoder(json.JSONEncoder): + """ + JSON Encoder for guessit response + """ + + def default(self, o): # pylint:disable=method-hidden + if isinstance(o, Match): + ret = 
OrderedDict() + ret['value'] = o.value + if o.raw: + ret['raw'] = o.raw + ret['start'] = o.start + ret['end'] = o.end + return ret + elif hasattr(o, 'name'): # Babelfish languages/countries long name + return o.name + else: # pragma: no cover + return str(o) diff --git a/lib/guessit2/options.py b/lib/guessit2/options.py new file mode 100644 index 0000000000000000000000000000000000000000..2cdc02ceafae16919970a7da41b4b00344532f9e --- /dev/null +++ b/lib/guessit2/options.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Options +""" +from __future__ import unicode_literals + +import sys +from argparse import ArgumentParser +import shlex + +import six + + +if six.PY2: + StrOptType = lambda s: unicode(s, sys.stdin.encoding) # pylint:disable=undefined-variable +else: + StrOptType = str # pylint:disable=redefined-variable-type + + +def build_argument_parser(): + """ + Builds the argument parser + :return: the argument parser + :rtype: ArgumentParser + """ + opts = ArgumentParser() + opts.add_argument(dest='filename', help='Filename or release name to guess', nargs='*') + + naming_opts = opts.add_argument_group("Naming") + naming_opts.add_argument('-t', '--type', dest='type', default=None, + help='The suggested file type: movie, episode. 
If undefined, type will be guessed.') + naming_opts.add_argument('-n', '--name-only', dest='name_only', action='store_true', default=False, + help='Parse files as name only, considering "/" and "\\" like other separators.') + naming_opts.add_argument('-Y', '--date-year-first', action='store_true', dest='date_year_first', default=None, + help='If short date is found, consider the first digits as the year.') + naming_opts.add_argument('-D', '--date-day-first', action='store_true', dest='date_day_first', default=None, + help='If short date is found, consider the second digits as the day.') + naming_opts.add_argument('-L', '--allowed-languages', action='append', dest='allowed_languages', + help='Allowed language (can be used multiple times)') + naming_opts.add_argument('-C', '--allowed-countries', action='append', dest='allowed_countries', + help='Allowed country (can be used multiple times)') + naming_opts.add_argument('-E', '--episode-prefer-number', action='store_true', dest='episode_prefer_number', + default=False, + help='Guess "serie.213.avi" as the episode 213. Without this option, ' + 'it will be guessed as season 2, episode 13') + naming_opts.add_argument('-T', '--expected-title', action='append', dest='expected_title', type=StrOptType, + help='Expected title to parse (can be used multiple times)') + naming_opts.add_argument('-G', '--expected-group', action='append', dest='expected_group', type=StrOptType, + help='Expected release group (can be used multiple times)') + + input_opts = opts.add_argument_group("Input") + input_opts.add_argument('-f', '--input-file', dest='input_file', default=False, + help='Read filenames from an input text file. 
File should use UTF-8 charset.') + + output_opts = opts.add_argument_group("Output") + output_opts.add_argument('-v', '--verbose', action='store_true', dest='verbose', default=False, + help='Display debug output') + output_opts.add_argument('-P', '--show-property', dest='show_property', default=None, + help='Display the value of a single property (title, series, video_codec, year, ...)') + output_opts.add_argument('-a', '--advanced', dest='advanced', action='store_true', default=False, + help='Display advanced information for filename guesses, as json output') + output_opts.add_argument('-j', '--json', dest='json', action='store_true', default=False, + help='Display information for filename guesses as json output') + output_opts.add_argument('-y', '--yaml', dest='yaml', action='store_true', default=False, + help='Display information for filename guesses as yaml output') + + + + information_opts = opts.add_argument_group("Information") + information_opts.add_argument('-p', '--properties', dest='properties', action='store_true', default=False, + help='Display properties that can be guessed.') + information_opts.add_argument('-V', '--values', dest='values', action='store_true', default=False, + help='Display property values that can be guessed.') + information_opts.add_argument('--version', dest='version', action='store_true', default=False, + help='Display the guessit version.') + + return opts + + +def parse_options(options): + """ + Parse given option string + :param options: + :type options: + :return: + :rtype: + """ + if isinstance(options, six.string_types): + args = shlex.split(options) + options = vars(argument_parser.parse_args(args)) + if options is None: + options = {} + return options + + +argument_parser = build_argument_parser() diff --git a/lib/guessit2/rules/__init__.py b/lib/guessit2/rules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8e2716cc089436b634f8e508c33be3b84b59e710 --- /dev/null +++ 
b/lib/guessit2/rules/__init__.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Rebulk object default builder +""" +from __future__ import unicode_literals + +from rebulk import Rebulk + +from .markers.path import path +from .markers.groups import groups + +from .properties.episodes import episodes +from .properties.container import container +from .properties.format import format_ +from .properties.video_codec import video_codec +from .properties.audio_codec import audio_codec +from .properties.screen_size import screen_size +from .properties.website import website +from .properties.date import date +from .properties.title import title +from .properties.episode_title import episode_title +from .properties.language import language +from .properties.country import country +from .properties.release_group import release_group +from .properties.other import other +from .properties.edition import edition +from .properties.cds import cds +from .properties.bonus import bonus +from .properties.film import film +from .properties.part import part +from .properties.crc import crc +from .properties.mimetype import mimetype +from .properties.type import type_ + +from .processors import processors + + +def rebulk_builder(): + """ + Default builder for main Rebulk object used by api. 
+ :return: Main Rebulk object + :rtype: Rebulk + """ + rebulk = Rebulk() + + rebulk.rebulk(path()) + rebulk.rebulk(groups()) + + rebulk.rebulk(episodes()) + rebulk.rebulk(container()) + rebulk.rebulk(format_()) + rebulk.rebulk(video_codec()) + rebulk.rebulk(audio_codec()) + rebulk.rebulk(screen_size()) + rebulk.rebulk(website()) + rebulk.rebulk(date()) + rebulk.rebulk(title()) + rebulk.rebulk(episode_title()) + rebulk.rebulk(language()) + rebulk.rebulk(country()) + rebulk.rebulk(release_group()) + rebulk.rebulk(other()) + rebulk.rebulk(edition()) + rebulk.rebulk(cds()) + rebulk.rebulk(bonus()) + rebulk.rebulk(film()) + rebulk.rebulk(part()) + rebulk.rebulk(crc()) + + rebulk.rebulk(processors()) + + rebulk.rebulk(mimetype()) + rebulk.rebulk(type_()) + + return rebulk diff --git a/lib/guessit2/rules/common/__init__.py b/lib/guessit2/rules/common/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a30f087987bdaa4cc941e7046fa561c20a92f9e2 --- /dev/null +++ b/lib/guessit2/rules/common/__init__.py @@ -0,0 +1,13 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Common module +""" +from __future__ import unicode_literals + +seps = r' [](){}+*|=§-_~#/\.,;:' # list of tags/words separators + +title_seps = r'-+/\|' # separators for title + +dash = (r'-', r'[\W_]') # abbreviation used by many rebulk objects. +alt_dash = (r'@', r'[\W_]') # abbreviation used by many rebulk objects. 
diff --git a/lib/guessit2/rules/common/comparators.py b/lib/guessit2/rules/common/comparators.py new file mode 100644 index 0000000000000000000000000000000000000000..f27a407fed168b50a4b9ef64e20f88f7e8b1b757 --- /dev/null +++ b/lib/guessit2/rules/common/comparators.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Comparators +""" +from __future__ import unicode_literals + +try: + from functools import cmp_to_key +except ImportError: + from ...backports import cmp_to_key + + +def marker_comparator_predicate(match): + """ + Match predicate used in comparator + """ + return not match.private and \ + match.name not in ['proper_count', 'title', 'episode_title', 'alternativeTitle'] and \ + not (match.name == 'container' and 'extension' in match.tags) + + +def marker_weight(matches, marker): + """ + Compute the comparator weight of a marker + :param matches: + :param marker: + :return: + """ + return len(set(match.name for match in matches.range(*marker.span, predicate=marker_comparator_predicate))) + + +def marker_comparator(matches, markers): + """ + Builds a comparator that returns markers sorted from the most valuable to the less. + + Take the parts where matches count is higher, then when length is higher, then when position is at left. + + :param matches: + :type matches: + :return: + :rtype: + """ + def comparator(marker1, marker2): + """ + The actual comparator function. + """ + matches_count = marker_weight(matches, marker2) - marker_weight(matches, marker1) + if matches_count: + return matches_count + len_diff = len(marker2) - len(marker1) + if len_diff: + return len_diff + return markers.index(marker2) - markers.index(marker1) + + return comparator + + +def marker_sorted(markers, matches): + """ + Sort markers from matches, from the most valuable to the less. 
+ + :param fileparts: + :type fileparts: + :param matches: + :type matches: + :return: + :rtype: + """ + return sorted(markers, key=cmp_to_key(marker_comparator(matches, markers))) diff --git a/lib/guessit2/rules/common/date.py b/lib/guessit2/rules/common/date.py new file mode 100644 index 0000000000000000000000000000000000000000..56b0987f9c70756e5c28b83100daa07ac5647f98 --- /dev/null +++ b/lib/guessit2/rules/common/date.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Date +""" +from __future__ import unicode_literals + +import regex as re + +from dateutil import parser + +_dsep = r'[-/ \.]' +_dsep_bis = r'[-/ \.x]' + +date_regexps = [ + re.compile(r'%s((\d{8}))%s' % (_dsep, _dsep), re.IGNORECASE), + re.compile(r'%s((\d{6}))%s' % (_dsep, _dsep), re.IGNORECASE), + re.compile(r'(?:^|[^\d])((\d{2})%s(\d{1,2})%s(\d{1,2}))(?:$|[^\d])' % (_dsep, _dsep), re.IGNORECASE), + re.compile(r'(?:^|[^\d])((\d{1,2})%s(\d{1,2})%s(\d{2}))(?:$|[^\d])' % (_dsep, _dsep), re.IGNORECASE), + re.compile(r'(?:^|[^\d])((\d{4})%s(\d{1,2})%s(\d{1,2}))(?:$|[^\d])' % (_dsep_bis, _dsep), re.IGNORECASE), + re.compile(r'(?:^|[^\d])((\d{1,2})%s(\d{1,2})%s(\d{4}))(?:$|[^\d])' % (_dsep, _dsep_bis), re.IGNORECASE), + re.compile(r'(?:^|[^\d])((\d{1,2}(?:st|nd|rd|th)?%s(?:[a-z]{3,10})%s\d{4}))(?:$|[^\d])' % (_dsep, _dsep), + re.IGNORECASE)] + + +def valid_year(year): + """Check if number is a valid year""" + return 1920 <= year < 2030 + + +def search_date(string, year_first=None, day_first=True): + """Looks for date patterns, and if found return the date and group span. + + Assumes there are sentinels at the beginning and end of the string that + always allow matching a non-digit delimiting the date. + + Year can be defined on two digit only. It will return the nearest possible + date from today. + + >>> search_date(' This happened on 2002-04-22. ') + (18, 28, datetime.date(2002, 4, 22)) + + >>> search_date(' And this on 17-06-1998. 
') + (13, 23, datetime.date(1998, 6, 17)) + + >>> search_date(' no date in here ') + """ + start, end = None, None + match = None + for date_re in date_regexps: + search_match = date_re.search(string) + if search_match and (match is None or search_match.end() - search_match.start() > len(match)): + start, end = search_match.start(1), search_match.end(1) + match = '-'.join(search_match.groups()[1:]) + + if match is None: + return + + # If day_first/year_first is undefined, parse is made using both possible values. + yearfirst_opts = [False, True] + if year_first is not None: + yearfirst_opts = [year_first] + + dayfirst_opts = [True, False] + if day_first is not None: + dayfirst_opts = [day_first] + + kwargs_list = ({'dayfirst': d, 'yearfirst': y} for d in dayfirst_opts for y in yearfirst_opts) + for kwargs in kwargs_list: + try: + date = parser.parse(match, **kwargs) + except (ValueError, TypeError): # pragma: no cover + # see https://bugs.launchpad.net/dateutil/+bug/1247643 + date = None + + # check date plausibility + if date and valid_year(date.year): # pylint:disable=no-member + return start, end, date.date() # pylint:disable=no-member diff --git a/lib/guessit2/rules/common/formatters.py b/lib/guessit2/rules/common/formatters.py new file mode 100644 index 0000000000000000000000000000000000000000..ca70811f83eb6d90ac5c7174d5976191a27b0c86 --- /dev/null +++ b/lib/guessit2/rules/common/formatters.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Formatters +""" +from __future__ import unicode_literals + +import regex as re + +from rebulk.formatters import formatters + +from . 
import seps + + +_excluded_clean_chars = ',:;-/\\' +clean_chars = "" +for sep in seps: + if sep not in _excluded_clean_chars: + clean_chars += sep + + +def cleanup(input_string): + """ + Removes and strip separators from input_string (but keep ',;' characters) + :param input_string: + :type input_string: + :return: + :rtype: + """ + for char in clean_chars: + input_string = input_string.replace(char, ' ') + return re.sub(' +', ' ', strip(input_string)) + + +def strip(input_string): + """ + Strip separators from input_string + :param input_string: + :type input_string: + :return: + :rtype: + """ + return input_string.strip(seps) + + +def raw_cleanup(raw): + """ + Cleanup a raw value to perform raw comparison + :param raw: + :type raw: + :return: + :rtype: + """ + return formatters(cleanup, strip)(raw.lower()) + + +def reorder_title(title, articles=('the',), separators=(',', ', ')): + """ + Reorder the title + :param title: + :type title: + :param articles: + :type articles: + :param separators: + :type separators: + :return: + :rtype: + """ + ltitle = title.lower() + for article in articles: + for separator in separators: + suffix = separator + article + if ltitle[-len(suffix):] == suffix: + return title[-len(suffix) + len(separator):] + ' ' + title[:-len(suffix)] + return title diff --git a/lib/guessit2/rules/common/numeral.py b/lib/guessit2/rules/common/numeral.py new file mode 100644 index 0000000000000000000000000000000000000000..8b868ea669e831abc90337057ae90a4eea44b63a --- /dev/null +++ b/lib/guessit2/rules/common/numeral.py @@ -0,0 +1,167 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +parse numeral from various formats +""" +from __future__ import unicode_literals + +import regex as re + +digital_numeral = r'\d{1,4}' + +roman_numeral = r'(?=[MCDLXVI]+)M{0,4}(?:CM|CD|D?C{0,3})(?:XC|XL|L?X{0,3})(?:IX|IV|V?I{0,3})' + +english_word_numeral_list = [ + 'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', + 'eleven', 
'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen', 'twenty' +] + +french_word_numeral_list = [ + 'zéro', 'un', 'deux', 'trois', 'quatre', 'cinq', 'six', 'sept', 'huit', 'neuf', 'dix', + 'onze', 'douze', 'treize', 'quatorze', 'quinze', 'seize', 'dix-sept', 'dix-huit', 'dix-neuf', 'vingt' +] + +french_alt_word_numeral_list = [ + 'zero', 'une', 'deux', 'trois', 'quatre', 'cinq', 'six', 'sept', 'huit', 'neuf', 'dix', + 'onze', 'douze', 'treize', 'quatorze', 'quinze', 'seize', 'dixsept', 'dixhuit', 'dixneuf', 'vingt' +] + + +def __build_word_numeral(*args): + """ + Build word numeral regexp from list. + + :param args: + :type args: + :param kwargs: + :type kwargs: + :return: + :rtype: + """ + re_ = None + for word_list in args: + for word in word_list: + if not re_: + re_ = r'(?:(?=\w+)' + else: + re_ += '|' + re_ += word + re_ += ')' + return re_ + + +word_numeral = __build_word_numeral(english_word_numeral_list, french_word_numeral_list, french_alt_word_numeral_list) + +numeral = '(?:' + digital_numeral + '|' + roman_numeral + '|' + word_numeral + ')' + +__romanNumeralMap = ( + ('M', 1000), + ('CM', 900), + ('D', 500), + ('CD', 400), + ('C', 100), + ('XC', 90), + ('L', 50), + ('XL', 40), + ('X', 10), + ('IX', 9), + ('V', 5), + ('IV', 4), + ('I', 1) +) + +__romanNumeralPattern = re.compile('^' + roman_numeral + '$') + + +def __parse_roman(value): + """ + convert Roman numeral to integer + + :param value: Value to parse + :type value: string + :return: + :rtype: + """ + if not __romanNumeralPattern.search(value): + raise ValueError('Invalid Roman numeral: %s' % value) + + result = 0 + index = 0 + for num, integer in __romanNumeralMap: + while value[index:index + len(num)] == num: + result += integer + index += len(num) + return result + + +def __parse_word(value): + """ + Convert Word numeral to integer + + :param value: Value to parse + :type value: string + :return: + :rtype: + """ + for word_list in 
[english_word_numeral_list, french_word_numeral_list, french_alt_word_numeral_list]: + try: + return word_list.index(value.lower()) + except ValueError: + pass + raise ValueError # pragma: no cover + + +_clean_re = re.compile(r'[^\d]*(\d+)[^\d]*') + + +def parse_numeral(value, int_enabled=True, roman_enabled=True, word_enabled=True, clean=True): + """ + Parse a numeric value into integer. + + :param value: Value to parse. Can be an integer, roman numeral or word. + :type value: string + :param int_enabled: + :type int_enabled: + :param roman_enabled: + :type roman_enabled: + :param word_enabled: + :type word_enabled: + :param clean: + :type clean: + :return: Numeric value, or None if value can't be parsed + :rtype: int + """ + # pylint: disable=too-many-branches + if int_enabled: + try: + if clean: + match = _clean_re.match(value) + if match: + clean_value = match.group(1) + return int(clean_value) + return int(value) + except ValueError: + pass + if roman_enabled: + try: + if clean: + for word in value.split(): + try: + return __parse_roman(word.upper()) + except ValueError: + pass + return __parse_roman(value) + except ValueError: + pass + if word_enabled: + try: + if clean: + for word in value.split(): + try: + return __parse_word(word) + except ValueError: # pragma: no cover + pass + return __parse_word(value) # pragma: no cover + except ValueError: # pragma: no cover + pass + raise ValueError('Invalid numeral: ' + value) # pragma: no cover diff --git a/lib/guessit2/rules/common/validators.py b/lib/guessit2/rules/common/validators.py new file mode 100644 index 0000000000000000000000000000000000000000..f25dd00087a95589480b5f80fca10d6a8479b9f2 --- /dev/null +++ b/lib/guessit2/rules/common/validators.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Validators +""" +from __future__ import unicode_literals + +from functools import partial + +from rebulk.validators import chars_before, chars_after, chars_surround +from . 
def int_coercable(string):
    """
    Check if string can be coerced to int.

    :param string: value to check
    :type string: str
    :return: True if ``int(string)`` succeeds, False otherwise
    :rtype: bool
    """
    try:
        int(string)
    except (TypeError, ValueError):
        # ValueError: text that is not a number; TypeError: None or any type int() cannot accept.
        return False
    return True
'oss', 'iii', + 'vi', 'ben', 'da', 'lt', 'ch', 'sr', 'ps', 'cx', 'vo', + # new from babelfish + 'mkv', 'avi', 'dmd', 'the', 'dis', 'cut', 'stv', 'des', 'dia', 'and', + 'cab', 'sub', 'mia', 'rim', 'las', 'une', 'par', 'srt', 'ano', 'toy', + 'job', 'gag', 'reel', 'www', 'for', 'ayu', 'csi', 'ren', 'moi', 'sur', + 'fer', 'fun', 'two', 'big', 'psy', 'air', + # movie title + 'brazil', + # release groups + 'bs', # Bosnian + 'kz', + # countries + 'gt', 'lt', 'im', + # part/pt + 'pt', + # screener + 'scr' +]) diff --git a/lib/guessit2/rules/markers/__init__.py b/lib/guessit2/rules/markers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6a48a13b388724e96b1bceb7a9e6087ae17f025c --- /dev/null +++ b/lib/guessit2/rules/markers/__init__.py @@ -0,0 +1,5 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Markers +""" diff --git a/lib/guessit2/rules/markers/groups.py b/lib/guessit2/rules/markers/groups.py new file mode 100644 index 0000000000000000000000000000000000000000..15104b869295619876ba4106c980da0a0eec2172 --- /dev/null +++ b/lib/guessit2/rules/markers/groups.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Groups markers (...), [...] and {...} +""" +from __future__ import unicode_literals + +from rebulk import Rebulk + + +def groups(): + """ + Builder for rebulk object. + :return: Created Rebulk object + :rtype: Rebulk + """ + rebulk = Rebulk() + rebulk.defaults(name="group", marker=True) + + starting = '([{' + ending = ')]}' + + def mark_groups(input_string): + """ + Functional pattern to mark groups (...), [...] and {...}. 
def path():
    """
    Builder for rebulk object.

    :return: Created Rebulk object
    :rtype: Rebulk
    """
    rebulk = Rebulk()
    rebulk.defaults(name="path", marker=True)

    def mark_path(input_string, context):
        """
        Functional pattern to mark path elements.

        :param input_string: string to split into path elements
        :param context: options; when 'name_only' is truthy the whole string is a single element
        :return: list of (start, end) spans, one per path element
        """
        if context.get('name_only', False):
            return [(0, len(input_string))]

        indices = list(find_all(input_string, '/'))
        # '\\' is ONE backslash character. The former raw literal r'\\' was two characters long, so
        # (assuming find_all looks for the literal substring - confirm against rebulk.utils) a single
        # backslash was never treated as a path separator.
        indices += list(find_all(input_string, '\\'))
        # Sentinels so that the first and last elements get a span as well.
        indices += [-1, len(input_string)]

        indices.sort()

        return [(indices[i] + 1, indices[i + 1]) for i in range(len(indices) - 1)]

    rebulk.functional(mark_path)
    return rebulk
class EquivalentHoles(Rule):
    """
    Creates equivalent matches for holes that have same values than existing (case insensitive)

    For every path marker, each hole is compared (lower-cased) with the string values of the existing
    matches. When they are equal, the hole takes the name of that match, is tagged 'equivalent' and is
    returned; both the hole and the match get the value chosen by _preferred_string.
    """
    priority = POST_PROCESS
    consequence = AppendMatch

    def when(self, matches, context):
        new_matches = []

        for filepath in marker_sorted(matches.markers.named('path'), matches):
            holes = matches.holes(start=filepath.start, end=filepath.end, formatter=cleanup)
            for name in matches.names:
                # Iterate over a copy: holes is modified below.
                for hole in list(holes):
                    for current_match in matches.named(name):
                        if isinstance(current_match.value, six.string_types) and \
                                        hole.value.lower() == current_match.value.lower():
                            if 'equivalent-ignore' in current_match.tags:
                                continue
                            # Align both values on the preferred casing.
                            new_value = _preferred_string(hole.value, current_match.value)
                            if hole.value != new_value:
                                hole.value = new_value
                            if current_match.value != new_value:
                                current_match.value = new_value
                            hole.name = name
                            hole.tags = ['equivalent']
                            # NOTE(review): the loop over current_match goes on after this append, so a
                            # hole equal to several matches of the same name looks like it can be
                            # appended more than once - confirm whether that is intended.
                            new_matches.append(hole)
                            if hole in holes:
                                holes.remove(hole)

        return new_matches
def _count_title_words(value):
    """
    Count how many words of value are title-cased.

    :param value: string to inspect
    :type value: str
    :return: number of words, as found by iter_words, for which ``str.istitle`` is true
    :rtype: int
    """
    return sum(1 for found in iter_words(value) if found.group(0).istitle())
def audio_codec():
    """
    Builder for rebulk object.

    Registers, in this order, the audio_codec patterns, the audio_profile patterns (each tagged with
    the codec name it belongs to), the audio_channels patterns and finally the audio rules.

    :return: Created Rebulk object
    :rtype: Rebulk
    """
    rebulk = Rebulk().regex_defaults(flags=re.IGNORECASE, abbreviations=[dash]).string_defaults(ignore_case=True)
    rebulk.defaults(name="audio_codec")

    rebulk.regex("MP3", "LAME", r"LAME(?:\d)+-?(?:\d)+", value="MP3")
    rebulk.regex("DolbyDigital", "DD", value="DolbyDigital")
    rebulk.regex("AAC", value="AAC")
    rebulk.regex("AC3", value="AC3")
    rebulk.regex("Flac", value="FLAC")
    rebulk.regex("DTS", value="DTS")
    rebulk.regex("True-?HD", value="TrueHD")

    # The tag is the codec value the profile belongs to; AudioProfileRule filters profiles on it.
    rebulk.defaults(name="audio_profile")
    rebulk.string("HD", value="HD", tags="DTS")
    rebulk.regex("HD-?MA", value="HDMA", tags="DTS")
    rebulk.string("HE", value="HE", tags="AAC")
    rebulk.string("LC", value="LC", tags="AAC")
    rebulk.string("HQ", value="HQ", tags="AC3")

    rebulk.defaults(name="audio_channels")
    # Digit, any single non-word character or underscore, digit - not followed by another digit.
    rebulk.regex(r'(7[\W_]1)(?:[^\d]|$)', value='7.1', children=True)
    rebulk.regex(r'(5[\W_]1)(?:[^\d]|$)', value='5.1', children=True)
    rebulk.regex(r'(2[\W_]0)(?:[^\d]|$)', value='2.0', children=True)
    rebulk.string('7ch', '8ch', value='7.1')
    rebulk.string('5ch', '6ch', value='5.1')
    rebulk.string('2ch', 'stereo', value='2.0')
    rebulk.string('1ch', 'mono', value='1.0')

    rebulk.rules(DtsRule, AacRule, Ac3Rule, AudioValidatorRule, HqConflictRule)

    return rebulk
class AudioProfileRule(Rule):
    """
    Abstract rule to validate audio profiles.

    Returns every audio_profile match tagged with the codec of this rule that has neither a previous
    nor a next audio_codec match whose value is that codec. The consequence is RemoveMatch.
    """
    priority = 64
    dependency = AudioValidatorRule
    consequence = RemoveMatch

    def __init__(self, codec):
        super(AudioProfileRule, self).__init__()
        self.codec = codec

    def when(self, matches, context):
        def is_own_codec(match):
            """Tell whether match is an audio_codec match carrying the codec of this rule."""
            return match.name == 'audio_codec' and match.value == self.codec

        orphans = []
        for profile in matches.named('audio_profile', lambda match: self.codec in match.tags):
            # Look backward first, forward only when nothing was found before the profile.
            if matches.previous(profile, is_own_codec) or matches.next(profile, is_own_codec):
                continue
            orphans.append(profile)
        return orphans
def bonus():
    """
    Builder for rebulk object.

    Registers the ``x<number>`` bonus pattern and the rule looking for the bonus title.

    :return: Created Rebulk object
    :rtype: Rebulk
    """
    rebulk = Rebulk().regex_defaults(flags=re.IGNORECASE)

    rebulk.regex(r'x(\d+)', name='bonus', private_parent=True, children=True, formatter=int,
                 # seps_surround is passed as the validator itself. The former
                 # ``lambda match: seps_surround`` returned the function object, which is always
                 # truthy, so the parent match was never actually validated.
                 validator={'__parent__': seps_surround},
                 conflict_solver=lambda match, conflicting: match
                 if conflicting.name in ['video_codec', 'episode'] and 'bonus-conflict' not in conflicting.tags
                 else '__default__')

    rebulk.rules(BonusTitleRule)

    return rebulk
def cds():
    """
    Builder for rebulk object.

    Registers two patterns: ``cd<N>`` optionally followed by ``of<M>`` (cd and cd_count), and
    ``<M>cd`` / ``<M>cds`` (cd_count only). Values are converted with int and must be greater than 0.

    :return: Created Rebulk object
    :rtype: Rebulk
    """
    rebulk = Rebulk().regex_defaults(flags=re.IGNORECASE, abbreviations=[dash])

    rebulk.regex(r'cd-?(?P<cd>\d+)(?:-?of-?(?P<cd_count>\d+))?',
                 validator={'cd': lambda match: match.value > 0, 'cd_count': lambda match: match.value > 0},
                 formatter={'cd': int, 'cd_count': int},
                 children=True,
                 private_parent=True,
                 properties={'cd': [None], 'cd_count': [None]})
    # NOTE(review): this pattern has no 'cd' group, so its 'cd' validator entry looks unused - confirm.
    rebulk.regex(r'(?P<cd_count>\d+)-?cds?',
                 validator={'cd': lambda match: match.value > 0, 'cd_count': lambda match: match.value > 0},
                 formatter={'cd_count': int},
                 children=True,
                 private_parent=True,
                 properties={'cd': [None], 'cd_count': [None]})

    return rebulk
+ :return: Created Rebulk object + :rtype: Rebulk + """ + rebulk = Rebulk().regex_defaults(flags=re.IGNORECASE).string_defaults(ignore_case=True) + rebulk.defaults(name='container', + formatter=lambda value: value[1:], + tags=['extension'], + conflict_solver=lambda match, other: other + if other.name in ['format', 'video_codec'] or + other.name == 'container' and 'extension' not in other.tags + else '__default__') + + subtitles = ['srt', 'idx', 'sub', 'ssa', 'ass'] + info = ['nfo'] + videos = ['3g2', '3gp', '3gp2', 'asf', 'avi', 'divx', 'flv', 'm4v', 'mk2', + 'mka', 'mkv', 'mov', 'mp4', 'mp4a', 'mpeg', 'mpg', 'ogg', 'ogm', + 'ogv', 'qt', 'ra', 'ram', 'rm', 'ts', 'wav', 'webm', 'wma', 'wmv', + 'iso', 'vob'] + torrent = ['torrent'] + + rebulk.regex(r'\.\L<exts>$', exts=subtitles, tags=['extension', 'subtitle']) + rebulk.regex(r'\.\L<exts>$', exts=info, tags=['extension', 'info']) + rebulk.regex(r'\.\L<exts>$', exts=videos, tags=['extension', 'video']) + rebulk.regex(r'\.\L<exts>$', exts=torrent, tags=['extension', 'torrent']) + + rebulk.defaults(name='container', + validator=seps_surround, + conflict_solver=lambda match, other: match + if other.name in ['format', + 'video_codec'] or other.name == 'container' and 'extension' in other.tags + else '__default__') + + rebulk.string(*[sub for sub in subtitles if sub not in ['sub']], tags=['subtitle']) + rebulk.string(*videos, tags=['video']) + rebulk.string(*torrent, tags=['torrent']) + + return rebulk diff --git a/lib/guessit2/rules/properties/country.py b/lib/guessit2/rules/properties/country.py new file mode 100644 index 0000000000000000000000000000000000000000..4cda291ea6f05dec52f216078e67463aa3db0b51 --- /dev/null +++ b/lib/guessit2/rules/properties/country.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +country property +""" +# pylint: disable=no-member +from __future__ import unicode_literals + +import babelfish + +from rebulk import Rebulk +from ..common.words import COMMON_WORDS, 
class GuessitCountryConverter(babelfish.CountryReverseConverter):
    """
    Country converter that adds the synonyms of COUNTRIES_SYN on top of the babelfish lookups.
    """

    def __init__(self):
        # Lower-cased synonym -> alpha2 code.
        self.guessit_exceptions = {synonym.lower(): alpha2
                                   for alpha2, synonyms in COUNTRIES_SYN.items()
                                   for synonym in synonyms}

    @property
    def codes(self):
        """
        All the names this converter knows about.
        """
        known_names = babelfish.country_converters['name'].codes
        known_countries = frozenset(babelfish.COUNTRIES.values())
        known_synonyms = frozenset(self.guessit_exceptions)
        return known_names | known_countries | known_synonyms

    def convert(self, alpha2):
        """
        Convert an alpha2 code to its display string ('GB' is rendered as 'UK').
        """
        return 'UK' if alpha2 == 'GB' else str(babelfish.Country(alpha2))

    def reverse(self, name):
        """
        Find the alpha2 code of name.

        :raise babelfish.CountryReverseError: if name cannot be resolved
        """
        # exceptions come first, as they need to override a potential match
        # with any of the other guessers
        synonym = name.lower()
        if synonym in self.guessit_exceptions:
            return self.guessit_exceptions[synonym]

        # Then name as a country code.
        try:
            return babelfish.Country(name.upper()).alpha2
        except ValueError:
            pass

        # Finally name as a country name.
        try:
            return babelfish.Country.fromname(name).alpha2
        except babelfish.CountryReverseError:
            pass

        raise babelfish.CountryReverseError(name)
def find_countries(string, context=None):
    """
    Find countries in given string.

    :param string: input string, scanned word by word
    :type string: str
    :param context: options forwarded to is_valid_country
    :type context: dict
    :return: list of (start, end, {'value': country}) tuples; offsets are relative to string
    :rtype: list
    """
    ret = []
    # The string is no longer stripped before being scanned: stripping leading whitespace shifted
    # every reported offset to the left, while it cannot change which words are found (iter_words
    # only yields runs of word characters).
    for word_match in iter_words(string.lower()):
        try:
            country_object = babelfish.Country.fromguessit(word_match.group())
            if is_valid_country(country_object, context):
                ret.append((word_match.start(), word_match.end(), {'value': country_object}))
        except babelfish.Error:
            # Not a country: ignore this word.
            continue
    return ret
def guess_idnumber(string):
    """
    Find the spans of string that look like an id number.

    Candidates are the matches of _idnum. A candidate is kept when its characters switch often
    enough between digit, letter and other characters, and when its letters change often enough.

    :param string: string to search in
    :type string: str
    :return: spans of the retained candidates
    :rtype: list[tuple[int, int]]
    """
    # pylint:disable=invalid-name
    spans = []

    for candidate in _idnum.finditer(string):
        token = candidate.group('uuid')
        kind_changes = 0
        letter_changes = 0
        letters_seen = 0
        previous_letter = None
        previous_kind = _LETTER

        for char in token:
            if char in '0123456789':
                kind = _DIGIT
            elif char in 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ':
                kind = _LETTER
                letters_seen += 1
                if char != previous_letter:
                    letter_changes += 1
                previous_letter = char
            else:
                kind = _OTHER

            if kind != previous_kind:
                kind_changes += 1
            previous_kind = kind

        # only return the result as probable if we alternate often between
        # char type (more likely for hash values than for common words)
        kind_ratio = float(kind_changes) / len(token)
        if letters_seen > 0:
            letter_ratio = float(letter_changes) / letters_seen
        else:
            letter_ratio = 1

        if kind_ratio > 0.4 and letter_ratio > 0.4:
            spans.append(candidate.span())

    return spans
def date():
    """
    Builder for rebulk object.

    Registers the four digit year pattern, the functional date pattern and the
    KeepMarkedYearInFilepart rule.

    :return: Created Rebulk object
    :rtype: Rebulk
    """
    rebulk = Rebulk().defaults(validator=seps_surround)

    # A year is four digits that pass both seps_surround and valid_year.
    rebulk.regex(r"\d{4}", name="year", formatter=int,
                 validator=lambda match: seps_surround(match) and valid_year(match.value))

    def date_functional(string, context):
        """
        Search for date in the string and retrieves match

        :param string: string to search in
        :param context: options; 'date_year_first' and 'date_day_first' are forwarded to search_date
        :return: (ret[0], ret[1], {'value': ret[2]}) built from the search_date result when it is
            truthy, None otherwise
        """

        ret = search_date(string, context.get('date_year_first'), context.get('date_day_first'))
        if ret:
            return ret[0], ret[1], {'value': ret[2]}

    rebulk.functional(date_functional, name="date", properties={'date': [None]},
                      conflict_solver=lambda match, other: other
                      if other.name in ['episode', 'season']
                      else '__default__')

    rebulk.rules(KeepMarkedYearInFilepart)

    return rebulk
def edition():
    """
    Builder for rebulk object.

    Registers the edition patterns; every pattern sets a fixed, normalized value.

    :return: Created Rebulk object
    :rtype: Rebulk
    """
    rebulk = Rebulk().regex_defaults(flags=re.IGNORECASE, abbreviations=[dash]).string_defaults(ignore_case=True)
    rebulk.defaults(name='edition', validator=seps_surround)

    rebulk.regex('collector', 'collector-edition', 'edition-collector', value='Collector Edition')
    # Custom conflict handling only against an episode_details match whose value is 'Special'.
    rebulk.regex('special-edition', 'edition-special', value='Special Edition',
                 conflict_solver=lambda match, other: other
                 if other.name == 'episode_details' and other.value == 'Special'
                 else '__default__')
    rebulk.regex('criterion-edition', 'edition-criterion', value='Criterion Edition')
    rebulk.regex('deluxe', 'deluxe-edition', 'edition-deluxe', value='Deluxe Edition')
    rebulk.regex('director\'?s?-cut', 'director\'?s?-cut-edition', 'edition-director\'?s?-cut', value='Director\'s cut')

    return rebulk
class TitleToEpisodeTitle(Rule):
    """
    If multiple title are found, convert the ones following an episode number to episode_title.

    Nothing is done when fewer than two title matches exist. The title values are not compared with
    each other.
    """
    dependency = TitleFromPosition

    def when(self, matches, context):
        titles = matches.named('title')

        if len(titles) < 2:
            return

        # Titles for which an 'episode' match is found before them.
        episode_titles = [title for title in titles
                          if matches.previous(title, lambda match: match.name == 'episode')]

        if episode_titles:
            return episode_titles

    def then(self, matches, when_response, context):
        # Each match is removed, renamed and appended again rather than renamed in place.
        for title in when_response:
            matches.remove(title)
            title.name = 'episode_title'
            matches.append(title)
class AlternativeTitleReplace(Rule):
    """
    If alternateTitle was found and title is next to episode, season or date, replace it with episode_title.

    Nothing is done when an episode_title match already exists.
    """
    dependency = EpisodeTitleFromPosition
    consequence = RenameMatch

    def when(self, matches, context):
        if matches.named('episode_title'):
            return

        alternative_title = matches.range(predicate=lambda match: match.name == 'alternativeTitle', index=0)
        if not alternative_title:
            return

        main_title = matches.chain_before(alternative_title.start, seps=seps,
                                          predicate=lambda match: 'title' in match.tags, index=0)
        if not main_title:
            return

        anchor_names = ['episode', 'episode_details', 'episode_count', 'season', 'season_count',
                        'date', 'title', 'year']
        episode = matches.previous(main_title,
                                   lambda previous: any(name in previous.names for name in anchor_names),
                                   0)
        crc32 = matches.named('crc32')

        if episode or crc32:
            return alternative_title

    def then(self, matches, when_response, context):
        # Remove, rename, then append the match again.
        renamed = when_response
        matches.remove(renamed)
        renamed.name = 'episode_title'
        matches.append(renamed)
class Filepart2EpisodeTitle(Rule):
    """
    Find the title in the parent directory when at least 2 fileparts are structured like this:

        Serie name S01/E01-episode_title.mkv
        AAAAAAAAAAAAA/BBBBBBBBBBBBBBBBBBBBB

    If BBBB contains an episode match and AAAA contains a season match,
    then the first hole of AAAA having a value is appended as title.
    """
    consequence = AppendMatch('title')

    def when(self, matches, context):
        """
        :return: the hole to append as title, or None when the path does not have the expected structure
        """
        path_parts = matches.markers.named('path')
        if len(path_parts) < 2:
            return None

        parent, leaf = path_parts[-2], path_parts[-1]

        # The last filepart must hold the episode ...
        if not matches.range(leaf.start, leaf.end, lambda match: match.name == 'episode', 0):
            return None
        # ... and its parent directory the season.
        if not matches.range(parent.start, parent.end, lambda match: match.name == 'season', 0):
            return None

        title_hole = matches.holes(parent.start, parent.end, formatter=cleanup, seps=title_seps,
                                   predicate=lambda match: match.value, index=0)
        return title_hole or None
+from __future__ import unicode_literals + +from collections import defaultdict +import copy + +import regex as re + +from rebulk import Rebulk, RemoveMatch, Rule, AppendMatch, RenameMatch +from .title import TitleFromPosition +from ..common.validators import seps_surround +from ..common import dash, alt_dash +from ..common.numeral import numeral, parse_numeral + + +def episodes(): + """ + Builder for rebulk object. + :return: Created Rebulk object + :rtype: Rebulk + """ + rebulk = Rebulk() + rebulk.regex_defaults(flags=re.IGNORECASE).string_defaults(ignore_case=True) + rebulk.defaults(private_names=['episodeSeparator', 'seasonSeparator']) + + # 01x02, 01x02x03x04 + rebulk.regex(r'(?P<season>\d+)@?x@?(?P<episode>\d+)' + + r'(?:(?P<episodeSeparator>x|-|\+|&)(?P<episode>\d+))*', + # S01E02, S01x02, S01E02E03, S01Ex02, S01xE02, SO1Ex02Ex03 + r'S(?P<season>\d+)@?(?:xE|Ex|E|x)@?(?P<episode>\d+)' + + r'(?:(?P<episodeSeparator>xE|Ex|E|x|-|\+|&)(?P<episode>\d+))*', + # S01 + r'S(?P<season>\d+)' + + r'(?:(?P<seasonSeparator>S|-|\+|&)(?P<season>\d+))*', + formatter={'season': int, 'episode': int}, + tags=['SxxExx'], + abbreviations=[alt_dash], + children=True, + private_parent=True, + conflict_solver=lambda match, other: match + if match.name in ['season', 'episode'] and other.name in + ['screen_size', 'video_codec', 'audio_codec', 'audio_channels', 'container', 'date'] + else '__default__') + + # episode_details property + for episode_detail in ('Special', 'Bonus', 'Omake', 'Ova', 'Oav', 'Pilot', 'Unaired'): + rebulk.string(episode_detail, value=episode_detail, name='episode_details') + rebulk.regex(r'Extras?', name='episode_details', value='Extras') + + rebulk.defaults(private_names=['episodeSeparator', 'seasonSeparator'], + validate_all=True, validator={'__parent__': seps_surround}, children=True, private_parent=True) + + season_words = ['season', 'saison', 'serie', 'seasons', 'saisons', 'series'] + episode_words = ['episode', 'episodes', 'ep'] + of_words = ['of', 'sur'] 
+ all_words = ['All'] + + rebulk.regex(r'\L<season_words>@?(?P<season>' + numeral + ')' + + r'(?:@?\L<of_words>@?(?P<count>' + numeral + '))?' + + r'(?:@?(?P<seasonSeparator>-)@?(?P<season>\d+))*' + + r'(?:@?(?P<seasonSeparator>\+|&)@?(?P<season>\d+))*', + of_words=of_words, + season_words=season_words, # Season 1, # Season one + abbreviations=[alt_dash], formatter={'season': parse_numeral, 'count': parse_numeral}) + + rebulk.regex(r'\L<episode_words>-?(?P<episode>\d+)' + + r'(?:v(?P<version>\d+))?' + + r'(?:-?\L<of_words>?-?(?P<count>\d+))?', + of_words=of_words, + episode_words=episode_words, # Episode 4 + abbreviations=[dash], formatter=int, + disabled=lambda context: context.get('type') == 'episode') + + rebulk.regex(r'\L<episode_words>-?(?P<episode>' + numeral + ')' + + r'(?:v(?P<version>\d+))?' + + r'(?:-?\L<of_words>?-?(?P<count>\d+))?', + of_words=of_words, + episode_words=episode_words, # Episode 4 + abbreviations=[dash], formatter={'episode': parse_numeral, 'version': int, 'count': int}, + disabled=lambda context: context.get('type') != 'episode') + + rebulk.regex(r'S?(?P<season>\d+)-?(?:xE|Ex|E|x)-?(?P<other>\L<all_words>)', + tags=['SxxExx'], + all_words=all_words, + abbreviations=[dash], + validator=None, + formatter={'season': int, 'other': lambda match: 'Complete'}) + + rebulk.defaults(private_names=['episodeSeparator', 'seasonSeparator'], validate_all=True, + validator={'__parent__': seps_surround}, children=True, private_parent=True) + + # 12, 13 + rebulk.regex(r'(?P<episode>\d{2})' + + r'(?:v(?P<version>\d+))?' + + r'(?:(?P<episodeSeparator>[x-])(?P<episode>\d{2}))*', + tags=['bonus-conflict', 'weak-movie'], formatter={'episode': int, 'version': int}) + + # 012, 013 + rebulk.regex(r'0(?P<episode>\d{1,2})' + + r'(?:v(?P<version>\d+))?' 
+ + r'(?:(?P<episodeSeparator>[x-])0(?P<episode>\d{1,2}))*', + tags=['bonus-conflict', 'weak-movie'], formatter={'episode': int, 'version': int}) + + # 112, 113 + rebulk.regex(r'(?P<episode>\d{3,4})' + + r'(?:v(?P<version>\d+))?' + + r'(?:(?P<episodeSeparator>[x-])(?P<episode>\d{3,4}))*', + tags=['bonus-conflict', 'weak-movie'], formatter={'episode': int, 'version': int}, + disabled=lambda context: not context.get('episode_prefer_number', False)) + + # 1, 2, 3 + rebulk.regex(r'(?P<episode>\d)' + + r'(?:v(?P<version>\d+))?' + + r'(?:(?P<episodeSeparator>[x-])(?P<episode>\d{1,2}))*', + tags=['bonus-conflict', 'weak-movie'], formatter={'episode': int, 'version': int}, + disabled=lambda context: context.get('type') != 'episode') + + # e112, e113 + rebulk.regex(r'e(?P<episode>\d{1,4})' + + r'(?:v(?P<version>\d+))?' + + r'(?:(?P<episodeSeparator>e|x|-)(?P<episode>\d{1,4}))*', + formatter={'episode': int, 'version': int}) + + # ep 112, ep113, ep112, ep113 + rebulk.regex(r'ep-?(?P<episode>\d{1,4})' + + r'(?:v(?P<version>\d+))?' + + r'(?:(?P<episodeSeparator>ep|e|x|-)(?P<episode>\d{1,4}))*', + abbreviations=[dash], + formatter={'episode': int, 'version': int}) + + # 102, 0102 + rebulk.regex(r'(?P<season>\d{1,2})(?P<episode>\d{2})' + + r'(?:v(?P<version>\d+))?' 
class CountValidator(Rule):
    """
    Dispatch each ``count`` match to episode_count or season_count depending on the match found before
    it, and remove the counts that follow neither an episode nor a season.
    """
    priority = 64
    consequence = [RemoveMatch, RenameMatch('episode_count'), RenameMatch('season_count')]

    properties = {'episode_count': [None], 'season_count': [None]}

    def when(self, matches, context):
        """
        :return: tuple ``(counts to remove, counts to rename episode_count, counts to rename season_count)``,
                 in the same order as the consequences
        """
        orphans, episode_counts, season_counts = [], [], []
        buckets = {'episode': episode_counts, 'season': season_counts}

        for count in matches.named('count'):
            owner = matches.previous(count, lambda match: match.name in ['episode', 'season'], 0)
            if not owner:
                orphans.append(count)
                continue
            bucket = buckets.get(owner.name)
            if bucket is not None:
                bucket.append(count)

        return orphans, episode_counts, season_counts
class EpisodeNumberSeparatorRange(Rule):
    """
    Drop ``episodeSeparator`` matches and, for dash separators, create the episode matches lying
    between both surrounding episode numbers.
    """
    priority = 128
    consequence = [RemoveMatch, AppendMatch]

    def when(self, matches, context):
        """
        :return: tuple ``(separator matches to remove, episode matches to append)``
        """
        def is_episode(match):
            return match.name == 'episode'

        separators = []
        generated = []
        for separator in matches.named('episodeSeparator'):
            lower = matches.previous(separator, is_episode, 0)
            upper = matches.next(separator, is_episode, 0)

            if lower and upper and separator.value == '-':
                # Both bounds already exist as matches: only the numbers strictly in between are created.
                for number in range(lower.value + 1, upper.value):
                    filler = copy.copy(separator)
                    filler.private = False
                    filler.name = 'episode'
                    filler.value = number
                    generated.append(filler)
            separators.append(separator)
        return separators, generated
class RemoveWeakDuplicate(Rule):
    """
    Remove weak-duplicate tagged matches when the same pattern matched several times under the same
    name inside a path part, for example The 100.109
    """
    priority = 64
    consequence = RemoveMatch

    def when(self, matches, context):
        """
        :return: list of duplicated weak matches to remove
        """
        duplicates = []
        for filepart in matches.markers.named('path'):
            weak_matches = matches.range(filepart.start, filepart.end,
                                         predicate=lambda match: 'weak-duplicate' in match.tags)
            seen = {}  # match name -> patterns already met in this path part
            # Iterating in reverse keeps the last match returned for a pattern and removes the others.
            for candidate in reversed(weak_matches):
                known = seen.setdefault(candidate.name, [])
                if candidate.pattern in known:
                    duplicates.append(candidate)
                else:
                    known.append(candidate.pattern)
        return duplicates
class VersionValidator(Rule):
    """
    Keep a version only when an episode match is found before it or when its initiator is surrounded
    by separators; other versions are removed.
    """
    priority = 64
    dependency = [RemoveWeakIfMovie, RemoveWeakIfSxxExx]
    consequence = RemoveMatch

    def when(self, matches, context):
        """
        :return: list of version matches to remove
        """
        def is_invalid(version):
            if matches.previous(version, lambda match: match.name == 'episode', 0):
                return False
            return not seps_surround(version.initiator)

        return [version for version in matches.named('version') if is_invalid(version)]
class FilmTitleRule(Rule):
    """
    Rule to find out film_title: the first hole between the start of the path part and the film match.
    """
    consequence = AppendMatch

    properties = {'film_title': [None]}

    def when(self, matches, context):
        """
        :param matches: matches collection being processed
        :param context: parsing context (not read here)
        :return: the hole renamed to ``film_title``, or None when there is no public film match, no
                 enclosing path marker, or no hole with a value
        """
        film_match = matches.named('film', lambda match: not match.private, index=0)
        if not film_match:
            return None

        filepath = matches.markers.at_match(film_match, lambda marker: marker.name == 'path', 0)
        if not filepath:
            # Without an enclosing path marker there is no start bound for the search.
            return None

        hole = matches.holes(filepath.start, film_match.start + 1, formatter=cleanup, index=0)
        if hole and hole.value:
            hole.name = 'film_title'
            return hole
        return None
def format_():
    """
    Builder for rebulk object.
    :return: Created Rebulk object
    :rtype: Rebulk
    """
    builder = Rebulk().regex_defaults(flags=re.IGNORECASE, abbreviations=[dash])
    builder.defaults(name="format")

    # (format value, regex patterns), registered in this exact order.
    format_patterns = (
        ("VHS", ("VHS", "VHS-?Rip")),
        ("Cam", ("CAM", "CAM-?Rip", "HD-?CAM")),
        ("Telesync", ("TELESYNC", "TS", "HD-?TS")),
        ("Workprint", ("WORKPRINT", "WP")),
        ("Telecine", ("TELECINE", "TC")),
        ("PPV", ("PPV", "PPV-?Rip")),  # Pay Per View
        # TV is too common to allow matching
        ("TV", ("SD-?TV", "SD-?TV-?Rip", "Rip-?SD-?TV", "TV-?Rip", "Rip-?TV")),
        ("DVB", ("DVB-?Rip", "DVB", "PD-?TV")),
        # "DVD-?R(?:$|^E)" => DVD-Real ...
        ("DVD", ("DVD", "DVD-?Rip", "VIDEO-?TS", "DVD-?R(?:$|(?!E))", "DVD-?9", "DVD-?5")),
        ("HDTV", ("HD-?TV", "TV-?RIP-?HD", "HD-?TV-?RIP", "HD-?RIP")),
        ("VOD", ("VOD", "VOD-?Rip")),
        ("WEBRip", ("WEB-?Rip",)),
        ("WEB-DL", ("WEB-?DL", "WEB-?HD", "WEB")),
        ("HD-DVD", ("HD-?DVD-?Rip", "HD-?DVD")),
        ("BluRay", ("Blu-?ray(?:-?Rip)?", "B[DR]", "B[DR]-?Rip", "BD[59]", "BD25", "BD50")),
    )
    for format_value, patterns in format_patterns:
        builder.regex(*patterns, value=format_value)

    builder.rules(ValidateFormat)

    return builder
def language():
    """
    Builder for rebulk object.
    :return: Created Rebulk object
    :rtype: Rebulk
    """
    builder = Rebulk()

    # Prefix and suffix words are registered the same way; only the words and the private name differ.
    affixes = ((subtitle_prefixes, "subtitle_language.prefix"),
               (subtitle_suffixes, "subtitle_language.suffix"))
    for words, affix_name in affixes:
        builder.string(*words, name=affix_name, ignore_case=True, private=True, validator=seps_surround)

    builder.functional(find_languages, properties={'language': [None]})
    builder.rules(SubtitlePrefixLanguageRule, SubtitleSuffixLanguageRule, SubtitleExtensionRule)

    return builder
def find_languages(string, context=None):
    """Find languages in the string

    Each word of the string is lowercased and stripped of the known subtitle prefixes/suffixes and
    language prefixes, then resolved with ``babelfish.Language.fromguessit``. Words found in the common
    words set are skipped.

    :param string: input string to search
    :param context: optional dict of options; only ``allowed_languages`` is read
    :return: list of tuple (start, end, {'name': 'language' or 'subtitle_language', 'value': Language})
    """
    # The signature allows calling without context; treat it as "no option set".
    if context is None:
        context = {}
    allowed_languages = context.get('allowed_languages')
    common_words = COMMON_WORDS_STRICT if allowed_languages else COMMON_WORDS

    matches = []
    for word_match in iter_words(string):
        word = word_match.group()
        start, end = word_match.span()

        lang_word = word.lower()
        key = 'language'
        for prefix in subtitle_prefixes:
            if lang_word.startswith(prefix):
                lang_word = lang_word[len(prefix):]
                key = 'subtitle_language'
        for suffix in subtitle_suffixes:
            if lang_word.endswith(suffix):
                # Drop the suffix from the end, whatever the length of the part in front of it.
                lang_word = lang_word[:-len(suffix)]
                key = 'subtitle_language'
        for prefix in lang_prefixes:
            if lang_word.startswith(prefix):
                lang_word = lang_word[len(prefix):]
        if lang_word not in common_words and word.lower() not in common_words:
            try:
                lang = babelfish.Language.fromguessit(lang_word)
                match = (start, end, {'name': key, 'value': lang})
                if allowed_languages:
                    if lang.name.lower() in allowed_languages \
                            or lang.alpha2.lower() in allowed_languages \
                            or lang.alpha3.lower() in allowed_languages:
                        matches.append(match)
                # Keep language with alpha2 equivalent. Others are probably
                # uncommon languages.
                elif lang == 'mul' or hasattr(lang, 'alpha2'):
                    matches.append(match)
            except babelfish.Error:
                # Not a known language: this word simply produces no match.
                pass
    return matches
class SubtitleSuffixLanguageRule(Rule):
    """
    Turn a language match into subtitle_language when a subtitle language suffix is found after it.
    """
    dependency = SubtitlePrefixLanguageRule
    consequence = RemoveMatch

    properties = {'subtitle_language': [None]}

    def when(self, matches, context):
        """
        :return: tuple ``(language matches to rename, suffix matches to remove)``; a suffix found after
                 a language is taken out of the removal list
        """
        def is_suffix(match):
            return match.name == 'subtitle_language.suffix'

        unused_suffixes = matches.named('subtitle_language.suffix')
        subtitle_langs = []
        for lang in matches.named('language'):
            suffix = matches.next(lang, is_suffix, 0)
            if not suffix:
                continue
            subtitle_langs.append(lang)
            if suffix in unused_suffixes:
                unused_suffixes.remove(suffix)
        return subtitle_langs, unused_suffixes

    def then(self, matches, when_response, context):
        """
        Apply the RemoveMatch consequence to the suffixes, then rename the selected languages.
        """
        subtitle_langs, unused_suffixes = when_response
        super(SubtitleSuffixLanguageRule, self).then(matches, unused_suffixes, context)
        for lang in subtitle_langs:
            matches.remove(lang)
            lang.name = 'subtitle_language'
            matches.append(lang)
class Mimetype(CustomRule):
    """
    Mimetype post processor: guess the mimetype from the input string and append it as a
    ``mimetype`` match.
    """
    priority = POST_PROCESS

    dependency = Processors

    def when(self, matches, context):
        """
        :return: guessed mimetype, or None when ``mimetypes.guess_type`` finds none
        """
        # guess_type returns (type, encoding); only the type is used.
        return mimetypes.guess_type(matches.input_string, strict=False)[0]

    def then(self, matches, when_response, context):
        """
        Append a zero-length ``mimetype`` match positioned at the end of the input string.
        """
        position = len(matches.input_string)
        matches.append(Match(position, position, name='mimetype', value=when_response))
class ProperCountRule(Rule):
    """
    Add proper_count property: the number of distinct raw values among ``other`` matches valued 'Proper'.
    """
    priority = POST_PROCESS

    consequence = AppendMatch

    properties = {'proper_count': [None]}

    def when(self, matches, context):
        """
        :return: a copy of the last Proper match, renamed ``proper_count`` and valued with the count,
                 or None when there is no Proper match
        """
        propers = matches.named('other', lambda match: match.value == 'Proper')
        if not propers:
            return None

        # The same raw text (after cleanup) found several times counts only once.
        distinct_raws = {raw_cleanup(proper.raw) for proper in propers}

        count_match = copy.copy(propers[-1])
        count_match.name = 'proper_count'
        count_match.value = len(distinct_raws)
        return count_match
class ValidateScreenerRule(Rule):
    """
    Validate matches tagged ``other.validate.screener``.

    Such a match is removed unless a ``format`` match precedes it with nothing but
    separator characters in between.
    """
    consequence = RemoveMatch

    def when(self, matches, context):
        """
        Collect the screener matches to remove.

        :param matches: current matches
        :param context: guessit options (unused here)
        :return: list of matches to remove
        """
        def lacks_adjacent_format(candidate):
            """Tell whether no ``format`` match sits right before ``candidate``."""
            preceding = matches.previous(candidate, lambda other: other.name == 'format', 0)
            if not preceding:
                return True
            between = matches.input_string[preceding.end:candidate.start]
            # Anything left once separators are stripped means the two are not adjacent.
            return bool(between.strip(seps))

        tagged = matches.named('other', lambda other: 'other.validate.screener' in other.tags)
        return [candidate for candidate in tagged if lacks_adjacent_format(candidate)]
def part():
    """
    Build the Rebulk object detecting the ``part`` property.

    A single regex pattern is registered: one of the prefixes ``pt`` / ``part``, an
    optional dash, then a captured numeral whose text is converted by ``parse_numeral``.

    :return: Created Rebulk object
    :rtype: Rebulk
    """
    regex_options = {
        'flags': re.IGNORECASE,
        'abbreviations': [dash],
        'validator': {'__parent__': seps_surround},
    }
    rebulk = Rebulk().regex_defaults(**regex_options)

    # \L<prefixes> is a named list of the `regex` module, filled by the `prefixes` keyword.
    part_pattern = r'\L<prefixes>-?(' + numeral + r')'
    rebulk.regex(part_pattern, prefixes=['pt', 'part'], name='part', validate_all=True,
                 private_parent=True, children=True, formatter=parse_numeral)

    return rebulk
def clean_groupname(string):
    """
    Strip separators and forbidden leading/trailing words from a release group name.

    The value is first stripped of ``groupname_seps``. Then, for each entry of
    ``forbidden_groupnames`` in turn, a case-insensitive leading occurrence and a
    case-insensitive trailing occurrence are cut off, stripping separators again after
    each cut.

    :param string: raw release group candidate
    :type string: str
    :return: cleaned group name (may be empty)
    :rtype: str
    """
    string = string.strip(groupname_seps)
    for forbidden in forbidden_groupnames:
        if string.lower().startswith(forbidden):
            string = string[len(forbidden):]
            string = string.strip(groupname_seps)
        if string.lower().endswith(forbidden):
            # Drop the suffix itself. Slicing with the positive length kept only the first
            # len(forbidden) characters, which destroyed the group name.
            string = string[:-len(forbidden)]
            string = string.strip(groupname_seps)
    return string
class AnimeReleaseGroup(Rule):
    """
    Add release_group match in existing matches (anime format)
    ...[ReleaseGroup] Something.mkv

    For each path part, the first ``group`` marker that contains no match and whose
    content is not an integer is turned into a ``release_group`` match tagged 'anime'.
    """
    dependency = [SceneReleaseGroup, TitleFromPosition]
    consequence = AppendMatch

    properties = {'release_group': [None]}

    def when(self, matches, context):
        """
        Collect the release_group matches to append.

        :param matches: current matches
        :param context: guessit options (unused here)
        :return: list of new release_group matches (empty when one already exists)
        """
        # A release_group found by an earlier rule takes precedence over this one.
        if matches.named('release_group'):
            return []

        def is_unmatched_group(marker):
            """Select group markers holding no match and no plain integer."""
            if marker.name != 'group':
                return False
            if matches.range(marker.start, marker.end):
                return False
            return not int_coercable(marker.value.strip(seps))

        found = []
        for filepart in marker_sorted(matches.markers.named('path'), matches):
            marker = matches.markers.range(filepart.start, filepart.end, is_unmatched_group, 0)
            if not marker:
                continue

            release_group_match = copy.copy(marker)
            release_group_match.marker = False
            # Shrink by one character on each side (presumably the enclosing brackets -- TODO confirm).
            release_group_match.raw_start += 1
            release_group_match.raw_end -= 1
            release_group_match.tags = ['anime']
            release_group_match.name = 'release_group'
            found.append(release_group_match)
        return found
class ScreenSizeOnlyOne(Rule):
    """
    Keep a single screen_size per filepath part.

    Within each path part the last screen_size match is kept and all earlier ones
    are removed.
    """
    consequence = RemoveMatch

    def when(self, matches, context):
        """
        Collect the redundant screen_size matches.

        :param matches: current matches
        :param context: guessit options (unused here)
        :return: list of matches to remove
        """
        discarded = []
        for path_marker in matches.markers.named('path'):
            found = matches.range(path_marker.start, path_marker.end,
                                  lambda candidate: candidate.name == 'screen_size')
            # Last match first: everything after index 0 is an earlier duplicate.
            newest_first = list(reversed(found))
            discarded.extend(newest_first[1:])

        return discarded
class TitleBaseRule(Rule):
    """
    Base rule adding title matches built from the holes left between existing matches.

    The rule both appends the title matches and removes the ignored matches
    (language, country, episode_details) that ended up inside a title.
    """
    # pylint:disable=no-self-use,unused-argument
    consequence = [AppendMatch, RemoveMatch]

    def __init__(self, match_name, match_tags=None, alternative_match_name=None):
        """
        :param match_name: name given to the title match
        :param match_tags: tags given to the title match
        :param alternative_match_name: name given to the extra parts when the title is split
            on title separators; when falsy, the title is not split
        """
        super(TitleBaseRule, self).__init__()
        self.match_name = match_name
        self.match_tags = match_tags
        self.alternative_match_name = alternative_match_name

    def hole_filter(self, hole, matches):
        """
        Filter holes for titles. This base implementation accepts every hole.

        :param hole: candidate hole
        :type hole: Match
        :param matches: current matches
        :type matches: Matches
        :return: True when the hole may become a title
        :rtype: bool
        """
        return True

    def filepart_filter(self, filepart, matches):
        """
        Filter filepart for titles. This base implementation accepts every filepart.

        :param filepart: candidate path marker
        :type filepart: Match
        :param matches: current matches
        :type matches: Matches
        :return: True when the filepart may contain a title
        :rtype: bool
        """
        return True

    def holes_process(self, holes, matches):
        """
        Crop every hole against the 'group' markers and return all resulting pieces.

        :param holes: holes found in a filepart
        :type holes: list[Match]
        :param matches: current matches
        :type matches: Matches
        :return: flat list of cropped holes
        :rtype: list[Match]
        """
        cropped_holes = []
        for hole in holes:
            group_markers = matches.markers.named('group')
            cropped_holes.extend(hole.crop(group_markers))
        return cropped_holes

    def is_ignored(self, match):
        """
        Ignore matches when scanning for title (hole)

        :return: True for language, country and episode_details matches
        """
        return match.name in ['language', 'country', 'episode_details']

    def should_keep(self, match, to_keep, matches, filepart, hole, starting):
        """
        Check if this match should be accepted when ending or starting a hole.

        :param match: ignored match located at one end of the hole
        :type match: Match
        :param to_keep: ignored matches already accepted for this hole
        :type to_keep: list[Match]
        :param matches: current matches
        :type matches: Matches
        :param filepart: the filepart match
        :type filepart: Match
        :param hole: the hole match
        :type hole: Match
        :param starting: true if match is starting the hole
        :type starting: bool
        :return: True for a language/country match when the filepart holds no other match
            of the same name outside the hole (those already in ``to_keep`` excepted),
            False otherwise. The caller also accepts an ``(append, crop)`` pair.
        :rtype: bool
        """
        # Keep language if other languages exists in the filepart.
        if match.name in ['language', 'country']:
            outside_matches = filepart.crop(hole)
            other_languages = []
            for outside in outside_matches:
                other_languages.extend(matches.range(outside.start, outside.end,
                                                     lambda c_match: c_match.name == match.name and
                                                     c_match not in to_keep))

            if not other_languages:
                return True

        return False

    def should_remove(self, match, matches, filepart, hole, context):
        """
        Check if this match should be removed after being ignored.

        :param match: ignored match found inside the hole
        :param matches: current matches
        :param filepart: the filepart match
        :param hole: the hole match
        :param context: guessit options
        :return: False for an episode_details match when the 'type' option is 'episode',
            True otherwise
        """
        if context.get('type') == 'episode' and match.name == 'episode_details':
            return False
        return True

    def check_titles_in_filepart(self, filepart, matches, context):
        """
        Find title in filepart (ignoring language)

        :param filepart: path marker to scan
        :param matches: current matches
        :param context: guessit options
        :return: ``(titles, to_remove)`` for the first usable hole, or None (implicitly)
            when no hole yields a title
        """
        # pylint:disable=too-many-locals,too-many-branches,too-many-statements
        start, end = filepart.span

        holes = matches.holes(start, end + 1, formatter=formatters(cleanup, reorder_title),
                              ignore=self.is_ignored,
                              predicate=lambda hole: hole.value)

        holes = self.holes_process(holes, matches)

        for hole in holes:
            # pylint:disable=cell-var-from-loop
            # NOTE(review): the truthiness test on self.hole_filter presumably lets a subclass
            # disable filtering by setting the attribute to a falsy value -- confirm.
            if not hole or (self.hole_filter and not self.hole_filter(hole, matches)):
                continue

            to_remove = []
            to_keep = []

            ignored_matches = matches.range(hole.start, hole.end, self.is_ignored)

            if ignored_matches:
                # First pass, from the end of the hole: shrink the hole end over trailing ignored matches.
                for ignored_match in reversed(ignored_matches):
                    # pylint:disable=undefined-loop-variable
                    trailing = matches.chain_before(hole.end, seps, predicate=lambda match: match == ignored_match)
                    if trailing:
                        should_keep = self.should_keep(ignored_match, to_keep, matches, filepart, hole, False)
                        if should_keep:
                            # pylint:disable=unpacking-non-sequence
                            # should_keep may be an (append, crop) pair or a single flag used for both.
                            try:
                                append, crop = should_keep
                            except TypeError:
                                append, crop = should_keep, should_keep
                            if append:
                                to_keep.append(ignored_match)
                            if crop:
                                hole.end = ignored_match.start

                # Second pass, from the start of the hole: shrink the hole start over leading ignored matches.
                for ignored_match in ignored_matches:
                    if ignored_match not in to_keep:
                        starting = matches.chain_after(hole.start, seps,
                                                       predicate=lambda match: match == ignored_match)
                        if starting:
                            should_keep = self.should_keep(ignored_match, to_keep, matches, filepart, hole, True)
                            if should_keep:
                                # pylint:disable=unpacking-non-sequence
                                try:
                                    append, crop = should_keep
                                except TypeError:
                                    append, crop = should_keep, should_keep
                                if append:
                                    to_keep.append(ignored_match)
                                if crop:
                                    hole.start = ignored_match.end

            for match in ignored_matches:
                if self.should_remove(match, matches, filepart, hole, context):
                    to_remove.append(match)
            # list.remove raises ValueError if a kept match was not scheduled for removal above.
            for keep_match in to_keep:
                to_remove.remove(keep_match)

            if hole and hole.value:
                hole.name = self.match_name
                hole.tags = self.match_tags
                if self.alternative_match_name:
                    # Split and keep values that can be a title
                    titles = hole.split(title_seps, lambda match: match.value)
                    for title_match in list(titles[1:]):
                        previous_title = titles[titles.index(title_match) - 1]
                        separator = matches.input_string[previous_title.end:title_match.start]
                        # A lone '-' glued to both neighbours is part of the title: merge the two pieces.
                        if len(separator) == 1 and separator == '-' \
                                and previous_title.raw[-1] not in seps \
                                and title_match.raw[0] not in seps:
                            titles[titles.index(title_match) - 1].end = title_match.end
                            titles.remove(title_match)
                        else:
                            title_match.name = self.alternative_match_name

                else:
                    titles = [hole]
                return titles, to_remove

    def when(self, matches, context):
        """
        Find the title matches to append and the ignored matches to remove.

        :param matches: current matches
        :param context: guessit options
        :return: ``(titles, to_remove)``, or None when a match named ``self.match_name``
            and tagged 'expected' already exists
        """
        if matches.named(self.match_name, lambda match: 'expected' in match.tags):
            return

        fileparts = [filepart for filepart in list(marker_sorted(matches.markers.named('path'), matches))
                     if not self.filepart_filter or self.filepart_filter(filepart, matches)]

        to_remove = []

        # Prioritize fileparts containing the year
        years_fileparts = []
        for filepart in fileparts:
            year_match = matches.range(filepart.start, filepart.end, lambda match: match.name == 'year', 0)
            if year_match:
                years_fileparts.append(filepart)

        ret = []
        for filepart in fileparts:
            # A filepart handled here must not be processed again by the year loop below.
            try:
                years_fileparts.remove(filepart)
            except ValueError:
                pass
            titles = self.check_titles_in_filepart(filepart, matches, context)
            if titles:
                titles, to_remove_c = titles
                ret.extend(titles)
                to_remove.extend(to_remove_c)
                break

        # Add title match in all fileparts containing the year.
        for filepart in years_fileparts:
            titles = self.check_titles_in_filepart(filepart, matches, context)
            if titles:
                # pylint:disable=unbalanced-tuple-unpacking
                titles, to_remove_c = titles
                ret.extend(titles)
                to_remove.extend(to_remove_c)

        return ret, to_remove
class TypeProcessor(CustomRule):
    """
    Post processor deciding the ``type`` property ('episode' or 'movie') from the
    other matches, unless the ``type`` option already provides a value.
    """
    priority = POST_PROCESS

    dependency = Processors

    properties = {'type': ['episode', 'movie']}

    def when(self, matches, context):  # pylint:disable=too-many-return-statements
        """
        Decide the type value.

        :param matches: current matches
        :param context: guessit options; a truthy 'type' entry is returned unchanged
        :return: the type value
        """
        forced_type = context.get('type', None)
        if forced_type:
            return forced_type

        # Any episode, season or episode_details match marks an episode.
        episode_hints = [matches.named(name) for name in ('episode', 'season', 'episode_details')]
        if any(episode_hints):
            return 'episode'

        if matches.named('film'):
            return 'movie'

        years = matches.named('year')
        dates = matches.named('date')
        if dates and not years:
            return 'episode'

        bonuses = matches.named('bonus')
        if bonuses and not years:
            return 'episode'

        # A crc32 together with an anime-tagged release group marks an episode.
        checksums = matches.named('crc32')
        anime_groups = matches.named('release_group', lambda candidate: 'anime' in candidate.tags)
        return 'episode' if checksums and anime_groups else 'movie'

    def then(self, matches, when_response, context):
        """
        Append the ``type`` match carrying the value returned by :meth:`when`.
        """
        _type(matches, when_response)
def video_codec():
    """
    Build the Rebulk object detecting ``video_codec``, ``video_profile`` and ``video_api``.

    :return: Created Rebulk object
    :rtype: Rebulk
    """
    rebulk = Rebulk().regex_defaults(flags=re.IGNORECASE, abbreviations=[dash])
    rebulk = rebulk.string_defaults(ignore_case=True)

    # --- video_codec: (regex alternatives, resulting value), registered in this order
    rebulk.defaults(name="video_codec", validator=seps_surround)
    codec_patterns = (
        ((r"Rv\d{2}",), "Real"),
        (("Mpeg2",), "Mpeg2"),
        (("DVDivX", "DivX"), "DivX"),
        (("XviD",), "XviD"),
        (("[hx]-?264(?:-?AVC)?", "MPEG-?4(?:-?AVC)"), "h264"),
        (("[hx]-?265(?:-?HEVC)?", "HEVC"), "h265"),
    )
    for patterns, codec in codec_patterns:
        rebulk.regex(*patterns, value=codec)

    # --- video_profile
    # http://blog.mediacoderhq.com/h264-profiles-and-levels/
    # http://fr.wikipedia.org/wiki/H.264
    rebulk.defaults(name="video_profile", validator=seps_surround)

    rebulk.regex('10.?bit', 'Hi10P', value='10bit')
    rebulk.regex('8.?bit', value='8bit')

    # Profiles tagged 'video_profile.rule' are the ones VideoProfileRule looks at.
    tagged_profiles = (
        (rebulk.string, ('BP',), 'BP'),
        (rebulk.string, ('XP', 'EP'), 'XP'),
        (rebulk.string, ('MP',), 'MP'),
        (rebulk.string, ('HP', 'HiP'), 'HP'),
        (rebulk.regex, ('Hi422P',), 'Hi422P'),
        (rebulk.regex, ('Hi444PP',), 'Hi444PP'),
    )
    for register, patterns, profile in tagged_profiles:
        register(*patterns, value=profile, tags='video_profile.rule')

    # --- video_api
    rebulk.string('DXVA', value='DXVA', name='video_api')

    rebulk.rules(VideoProfileRule)

    return rebulk
def website():
    """
    Build the Rebulk object detecting the ``website`` property.

    Three patterns are registered: a safe subdomain followed by any registered TLD,
    a name followed by a safe TLD, and a name followed by a safe prefix and any
    registered TLD. The TLD list is read from the packaged
    ``tlds-alpha-by-domain.txt`` resource.

    :return: Created Rebulk object
    :rtype: Rebulk
    """
    from contextlib import closing

    rebulk = Rebulk().regex_defaults(flags=re.IGNORECASE)
    rebulk.defaults(name="website")

    # Close the resource stream once read; it used to be left open.
    # NOTE(review): the resource is looked up in package 'guessit' although this module
    # lives under guessit2 -- confirm the package name is the intended one.
    with closing(resource_stream('guessit', 'tlds-alpha-by-domain.txt')) as tld_stream:
        # Lines containing '--' are dropped, then the first remaining line is skipped.
        tlds = [line.strip().decode('utf-8')
                for line in tld_stream.readlines()
                if b'--' not in line][1:]  # All registered domain extension

    safe_tlds = ['com', 'org', 'net']  # For sure a website extension
    safe_subdomains = ['www']  # For sure a website subdomain
    safe_prefix = ['co', 'com', 'org', 'net']  # Those words before a tlds are sure

    rebulk.regex(r'(?:[^a-z0-9]|^)((?:\L<safe_subdomains>\.)+(?:[a-z-]+\.)+(?:\L<tlds>))(?:[^a-z0-9]|$)',
                 safe_subdomains=safe_subdomains, tlds=tlds, children=True)
    rebulk.regex(r'(?:[^a-z0-9]|^)((?:\L<safe_subdomains>\.)*[a-z-]+\.(?:\L<safe_tlds>))(?:[^a-z0-9]|$)',
                 safe_subdomains=safe_subdomains, safe_tlds=safe_tlds, children=True)
    rebulk.regex(
        r'(?:[^a-z0-9]|^)((?:\L<safe_subdomains>\.)*[a-z-]+\.(?:\L<safe_prefix>\.)+(?:\L<tlds>))(?:[^a-z0-9]|$)',
        safe_subdomains=safe_subdomains, safe_prefix=safe_prefix, tlds=tlds, children=True)

    return rebulk
Series/Duckman/Duckman - 101 (01) - 20021107 - I, Duckman.avi +: title: Duckman + season: 1 + episode: 1 + episode_title: I, Duckman + date: 2002-11-07 + +? Series/Simpsons/Saison 12 Français/Simpsons,.The.12x08.A.Bas.Le.Sergent.Skinner.FR.avi +: title: The Simpsons + season: 12 + episode: 8 + episode_title: A Bas Le Sergent Skinner + language: French + +? Series/Futurama/Season 3 (mkv)/[™] Futurama - S03E22 - Le chef de fer à 30% ( 30 Percent Iron Chef ).mkv +: title: Futurama + season: 3 + episode: 22 + episode_title: Le chef de fer à 30% + +? Series/The Office/Season 6/The Office - S06xE01.avi +: title: The Office + season: 6 + episode: 1 + +? series/The Office/Season 4/The Office [401] Fun Run.avi +: title: The Office + season: 4 + episode: 1 + episode_title: Fun Run + +? Series/Mad Men Season 1 Complete/Mad.Men.S01E01.avi +: title: Mad Men + season: 1 + episode: 1 + other: Complete + +? series/Psych/Psych S02 Season 2 Complete English DVD/Psych.S02E02.65.Million.Years.Off.avi +: title: Psych + season: 2 + episode: 2 + episode_title: 65 Million Years Off + language: english + format: DVD + other: Complete + +? series/Psych/Psych S02 Season 2 Complete English DVD/Psych.S02E03.Psy.Vs.Psy.Français.srt +: title: Psych + season: 2 + episode: 3 + episode_title: Psy Vs Psy + format: DVD + language: English + subtitle_language: French + other: Complete + +? Series/Pure Laine/Pure.Laine.1x01.Toutes.Couleurs.Unies.FR.(Québec).DVB-Kceb.[tvu.org.ru].avi +: title: Pure Laine + season: 1 + episode: 1 + episode_title: Toutes Couleurs Unies + format: DVB + release_group: Kceb + language: french + website: tvu.org.ru + +? Series/Pure Laine/2x05 - Pure Laine - Je Me Souviens.avi +: title: Pure Laine + season: 2 + episode: 5 + episode_title: Je Me Souviens + +? Series/Tout sur moi/Tout sur moi - S02E02 - Ménage à trois (14-01-2008) [Rip by Ampli].avi +: title: Tout sur moi + season: 2 + episode: 2 + episode_title: Ménage à trois + date: 2008-01-14 + +? 
The.Mentalist.2x21.18-5-4.ENG.-.sub.FR.HDTV.XviD-AlFleNi-TeaM.[tvu.org.ru].avi +: title: The Mentalist + season: 2 + episode: 21 + episode_title: 18-5-4 + language: english + subtitle_language: french + format: HDTV + video_codec: XviD + release_group: AlFleNi-TeaM + website: tvu.org.ru + +? series/__ Incomplete __/Dr Slump (Catalan)/Dr._Slump_-_003_DVB-Rip_Catalan_by_kelf.avi +: title: Dr Slump + episode: 3 + format: DVB + language: catalan + +# Disabling this test because it just doesn't looks like a serie ... +#? series/Ren and Stimpy - Black_hole_[DivX].avi +#: title: Ren and Stimpy +# episode_title: Black hole +# video_codec: DivX + +# Disabling this test because it just doesn't looks like a serie ... +# ? Series/Walt Disney/Donald.Duck.-.Good.Scouts.[www.bigernie.jump.to].avi +#: title: Donald Duck +# episode_title: Good Scouts +# website: www.bigernie.jump.to + +? Series/Neverwhere/Neverwhere.05.Down.Street.[tvu.org.ru].avi +: title: Neverwhere + episode: 5 + episode_title: Down Street + website: tvu.org.ru + +? Series/South Park/Season 4/South.Park.4x07.Cherokee.Hair.Tampons.DVDRip.[tvu.org.ru].avi +: title: South Park + season: 4 + episode: 7 + episode_title: Cherokee Hair Tampons + format: DVD + website: tvu.org.ru + +? Series/Kaamelott/Kaamelott - Livre V - Ep 23 - Le Forfait.avi +: title: Kaamelott + alternativeTitle: Livre V + episode: 23 + episode_title: Le Forfait + +? Series/Duckman/Duckman - 110 (10) - 20021218 - Cellar Beware.avi +: title: Duckman + season: 1 + episode: 10 + date: 2002-12-18 + episode_title: Cellar Beware + +# Removing this test because it doesn't look like a series +# ? Series/Ren & Stimpy/Ren And Stimpy - Onward & Upward-Adult Party Cartoon.avi +# : title: Ren And Stimpy +# episode_title: Onward & Upward-Adult Party Cartoon + +? 
Series/Breaking Bad/Minisodes/Breaking.Bad.(Minisodes).01.Good.Cop.Bad.Cop.WEBRip.XviD.avi +: title: Breaking Bad + episode_format: Minisode + episode: 1 + episode_title: Good Cop Bad Cop + format: WEBRip + video_codec: XviD + +? Series/My Name Is Earl/My.Name.Is.Earl.S01Extras.-.Bad.Karma.DVDRip.XviD.avi +: title: My Name Is Earl + season: 1 + episode_title: Extras - Bad Karma + format: DVD + episode_details: Extras + video_codec: XviD + +? series/Freaks And Geeks/Season 1/Episode 4 - Kim Kelly Is My Friend-eng(1).srt +: title: Freaks And Geeks + season: 1 + episode: 4 + episode_title: Kim Kelly Is My Friend + subtitle_language: English # This is really a subtitle_language, despite guessit 1.x assert for language. + +? /mnt/series/The Big Bang Theory/S01/The.Big.Bang.Theory.S01E01.mkv +: title: The Big Bang Theory + season: 1 + episode: 1 + +? /media/Parks_and_Recreation-s03-e01.mkv +: title: Parks and Recreation + season: 3 + episode: 1 + +? /media/Parks_and_Recreation-s03-e02-Flu_Season.mkv +: title: Parks and Recreation + season: 3 + episode_title: Flu Season + episode: 2 + +? /media/Parks_and_Recreation-s03-x01.mkv +: title: Parks and Recreation + season: 3 + episode: 1 + +? /media/Parks_and_Recreation-s03-x02-Gag_Reel.mkv +: title: Parks and Recreation + season: 3 + episode: 2 + episode_title: Gag Reel + +? /media/Band_of_Brothers-e01-Currahee.mkv +: title: Band of Brothers + episode: 1 + episode_title: Currahee + +? /media/Band_of_Brothers-x02-We_Stand_Alone_Together.mkv +: title: Band of Brothers + bonus: 2 + bonus_title: We Stand Alone Together + +? /TV Shows/Mad.M-5x9.mkv +: title: Mad M + season: 5 + episode: 9 + +? /TV Shows/new.girl.117.hdtv-lol.mp4 +: title: new girl + season: 1 + episode: 17 + format: HDTV + release_group: lol + +? Kaamelott - 5x44x45x46x47x48x49x50.avi +: title: Kaamelott + season: 5 + episode: [44, 45, 46, 47, 48, 49, 50] + +? Example S01E01-02.avi +? Example S01E01E02.avi +: title: Example + season: 1 + episode: [1, 2] + +? 
Series/Baccano!/Baccano!_-_T1_-_Trailer_-_[Ayu](dae8173e).mkv +: title: Baccano! + other: Trailer + release_group: Ayu + episode_title: T1 + crc32: dae8173e + +? Series/Doctor Who (2005)/Season 06/Doctor Who (2005) - S06E01 - The Impossible Astronaut (1).avi +: title: Doctor Who + year: 2005 + season: 6 + episode: 1 + episode_title: The Impossible Astronaut + +? The Sopranos - [05x07] - In Camelot.mp4 +: title: The Sopranos + season: 5 + episode: 7 + episode_title: In Camelot + +? The.Office.(US).1x03.Health.Care.HDTV.XviD-LOL.avi +: title: The Office + country: US + season: 1 + episode: 3 + episode_title: Health Care + format: HDTV + video_codec: XviD + release_group: LOL + +? /Volumes/data-1/Series/Futurama/Season 3/Futurama_-_S03_DVD_Bonus_-_Deleted_Scenes_Part_3.ogm +: title: Futurama + season: 3 + part: 3 + other: Bonus + episode_title: Deleted Scenes + format: DVD + +? Ben.and.Kate.S01E02.720p.HDTV.X264-DIMENSION.mkv +: title: Ben and Kate + season: 1 + episode: 2 + screen_size: 720p + format: HDTV + video_codec: h264 + release_group: DIMENSION + +? /volume1/TV Series/Drawn Together/Season 1/Drawn Together 1x04 Requiem for a Reality Show.avi +: title: Drawn Together + season: 1 + episode: 4 + episode_title: Requiem for a Reality Show + +? Sons.of.Anarchy.S05E06.720p.WEB.DL.DD5.1.H.264-CtrlHD.mkv +: title: Sons of Anarchy + season: 5 + episode: 6 + screen_size: 720p + format: WEB-DL + audio_channels: "5.1" + audio_codec: DolbyDigital + video_codec: h264 + release_group: CtrlHD + +? /media/bdc64bfe-e36f-4af8-b550-e6fd2dfaa507/TV_Shows/Doctor Who (2005)/Saison 6/Doctor Who (2005) - S06E13 - The Wedding of River Song.mkv +: title: Doctor Who + season: 6 + episode: 13 + year: 2005 + episode_title: The Wedding of River Song + uuid: bdc64bfe-e36f-4af8-b550-e6fd2dfaa507 + +? /mnt/videos/tvshows/Doctor Who/Season 06/E13 - The Wedding of River Song.mkv +: title: Doctor Who + season: 6 + episode: 13 + episode_title: The Wedding of River Song + +? 
The.Simpsons.S24E03.Adventures.in.Baby-Getting.720p.WEB-DL.DD5.1.H.264-CtrlHD.mkv +: title: The Simpsons + season: 24 + episode: 3 + episode_title: Adventures in Baby-Getting + screen_size: 720p + format: WEB-DL + audio_channels: "5.1" + audio_codec: DolbyDigital + video_codec: h264 + release_group: CtrlHD + +? /home/disaster/Videos/TV/Merlin/merlin_2008.5x02.arthurs_bane_part_two.repack.720p_hdtv_x264-fov.mkv +: title: merlin + season: 5 + episode: 2 + part: 2 + episode_title: arthurs bane + screen_size: 720p + format: HDTV + video_codec: h264 + release_group: fov + year: 2008 + other: Proper + proper_count: 1 + +? "Da Vinci's Demons - 1x04 - The Magician.mkv" +: title: "Da Vinci's Demons" + season: 1 + episode: 4 + episode_title: The Magician + +? CSI.S013E18.Sheltered.720p.WEB-DL.DD5.1.H.264.mkv +: title: CSI + season: 13 + episode: 18 + episode_title: Sheltered + screen_size: 720p + format: WEB-DL + audio_channels: "5.1" + audio_codec: DolbyDigital + video_codec: h264 + +? Game of Thrones S03E06 1080i HDTV DD5.1 MPEG2-TrollHD.ts +: title: Game of Thrones + season: 3 + episode: 6 + screen_size: 1080i + format: HDTV + audio_channels: "5.1" + audio_codec: DolbyDigital + video_codec: Mpeg2 + release_group: TrollHD + +? gossip.girl.s01e18.hdtv.xvid-2hd.eng.srt +: title: gossip girl + season: 1 + episode: 18 + format: HDTV + video_codec: XviD + release_group: 2hd + subtitle_language: english + +? Wheels.S03E01E02.720p.HDTV.x264-IMMERSE.mkv +: title: Wheels + season: 3 + episode: [1, 2] + screen_size: 720p + format: HDTV + video_codec: h264 + release_group: IMMERSE + +? Wheels.S03E01-02.720p.HDTV.x264-IMMERSE.mkv +: title: Wheels + season: 3 + episode: [1, 2] + screen_size: 720p + format: HDTV + video_codec: h264 + release_group: IMMERSE + +? Wheels.S03E01-E02.720p.HDTV.x264-IMMERSE.mkv +: title: Wheels + season: 3 + episode: [1, 2] + screen_size: 720p + format: HDTV + video_codec: h264 + release_group: IMMERSE + +? 
Wheels.S03E01-04.720p.HDTV.x264-IMMERSE.mkv +: title: Wheels + season: 3 + episode: [1, 2, 3, 4] + screen_size: 720p + format: HDTV + video_codec: h264 + release_group: IMMERSE + +? Marvels.Agents.of.S.H.I.E.L.D.S01E06.720p.HDTV.X264-DIMENSION.mkv +: title: Marvels Agents of S H I E L D + season: 1 + episode: 6 + screen_size: 720p + format: HDTV + video_codec: h264 + release_group: DIMENSION + +? Marvels.Agents.of.S.H.I.E.L.D..S01E06.720p.HDTV.X264-DIMENSION.mkv +: title: Marvels Agents of S H I E L D + season: 1 + episode: 6 + screen_size: 720p + format: HDTV + video_codec: h264 + release_group: DIMENSION + +? Series/Friday Night Lights/Season 1/Friday Night Lights S01E19 - Ch-Ch-Ch-Ch-Changes.avi +: title: Friday Night Lights + season: 1 + episode: 19 + episode_title: Ch-Ch-Ch-Ch-Changes + +? Dexter Saison VII FRENCH.BDRip.XviD-MiND.nfo +: title: Dexter + season: 7 + video_codec: XviD + language: French + format: BluRay + release_group: MiND + +? Dexter Saison sept FRENCH.BDRip.XviD-MiND.nfo +: title: Dexter + season: 7 + video_codec: XviD + language: French + format: BluRay + release_group: MiND + +? "Pokémon S16 - E29 - 1280*720 HDTV VF.mkv" +: title: Pokémon + format: HDTV + language: French + season: 16 + episode: 29 + screen_size: 720p + +? One.Piece.E576.VOSTFR.720p.HDTV.x264-MARINE-FORD.mkv +: episode: 576 + video_codec: h264 + format: HDTV + title: One Piece + release_group: MARINE-FORD + subtitle_language: French + screen_size: 720p + +? Dexter.S08E12.FINAL.MULTi.1080p.BluRay.x264-MiND.mkv +: video_codec: h264 + episode: 12 + season: 8 + format: BluRay + title: Dexter + other: FINAL + language: Multiple languages + release_group: MiND + screen_size: 1080p + +? One Piece - E623 VOSTFR HD [www.manga-ddl-free.com].mkv +: website: www.manga-ddl-free.com + episode: 623 + subtitle_language: French + title: One Piece + other: HD + +? 
Falling Skies Saison 1.HDLight.720p.x264.VFF.mkv +: language: French + screen_size: 720p + season: 1 + title: Falling Skies + video_codec: h264 + other: HDLight + +? Sleepy.Hollow.S01E09.720p.WEB-DL.DD5.1.H.264-BP.mkv +: episode: 9 + video_codec: h264 + format: WEB-DL + title: Sleepy Hollow + audio_channels: "5.1" + screen_size: 720p + season: 1 + video_profile: BP + audio_codec: DolbyDigital + +? Sleepy.Hollow.S01E09.720p.WEB-DL.DD5.1.H.264-BS.mkv +: episode: 9 + video_codec: h264 + format: WEB-DL + title: Sleepy Hollow + audio_channels: "5.1" + screen_size: 720p + season: 1 + release_group: BS + audio_codec: DolbyDigital + +? Battlestar.Galactica.S00.Pilot.FRENCH.DVDRip.XviD-NOTAG.avi +: title: Battlestar Galactica + season: 0 + episode_details: Pilot + episode_title: Pilot + language: French + format: DVD + video_codec: XviD + release_group: NOTAG + +? The Big Bang Theory S00E00 Unaired Pilot VOSTFR TVRip XviD-VioCs +: title: The Big Bang Theory + season: 0 + episode: 0 + subtitle_language: French + format: TV + video_codec: XviD + release_group: VioCs + episode_details: [Unaired, Pilot] + +? The Big Bang Theory S01E00 PROPER Unaired Pilot TVRip XviD-GIGGITY +: title: The Big Bang Theory + season: 1 + episode: 0 + format: TV + video_codec: XviD + release_group: GIGGITY + other: Proper + proper_count: 1 + episode_details: [Unaired, Pilot] + +? Pawn.Stars.S2014E18.720p.HDTV.x264-KILLERS +: title: Pawn Stars + season: 2014 + year: 2014 + episode: 18 + screen_size: 720p + format: HDTV + video_codec: h264 + release_group: KILLERS + +? 2.Broke.Girls.S03E10.480p.HDTV.x264-mSD.mkv +: title: 2 Broke Girls + season: 3 + episode: 10 + screen_size: 480p + format: HDTV + video_codec: h264 + release_group: mSD + +? 
House.of.Cards.2013.S02E03.1080p.NF.WEBRip.DD5.1.x264-NTb.mkv +: title: House of Cards + year: 2013 + season: 2 + episode: 3 + screen_size: 1080p + other: Netflix + format: WEBRip + audio_channels: "5.1" + audio_codec: DolbyDigital + video_codec: h264 + release_group: NTb + +? the.100.109.hdtv-lol.mp4 +: title: the 100 + season: 1 + episode: 9 + format: HDTV + release_group: lol + +? Criminal.Minds.5x03.Reckoner.ENG.-.sub.FR.HDTV.XviD-STi.[tvu.org.ru].avi +: title: Criminal Minds + language: English + subtitle_language: French + season: 5 + episode: 3 + video_codec: XviD + format: HDTV + website: tvu.org.ru + release_group: STi + episode_title: Reckoner + +? 03-Criminal.Minds.avi +: title: Criminal Minds + episode: 3 + +? '[Evil-Saizen]_Laughing_Salesman_14_[DVD][1C98686A].mkv' +: crc32: 1C98686A + episode: 14 + format: DVD + release_group: Evil-Saizen + title: Laughing Salesman + +? '[Kaylith] Zankyou no Terror - 04 [480p][B4D4514E].mp4' +: crc32: B4D4514E + episode: 4 + release_group: Kaylith + screen_size: 480p + title: Zankyou no Terror + +? '[PuyaSubs!] Seirei Tsukai no Blade Dance - 05 [720p][32DD560E].mkv' +: crc32: 32DD560E + episode: 5 + release_group: PuyaSubs! + screen_size: 720p + title: Seirei Tsukai no Blade Dance + +? '[Doremi].Happiness.Charge.Precure.27.[1280x720].[DC91581A].mkv' +: crc32: DC91581A + episode: 27 + release_group: Doremi + screen_size: 720p + title: Happiness Charge Precure + +? "[Daisei] Free!:Iwatobi Swim Club - 01 ~ (BD 720p 10-bit AAC) [99E8E009].mkv" +: audio_codec: AAC + crc32: 99E8E009 + episode: 1 + format: BluRay + release_group: Daisei + screen_size: 720p + title: Free!:Iwatobi Swim Club + video_profile: 10bit + +? '[Tsundere] Boku wa Tomodachi ga Sukunai - 03 [BDRip h264 1920x1080 10bit FLAC][AF0C22CC].mkv' +: audio_codec: FLAC + crc32: AF0C22CC + episode: 3 + format: BluRay + release_group: Tsundere + screen_size: 1080p + title: Boku wa Tomodachi ga Sukunai + video_codec: h264 + video_profile: 10bit + +? 
'[t.3.3.d]_Mikakunin_de_Shinkoukei_-_12_[720p][5DDC1352].mkv' +: crc32: 5DDC1352 + episode: 12 + screen_size: 720p + title: Mikakunin de Shinkoukei + release_group: t.3.3.d + +? '[Anime-Koi] Sabagebu! - 06 [h264-720p][ABB3728A].mkv' +: crc32: ABB3728A + episode: 6 + release_group: Anime-Koi + screen_size: 720p + title: Sabagebu! + video_codec: h264 + +? '[aprm-Diogo4D] [BD][1080p] Nagi no Asukara 08 [4D102B7C].mkv' +: crc32: 4D102B7C + episode: 8 + format: BluRay + release_group: aprm-Diogo4D + screen_size: 1080p + title: Nagi no Asukara + +? '[Akindo-SSK] Zankyou no Terror - 05 [720P][Sub_ITA][F5CCE87C].mkv' +: crc32: F5CCE87C + episode: 5 + release_group: Akindo-SSK + screen_size: 720p + title: Zankyou no Terror + subtitle_language: it + +? Naruto Shippuden Episode 366 VOSTFR.avi +: episode: 366 + title: Naruto Shippuden + subtitle_language: fr + +? Naruto Shippuden Episode 366v2 VOSTFR.avi +: episode: 366 + version: 2 + title: Naruto Shippuden + subtitle_language: fr + +? '[HorribleSubs] Ao Haru Ride - 06 [480p].mkv' +: episode: 6 + release_group: HorribleSubs + screen_size: 480p + title: Ao Haru Ride + +? '[DeadFish] Tari Tari - 01 [BD][720p][AAC].mp4' +: audio_codec: AAC + episode: 1 + format: BluRay + release_group: DeadFish + screen_size: 720p + title: Tari Tari + +? '[NoobSubs] Sword Art Online II 06 (720p 8bit AAC).mp4' +: audio_codec: AAC + episode: 6 + release_group: NoobSubs + screen_size: 720p + title: Sword Art Online II + video_profile: 8bit + +? '[DeadFish] 01 - Tari Tari [BD][720p][AAC].mp4' +: audio_codec: AAC + episode: 1 + format: BluRay + release_group: DeadFish + screen_size: 720p + title: Tari Tari + +? '[NoobSubs] 06 Sword Art Online II (720p 8bit AAC).mp4' +: audio_codec: AAC + episode: 6 + release_group: NoobSubs + screen_size: 720p + title: Sword Art Online II + video_profile: 8bit + +? 
'[DeadFish] 12 - Tari Tari [BD][720p][AAC].mp4' +: audio_codec: AAC + episode: 12 + format: BluRay + release_group: DeadFish + screen_size: 720p + title: Tari Tari + +? Something.Season.2.1of4.Ep.Title.HDTV.torrent +: episode_count: 4 + episode: 1 + format: HDTV + season: 2 + title: Something + episode_title: Title + container: torrent + +? Something.Season.2of5.3of9.Ep.Title.HDTV.torrent +: episode_count: 9 + episode: 3 + format: HDTV + season: 2 + season_count: 5 + title: Something + episode_title: Title + container: torrent + +? Something.Other.Season.3of5.Complete.HDTV.torrent +: format: HDTV + other: Complete + season: 3 + season_count: 5 + title: Something Other + container: torrent + +? Something.Other.Season.1-3.avi +: season: [1, 2, 3] + title: Something Other + +? Something.Other.Season.1&3.avi +: season: [1, 3] + title: Something Other + +? Something.Other.Season.1&3-1to12ep.avi +: season: [1, 3] + title: Something Other + +? W2Test.123.HDTV.XViD-FlexGet +: episode: 23 + season: 1 + format: HDTV + release_group: FlexGet + title: W2Test + video_codec: XviD + +? W2Test.123.HDTV.XViD-FlexGet +: options: --episode-prefer-number + episode: 123 + format: HDTV + release_group: FlexGet + title: W2Test + video_codec: XviD + +? FooBar.0307.PDTV-FlexGet +: episode: 7 + format: DVB + release_group: FlexGet + season: 3 + title: FooBar + +? FooBar.0307.PDTV-FlexGet +? FooBar.307.PDTV-FlexGet +: options: --episode-prefer-number + episode: 307 + format: DVB + release_group: FlexGet + title: FooBar + +? FooBar.07.PDTV-FlexGet +: options: --episode-prefer-number + episode: 7 + format: DVB + release_group: FlexGet + title: FooBar + +? FooBar.7.PDTV-FlexGet +: options: --episode-prefer-number + episode: 7 + format: DVB + release_group: FlexGet + title: FooBar + +? FooBar.0307.PDTV-FlexGet +: episode: 7 + format: DVB + release_group: FlexGet + season: 3 + title: FooBar + +? 
FooBar.307.PDTV-FlexGet +: episode: 7 + format: DVB + release_group: FlexGet + season: 3 + title: FooBar + +? FooBar.07.PDTV-FlexGet +: episode: 7 + format: DVB + release_group: FlexGet + title: FooBar + +? FooBar.07v4.PDTV-FlexGet +: episode: 7 + version: 4 + format: DVB + release_group: FlexGet + title: FooBar + +? FooBar.7.PDTV-FlexGet +: format: DVB + release_group: FlexGet + title: FooBar 7 + type: movie + +? FooBar.7.PDTV-FlexGet +: options: -t episode + episode: 7 + format: DVB + release_group: FlexGet + title: FooBar + +? FooBar.7v3.PDTV-FlexGet +: options: -t episode + episode: 7 + version: 3 + format: DVB + release_group: FlexGet + title: FooBar + +? Test.S02E01.hdtv.real.proper +: episode: 1 + format: HDTV + other: Proper + proper_count: 2 + season: 2 + title: Test + +? Real.Test.S02E01.hdtv.proper +: episode: 1 + format: HDTV + other: Proper + proper_count: 1 + season: 2 + title: Real Test + +? Test.Real.S02E01.hdtv.proper +: episode: 1 + format: HDTV + other: Proper + proper_count: 1 + season: 2 + title: Test Real + +? Test.S02E01.hdtv.proper +: episode: 1 + format: HDTV + other: Proper + proper_count: 1 + season: 2 + title: Test + +? Test.S02E01.hdtv.real.repack.proper +: episode: 1 + format: HDTV + other: Proper + proper_count: 3 + season: 2 + title: Test + +? Date.Show.03-29-2012.HDTV.XViD-FlexGet +: date: 2012-03-29 + format: HDTV + release_group: FlexGet + title: Date Show + video_codec: XviD + +? Something.1x5.Season.Complete-FlexGet +: episode: 5 + other: Complete + season: 1 + title: Something + release_group: FlexGet + +? Something Seasons 1 & 2 - Complete +: other: Complete + season: + - 1 + - 2 + title: Something + +? Something Seasons 4 Complete +: other: Complete + season: 4 + title: Something + +? Something.1xAll.Season.Complete-FlexGet +: other: Complete + season: 1 + title: Something + release_group: FlexGet + +? 
Something.1xAll-FlexGet +: other: Complete + season: 1 + title: Something + episode_title: FlexGet # 1.x guess this as release_group, but it's better to guess it as episode_title + +? FlexGet.US.S2013E14.Title.Here.720p.HDTV.AAC5.1.x264-NOGRP +: audio_channels: '5.1' + audio_codec: AAC + country: US + episode: 14 + format: HDTV + release_group: NOGRP + screen_size: 720p + season: 2013 + title: FlexGet + episode_title: Title Here + video_codec: h264 + year: 2013 + +? FlexGet.14.of.21.Title.Here.720p.HDTV.AAC5.1.x264-NOGRP +: audio_channels: '5.1' + audio_codec: AAC + episode_count: 21 + episode: 14 + format: HDTV + release_group: NOGRP + screen_size: 720p + title: FlexGet + episode_title: Title Here + video_codec: h264 + +? FlexGet.Series.2013.14.of.21.Title.Here.720p.HDTV.AAC5.1.x264-NOGRP +: audio_channels: '5.1' + audio_codec: AAC + episode_count: 21 + episode: 14 + format: HDTV + release_group: NOGRP + screen_size: 720p + season: 2013 + title: FlexGet + episode_title: Title Here + video_codec: h264 + year: 2013 + +? Something.S04E05E09 +: episode: # 1.x guessit this as a range from 5 to 9. But not sure if it should ... + - 5 + - 9 + season: 4 + title: Something + +? FooBar 360 1080i +: options: --episode-prefer-number + episode: 360 + screen_size: 1080i + title: FooBar + +? FooBar 360 1080i +: episode: 60 + season: 3 + screen_size: 1080i + title: FooBar + +? FooBar 360 +: screen_size: 360p + title: FooBar + +? BarFood christmas special HDTV +: options: --expected-title BarFood + format: HDTV + title: BarFood + episode_title: christmas special + episode_details: Special + +? Something.2008x12.13-FlexGet +: title: Something + date: 2008-12-13 + episode_title: FlexGet + +? '[Ignored] Test 12' +: episode: 12 + release_group: Ignored + title: Test + +? '[FlexGet] Test 12' +: episode: 12 + release_group: FlexGet + title: Test + +? Test.13.HDTV-Ignored +: episode: 13 + format: HDTV + release_group: Ignored + title: Test + +? 
Test.13.HDTV-Ignored +: options: --expected-series test + episode: 13 + format: HDTV + release_group: Ignored + title: Test + +? Test.13.HDTV-Ignored +: title: Test + episode: 13 + format: HDTV + release_group: Ignored + +? Test.13.HDTV-Ignored +: episode: 13 + format: HDTV + release_group: Ignored + title: Test + +? Test.13.HDTV-FlexGet +: episode: 13 + format: HDTV + release_group: FlexGet + title: Test + +? Test.14.HDTV-Name +: episode: 14 + format: HDTV + release_group: Name + title: Test + +? Real.Time.With.Bill.Maher.2014.10.31.HDTV.XviD-AFG.avi +: date: 2014-10-31 + format: HDTV + release_group: AFG + title: Real Time With Bill Maher + video_codec: XviD + +? Arrow.S03E21.Al.Sah-Him.1080p.WEB-DL.DD5.1.H.264-BS.mkv +: title: Arrow + season: 3 + episode: 21 + episode_title: Al Sah-Him + screen_size: 1080p + audio_codec: DolbyDigital + audio_channels: "5.1" + video_codec: h264 + release_group: BS + format: WEB-DL + +? How to Make It in America - S02E06 - I'm Sorry, Who's Yosi?.mkv +: title: How to Make It in America + season: 2 + episode: 6 + episode_title: I'm Sorry, Who's Yosi? + +? 24.S05E07.FRENCH.DVDRip.XviD-FiXi0N.avi +: episode: 7 + format: DVD + language: fr + season: 5 + title: '24' + video_codec: XviD + release_group: FiXi0N + +? 12.Monkeys.S01E12.FRENCH.BDRip.x264-VENUE.mkv +: episode: 12 + format: BluRay + language: fr + release_group: VENUE + season: 1 + title: 12 Monkeys + video_codec: h264 + +? The.Daily.Show.2015.07.01.Kirsten.Gillibrand.Extended.720p.CC.WEBRip.AAC2.0.x264-BTW.mkv +: audio_channels: '2.0' + audio_codec: AAC + date: 2015-07-01 + format: WEBRip + other: CC + release_group: BTW + screen_size: 720p + title: The Daily Show + episode_title: Kirsten Gillibrand Extended + video_codec: h264 + +? 
The.Daily.Show.2015.07.02.Sarah.Vowell.CC.WEBRip.AAC2.0.x264-BTW.mkv +: audio_channels: '2.0' + audio_codec: AAC + date: 2015-07-02 + format: WEBRip + other: CC + release_group: BTW + title: The Daily Show + episode_title: Sarah Vowell + video_codec: h264 + +? 90.Day.Fiance.S02E07.I.Have.To.Tell.You.Something.720p.HDTV.x264-W4F +: episode: 7 + format: HDTV + screen_size: 720p + season: 2 + title: 90 Day Fiance + episode_title: I Have To Tell You Something + release_group: W4F + +? Doctor.Who.2005.S04E06.FRENCH.LD.DVDRip.XviD-TRACKS.avi +: episode: 6 + format: DVD + language: fr + release_group: TRACKS + season: 4 + title: Doctor Who + other: LD + video_codec: XviD + year: 2005 + +? Astro.Le.Petit.Robot.S01E01+02.FRENCH.DVDRiP.X264.INT-BOOLZ.mkv +: episode: [1, 2] + format: DVD + language: fr + release_group: INT-BOOLZ + season: 1 + title: Astro Le Petit Robot + video_codec: h264 + +? Annika.Bengtzon.2012.E01.Le.Testament.De.Nobel.FRENCH.DVDRiP.XViD-STVFRV.avi +: episode: 1 + format: DVD + language: fr + release_group: STVFRV + title: Annika Bengtzon + episode_title: Le Testament De Nobel + video_codec: XviD + year: 2012 + +? Dead.Set.02.FRENCH.LD.DVDRip.XviD-EPZ.avi +: episode: 2 + format: DVD + language: fr + other: LD + release_group: EPZ + title: Dead Set + video_codec: XviD + +? Phineas and Ferb S01E00 & S01E01 & S01E02 +: episode: [0, 1, 2] + season: 1 + title: Phineas and Ferb + +? Show.Name.S01E02.S01E03.HDTV.XViD.Etc-Group +: episode: [2, 3] + format: HDTV + release_group: Etc-Group + season: 1 + title: Show Name + video_codec: XviD + +? Show Name - S01E02 - S01E03 - S01E04 - Ep Name +: episode: [2, 3, 4] + season: 1 + title: Show Name + episode_title: Ep Name + +? Show.Name.1x02.1x03.HDTV.XViD.Etc-Group +: episode: [2, 3] + format: HDTV + release_group: Etc-Group + season: 1 + title: Show Name + video_codec: XviD + +? Show Name - 1x02 - 1x03 - 1x04 - Ep Name +: episode: [2, 3, 4] + season: 1 + title: Show Name + episode_title: Ep Name + +? 
Show.Name.S01E02.HDTV.XViD.Etc-Group +: episode: 2 + format: HDTV + release_group: Etc-Group + season: 1 + title: Show Name + video_codec: XviD + +? Show Name - S01E02 - My Ep Name +: episode: 2 + season: 1 + title: Show Name + episode_title: My Ep Name + +? Show Name - S01.E03 - My Ep Name +: episode: 3 + season: 1 + title: Show Name + episode_title: My Ep Name + +? Show.Name.S01E02E03.HDTV.XViD.Etc-Group +: episode: [2, 3] + format: HDTV + release_group: Etc-Group + season: 1 + title: Show Name + video_codec: XviD + +? Show Name - S01E02-03 - My Ep Name +: episode: [2, 3] + season: 1 + title: Show Name + episode_title: My Ep Name + +? Show.Name.S01.E02.E03 +: episode: [2, 3] + season: 1 + title: Show Name + +? Show_Name.1x02.HDTV_XViD_Etc-Group +: episode: 2 + format: HDTV + release_group: Etc-Group + season: 1 + title: Show Name + video_codec: XviD + +? Show Name - 1x02 - My Ep Name +: episode: 2 + season: 1 + title: Show Name + episode_title: My Ep Name + +? Show_Name.1x02x03x04.HDTV_XViD_Etc-Group +: episode: [2, 3, 4] + format: HDTV + release_group: Etc-Group + season: 1 + title: Show Name + video_codec: XviD + +? Show Name - 1x02-03-04 - My Ep Name +: episode: [2, 3, 4] + season: 1 + title: Show Name + episode_title: My Ep Name + +# 1x guess this as episode 100 but 101 as episode 1 season 1. +? Show.Name.100.Event.2010.11.23.HDTV.XViD.Etc-Group +: date: 2010-11-23 + season: 1 + episode: 0 + format: HDTV + release_group: Etc-Group + title: Show Name + episode_title: Event + video_codec: XviD + +? Show.Name.101.Event.2010.11.23.HDTV.XViD.Etc-Group +: date: 2010-11-23 + season: 1 + episode: 1 + format: HDTV + release_group: Etc-Group + title: Show Name + episode_title: Event + video_codec: XviD + +? Show.Name.2010.11.23.HDTV.XViD.Etc-Group +: date: 2010-11-23 + format: HDTV + release_group: Etc-Group + title: Show Name + +? Show Name - 2010-11-23 - Ep Name +: date: 2010-11-23 + title: Show Name + episode_title: Ep Name + +? 
Show Name Season 1 Episode 2 Ep Name +: episode: 2 + season: 1 + title: Show Name + episode_title: Ep Name + +? Show.Name.S01.HDTV.XViD.Etc-Group +: format: HDTV + release_group: Etc-Group + season: 1 + title: Show Name + video_codec: XviD + +? Show.Name.E02-03 +: episode: [2, 3] + title: Show Name + +? Show.Name.E02.2010 +: episode: 2 + year: 2010 + title: Show Name + +? Show.Name.E23.Test +: episode: 23 + title: Show Name + episode_title: Test + +? Show.Name.Part.3.HDTV.XViD.Etc-Group +: part: 3 + title: Show Name + format: HDTV + video_codec: XviD + release_group: Etc-Group + type: movie + # Fallback to movie type because we can't tell it's a series ... + +? Show.Name.Part.1.and.Part.2.Blah-Group +: part: [1, 2] + title: Show Name + type: movie + # Fallback to movie type because we can't tell it's a series ... + +? Show Name - 01 - Ep Name +: episode: 1 + title: Show Name + episode_title: Ep Name + +? 01 - Ep Name +: episode: 1 + title: Ep Name + +? Show.Name.102.HDTV.XViD.Etc-Group +: episode: 2 + format: HDTV + release_group: Etc-Group + season: 1 + title: Show Name + video_codec: XviD + +? '[HorribleSubs] Maria the Virgin Witch - 01 [720p].mkv' +: episode: 1 + release_group: HorribleSubs + screen_size: 720p + title: Maria the Virgin Witch + +? '[ISLAND]One_Piece_679_[VOSTFR]_[V1]_[8bit]_[720p]_[EB7838FC].mp4' +: options: -E + crc32: EB7838FC + episode: 679 + release_group: ISLAND + screen_size: 720p + title: One Piece + subtitle_language: fr + video_profile: 8bit + version: 1 + +? '[ISLAND]One_Piece_679_[VOSTFR]_[8bit]_[720p]_[EB7838FC].mp4' +: options: -E + crc32: EB7838FC + episode: 679 + release_group: ISLAND + screen_size: 720p + title: One Piece + subtitle_language: fr + video_profile: 8bit + +? '[Kaerizaki-Fansub]_One_Piece_679_[VOSTFR][HD_1280x720].mp4' +: options: -E + episode: 679 + other: HD + release_group: Kaerizaki-Fansub + screen_size: 720p + title: One Piece + subtitle_language: fr + +? 
'[Kaerizaki-Fansub]_One_Piece_679_[VOSTFR][FANSUB][HD_1280x720].mp4' +: options: -E + episode: 679 + other: + - Fansub + - HD + release_group: Kaerizaki-Fansub + screen_size: 720p + title: One Piece + subtitle_language: fr + +? '[Kaerizaki-Fansub]_One_Piece_681_[VOSTFR][HD_1280x720]_V2.mp4' +: options: -E + episode: 681 + other: HD + release_group: Kaerizaki-Fansub + screen_size: 720p + title: One Piece + subtitle_language: fr + version: 2 + +? '[Kaerizaki-Fansub] High School DxD New 04 VOSTFR HD (1280x720) V2.mp4' +: options: -E + episode: 4 + other: HD + release_group: Kaerizaki-Fansub + screen_size: 720p + title: High School DxD New + subtitle_language: fr + version: 2 + +? '[Kaerizaki-Fansub] One Piece 603 VOSTFR PS VITA (960x544) V2.mp4' +: options: -E + episode: 603 + release_group: Kaerizaki-Fansub + screen_size: 960x544 + title: One Piece + subtitle_language: fr + version: 2 + +? '[Group Name] Show Name.13' +: episode: 13 + release_group: Group Name + title: Show Name + +? '[Group Name] Show Name - 13' +: episode: 13 + release_group: Group Name + title: Show Name + +? '[Group Name] Show Name 13' +: episode: 13 + release_group: Group Name + title: Show Name + +# [Group Name] Show Name.13-14 +# [Group Name] Show Name - 13-14 +# Show Name 13-14 + +? '[Stratos-Subs]_Infinite_Stratos_-_12_(1280x720_H.264_AAC)_[379759DB]' +: audio_codec: AAC + crc32: 379759DB + episode: 12 + release_group: Stratos-Subs + screen_size: 720p + title: Infinite Stratos + video_codec: h264 + +# [ShinBunBu-Subs] Bleach - 02-03 (CX 1280x720 x264 AAC) + +? '[SGKK] Bleach 312v1 [720p/MKV]' +: options: -E # guessit 1.x for episode only when version is guessed, but it's doesn't make it consistent. + episode: 312 + release_group: SGKK + screen_size: 720p + title: Bleach + version: 1 + +? '[Ayako]_Infinite_Stratos_-_IS_-_07_[H264][720p][EB7838FC]' +: crc32: EB7838FC + episode: 7 + release_group: Ayako + screen_size: 720p + title: Infinite Stratos + video_codec: h264 + +? 
'[Ayako] Infinite Stratos - IS - 07v2 [H264][720p][44419534]' +: crc32: '44419534' + episode: 7 + release_group: Ayako + screen_size: 720p + title: Infinite Stratos + video_codec: h264 + version: 2 + +? '[Ayako-Shikkaku] Oniichan no Koto Nanka Zenzen Suki Janain Dakara ne - 10 [LQ][h264][720p] [8853B21C]' +: crc32: 8853B21C + episode: 10 + release_group: Ayako-Shikkaku + screen_size: 720p + title: Oniichan no Koto Nanka Zenzen Suki Janain Dakara ne + video_codec: h264 + +# TODO: Add support for absolute episodes +? Bleach - s16e03-04 - 313-314 +? Bleach.s16e03-04.313-314 +? Bleach.s16e03-04.313-314 +? Bleach - s16e03-04 - 313-314 +? Bleach.s16e03-04.313-314 +? Bleach s16e03e04 313-314 +: episode: [3, 4] + season: 16 + title: Bleach + +? Bleach - 313-314 +: options: -E + episode: [313, 314] + title: Bleach + +? '[ShinBunBu-Subs] Bleach - 02-03 (CX 1280x720 x264 AAC)' +: audio_codec: AAC + episode: [2, 3] + release_group: ShinBunBu-Subs + screen_size: 720p + title: Bleach + video_codec: h264 + +? 003. Show Name - Ep Name.avi +: episode: 3 + title: Show Name + episode_title: Ep Name + +? 003-004. Show Name - Ep Name.avi +: episode: [3, 4] + title: Show Name + episode_title: Ep Name + +? One Piece - 102 +: episode: 2 + season: 1 + title: One Piece + +? "[ACX]_Wolf's_Spirit_001.mkv" +: episode: 1 + release_group: ACX + title: "Wolf's Spirit" + +? Project.Runway.S14E00.and.S14E01.(Eng.Subs).SDTV.x264-[2Maverick].mp4 +: episode: [0, 1] + format: TV + release_group: 2Maverick + season: 14 + title: Project Runway + subtitle_language: en + video_codec: h264 + +? '[Hatsuyuki-Kaitou]_Fairy_Tail_2_-_16-20_[720p][10bit].torrent' +: episode: [16, 17, 18, 19, 20] + release_group: Hatsuyuki-Kaitou + screen_size: 720p + title: Fairy Tail 2 + video_profile: 10bit + +? 
'[Hatsuyuki-Kaitou]_Fairy_Tail_2_-_16-20_(191-195)_[720p][10bit].torrent' +: options: -E + episode: [16, 17, 18, 19, 20, 191, 192, 193, 194, 195] + release_group: Hatsuyuki-Kaitou + screen_size: 720p + title: Fairy Tail 2 + +? "Looney Tunes 1940x01 Porky's Last Stand.mkv" +: episode: 1 + season: 1940 + title: Looney Tunes + episode_title: Porky's Last Stand + year: 1940 + +? The.Good.Wife.S06E01.E10.720p.WEB-DL.DD5.1.H.264-CtrlHD/The.Good.Wife.S06E09.Trust.Issues.720p.WEB-DL.DD5.1.H.264-CtrlHD.mkv +: audio_channels: '5.1' + audio_codec: DolbyDigital + episode: 9 + format: WEB-DL + release_group: CtrlHD + screen_size: 720p + season: 6 + title: The Good Wife + episode_title: Trust Issues + video_codec: h264 + +? Fear the Walking Dead - 01x02 - So Close, Yet So Far.REPACK-KILLERS.French.C.updated.Addic7ed.com.mkv +: episode: 2 + language: fr + other: Proper + proper_count: 1 + season: 1 + title: Fear the Walking Dead + episode_title: So Close, Yet So Far + +? Fear the Walking Dead - 01x02 - En Close, Yet En Far.REPACK-KILLERS.French.C.updated.Addic7ed.com.mkv +: episode: 2 + language: fr + other: Proper + proper_count: 1 + season: 1 + title: Fear the Walking Dead + episode_title: En Close, Yet En Far + +? /av/unsorted/The.Daily.Show.2015.07.22.Jake.Gyllenhaal.720p.HDTV.x264-BATV.mkv +: date: 2015-07-22 + format: HDTV + release_group: BATV + screen_size: 720p + title: The Daily Show + episode_title: Jake Gyllenhaal + video_codec: h264 + +? "[7.1.7.8.5] Foo Bar - 11 (H.264) [5235532D].mkv" +: options: -E + episode: 11 + +? my 720p show S01E02 +: options: -T "my 720p show" + title: my 720p show + season: 1 + episode: 2 + +? my 720p show S01E02 720p +: options: -T "my 720p show" + title: my 720p show + season: 1 + episode: 2 + screen_size: 720p + +? -my 720p show S01E02 +: options: -T "re:my \d+p show" + screen_size: 720p + +? Show S01E02 +: options: -T "The Show" + title: Show + season: 1 + episode: 2 + +? 
Foo's & Bars (2009) S01E01 720p XviD-2HD[AOEU] +: episode: 1 + release_group: 2HD[AOEU] + screen_size: 720p + season: 1 + title: Foo's & Bars + type: episode + video_codec: XviD + year: 2009 + +? Date.Series.10-11-2008.XViD +: date: 2008-11-10 + title: Date + type: episode + video_codec: XviD diff --git a/lib/guessit2/test/movies.yml b/lib/guessit2/test/movies.yml new file mode 100644 index 0000000000000000000000000000000000000000..5167622c8db79688b5969ed42c0d41dd5760c3d4 --- /dev/null +++ b/lib/guessit2/test/movies.yml @@ -0,0 +1,788 @@ +? __default__ +: type: movie + +? Movies/Fear and Loathing in Las Vegas (1998)/Fear.and.Loathing.in.Las.Vegas.720p.HDDVD.DTS.x264-ESiR.mkv +: title: Fear and Loathing in Las Vegas + year: 1998 + screen_size: 720p + format: HD-DVD + audio_codec: DTS + video_codec: h264 + container: mkv + release_group: ESiR + +? Movies/El Dia de la Bestia (1995)/El.dia.de.la.bestia.DVDrip.Spanish.DivX.by.Artik[SEDG].avi +: title: El Dia de la Bestia + year: 1995 + format: DVD + language: spanish + video_codec: DivX + release_group: Artik[SEDG] + container: avi + +? Movies/Dark City (1998)/Dark.City.(1998).DC.BDRip.720p.DTS.X264-CHD.mkv +: title: Dark City + year: 1998 + format: BluRay + screen_size: 720p + audio_codec: DTS + video_codec: h264 + release_group: CHD + +? Movies/Sin City (BluRay) (2005)/Sin.City.2005.BDRip.720p.x264.AC3-SEPTiC.mkv +: title: Sin City + year: 2005 + format: BluRay + screen_size: 720p + video_codec: h264 + audio_codec: AC3 + release_group: SEPTiC + +? Movies/Borat (2006)/Borat.(2006).R5.PROPER.REPACK.DVDRip.XviD-PUKKA.avi +: title: Borat + year: 2006 + proper_count: 2 + format: DVD + other: [ R5, Proper ] + video_codec: XviD + release_group: PUKKA + +? 
"[XCT].Le.Prestige.(The.Prestige).DVDRip.[x264.HP.He-Aac.{Fr-Eng}.St{Fr-Eng}.Chaps].mkv" +: title: Le Prestige + format: DVD + video_codec: h264 + video_profile: HP + audio_codec: AAC + audio_profile: HE + language: [ french, english ] + subtitle_language: [ french, english ] + release_group: XCT + +? Battle Royale (2000)/Battle.Royale.(Batoru.Rowaiaru).(2000).(Special.Edition).CD1of2.DVDRiP.XviD-[ZeaL].avi +: title: Battle Royale + year: 2000 + edition: Special Edition + cd: 1 + cd_count: 2 + format: DVD + video_codec: XviD + release_group: ZeaL + +? Movies/Brazil (1985)/Brazil_Criterion_Edition_(1985).CD2.avi +: title: Brazil + edition: Criterion Edition + year: 1985 + cd: 2 + +? Movies/Persepolis (2007)/[XCT] Persepolis [H264+Aac-128(Fr-Eng)+ST(Fr-Eng)+Ind].mkv +: title: Persepolis + year: 2007 + video_codec: h264 + audio_codec: AAC + language: [ French, English ] + subtitle_language: [ French, English ] + release_group: XCT + +? Movies/Toy Story (1995)/Toy Story [HDTV 720p English-Spanish].mkv +: title: Toy Story + year: 1995 + format: HDTV + screen_size: 720p + language: [ english, spanish ] + +? Movies/Office Space (1999)/Office.Space.[Dual-DVDRip].[Spanish-English].[XviD-AC3-AC3].[by.Oswald].avi +: title: Office Space + year: 1999 + format: DVD + language: [ english, spanish ] + video_codec: XviD + audio_codec: AC3 + +? Movies/Wild Zero (2000)/Wild.Zero.DVDivX-EPiC.avi +: title: Wild Zero + year: 2000 + video_codec: DivX + release_group: EPiC + +? movies/Baraka_Edition_Collector.avi +: title: Baraka + edition: Collector Edition + +? Movies/Blade Runner (1982)/Blade.Runner.(1982).(Director's.Cut).CD1.DVDRip.XviD.AC3-WAF.avi +: title: Blade Runner + year: 1982 + edition: Director's cut + cd: 1 + format: DVD + video_codec: XviD + audio_codec: AC3 + release_group: WAF + +? 
movies/American.The.Bill.Hicks.Story.2009.DVDRip.XviD-EPiSODE.[UsaBit.com]/UsaBit.com_esd-americanbh.avi +: title: American The Bill Hicks Story + year: 2009 + format: DVD + video_codec: XviD + release_group: EPiSODE + website: UsaBit.com + +? movies/Charlie.And.Boots.DVDRip.XviD-TheWretched/wthd-cab.avi +: title: Charlie And Boots + format: DVD + video_codec: XviD + release_group: TheWretched + +? movies/Steig Larsson Millenium Trilogy (2009) BRrip 720 AAC x264/(1)The Girl With The Dragon Tattoo (2009) BRrip 720 AAC x264.mkv +: title: The Girl With The Dragon Tattoo + #film_title: Steig Larsson Millenium Trilogy + #film: 1 + year: 2009 + format: BluRay + audio_codec: AAC + video_codec: h264 + screen_size: 720p + +? movies/Greenberg.REPACK.LiMiTED.DVDRip.XviD-ARROW/arw-repack-greenberg.dvdrip.xvid.avi +: title: Greenberg + format: DVD + video_codec: XviD + release_group: ARROW + other: ['Proper', 'Limited'] + proper_count: 1 + +? Movies/Fr - Paris 2054, Renaissance (2005) - De Christian Volckman - (Film Divx Science Fiction Fantastique Thriller Policier N&B).avi +: title: Paris 2054, Renaissance + year: 2005 + language: french + video_codec: DivX + +? Movies/[阿维达].Avida.2006.FRENCH.DVDRiP.XViD-PROD.avi +: title: Avida + year: 2006 + language: french + format: DVD + video_codec: XviD + release_group: PROD + +? Movies/Alice in Wonderland DVDRip.XviD-DiAMOND/dmd-aw.avi +: title: Alice in Wonderland + format: DVD + video_codec: XviD + release_group: DiAMOND + +? Movies/Ne.Le.Dis.A.Personne.Fr 2 cd/personnea_mp.avi +: title: Ne Le Dis A Personne + language: french + cd_count: 2 + +? Movies/Bunker Palace Hôtel (Enki Bilal) (1989)/Enki Bilal - Bunker Palace Hotel (Fr Vhs Rip).avi +: title: Bunker Palace Hôtel + year: 1989 + language: french + format: VHS + +? Movies/21 (2008)/21.(2008).DVDRip.x264.AC3-FtS.[sharethefiles.com].mkv +: title: "21" + year: 2008 + format: DVD + video_codec: h264 + audio_codec: AC3 + release_group: FtS + website: sharethefiles.com + +? 
Movies/9 (2009)/9.2009.Blu-ray.DTS.720p.x264.HDBRiSe.[sharethefiles.com].mkv +: title: "9" + year: 2009 + format: BluRay + audio_codec: DTS + screen_size: 720p + video_codec: h264 + release_group: HDBRiSe + website: sharethefiles.com + +? Movies/Mamma.Mia.2008.DVDRip.AC3.XviD-CrazyTeam/Mamma.Mia.2008.DVDRip.AC3.XviD-CrazyTeam.avi +: title: Mamma Mia + year: 2008 + format: DVD + audio_codec: AC3 + video_codec: XviD + release_group: CrazyTeam + +? Movies/M.A.S.H. (1970)/MASH.(1970).[Divx.5.02][Dual-Subtitulos][DVDRip].ogm +: title: MASH + year: 1970 + video_codec: DivX + format: DVD + +? Movies/The Doors (1991)/09.03.08.The.Doors.(1991).BDRip.720p.AC3.X264-HiS@SiLUHD-English.[sharethefiles.com].mkv +: title: The Doors + year: 1991 + date: 2008-03-09 + format: BluRay + screen_size: 720p + audio_codec: AC3 + video_codec: h264 + release_group: HiS@SiLUHD + language: english + website: sharethefiles.com + +? Movies/The Doors (1991)/08.03.09.The.Doors.(1991).BDRip.720p.AC3.X264-HiS@SiLUHD-English.[sharethefiles.com].mkv +: options: --date-year-first + title: The Doors + year: 1991 + date: 2008-03-09 + format: BluRay + screen_size: 720p + audio_codec: AC3 + video_codec: h264 + release_group: HiS@SiLUHD + language: english + website: sharethefiles.com + +? Movies/Ratatouille/video_ts-ratatouille.srt +: title: Ratatouille + format: DVD + +# Removing this one because 001 is guessed as an episode number. +# ? Movies/001 __ A classer/Fantomas se déchaine - Louis de Funès.avi +# : title: Fantomas se déchaine + +? Movies/Comme une Image (2004)/Comme.Une.Image.FRENCH.DVDRiP.XViD-NTK.par-www.divx-overnet.com.avi +: title: Comme une Image + year: 2004 + language: french + format: DVD + video_codec: XviD + release_group: NTK + website: www.divx-overnet.com + +? 
Movies/Fantastic Mr Fox/Fantastic.Mr.Fox.2009.DVDRip.{x264+LC-AAC.5.1}{Fr-Eng}{Sub.Fr-Eng}-™.[sharethefiles.com].mkv +: title: Fantastic Mr Fox + year: 2009 + format: DVD + video_codec: h264 + audio_codec: AAC + audio_profile: LC + audio_channels: "5.1" + language: [ french, english ] + subtitle_language: [ french, english ] + website: sharethefiles.com + +? Movies/Somewhere.2010.DVDRip.XviD-iLG/i-smwhr.avi +: title: Somewhere + year: 2010 + format: DVD + video_codec: XviD + release_group: iLG + +? Movies/Moon_(2009).mkv +: title: Moon + year: 2009 + +? Movies/Moon_(2009)-x02-Making_Of.mkv +: title: Moon + year: 2009 + bonus: 2 + bonus_title: Making Of + +? movies/James_Bond-f17-Goldeneye.mkv +: title: Goldeneye + film_title: James Bond + film: 17 + + +? /movies/James_Bond-f21-Casino_Royale.mkv +: title: Casino Royale + film_title: James Bond + film: 21 + +? /movies/James_Bond-f21-Casino_Royale-x01-Becoming_Bond.mkv +: title: Casino Royale + film_title: James Bond + film: 21 + bonus: 1 + bonus_title: Becoming Bond + +? /movies/James_Bond-f21-Casino_Royale-x02-Stunts.mkv +: title: Casino Royale + film_title: James Bond + film: 21 + bonus: 2 + bonus_title: Stunts + +? OSS_117--Cairo,_Nest_of_Spies.mkv +: title: OSS 117 +# TODO: Implement subTitle for movies. + +? The Godfather Part 3.mkv +? The Godfather Part III.mkv +: title: The Godfather + part: 3 + +? Foobar Part VI.mkv +: title: Foobar + part: 6 + +? The_Insider-(1999)-x02-60_Minutes_Interview-1996.mp4 +: title: The Insider + year: 1999 + bonus: 2 + bonus_title: 60 Minutes Interview-1996 + +? Rush.._Beyond_The_Lighted_Stage-x09-Between_Sun_and_Moon-2002_Hartford.mkv +: title: Rush Beyond The Lighted Stage + bonus: 9 + bonus_title: Between Sun and Moon + year: 2002 + +? 
/public/uTorrent/Downloads Finished/Movies/Indiana.Jones.and.the.Temple.of.Doom.1984.HDTV.720p.x264.AC3.5.1-REDµX/Indiana.Jones.and.the.Temple.of.Doom.1984.HDTV.720p.x264.AC3.5.1-REDµX.mkv +: title: Indiana Jones and the Temple of Doom + year: 1984 + format: HDTV + screen_size: 720p + video_codec: h264 + audio_codec: AC3 + audio_channels: "5.1" + release_group: REDµX + +? The.Director’s.Notebook.2006.Blu-Ray.x264.DXVA.720p.AC3-de[42].mkv +: title: The Director’s Notebook + year: 2006 + format: BluRay + video_codec: h264 + video_api: DXVA + screen_size: 720p + audio_codec: AC3 + release_group: de[42] + + +? Movies/Cosmopolis.2012.LiMiTED.720p.BluRay.x264-AN0NYM0US[bb]/ano-cosmo.720p.mkv +: title: Cosmopolis + year: 2012 + screen_size: 720p + video_codec: h264 + release_group: AN0NYM0US[bb] + format: BluRay + other: Limited + +? movies/La Science des Rêves (2006)/La.Science.Des.Reves.FRENCH.DVDRip.XviD-MP-AceBot.avi +: title: La Science des Rêves + year: 2006 + format: DVD + video_codec: XviD + video_profile: MP + release_group: AceBot + language: French + +? The_Italian_Job.mkv +: title: The Italian Job + +? The.Rum.Diary.2011.1080p.BluRay.DTS.x264.D-Z0N3.mkv +: title: The Rum Diary + year: 2011 + screen_size: 1080p + format: BluRay + video_codec: h264 + audio_codec: DTS + release_group: D-Z0N3 + +? Life.Of.Pi.2012.1080p.BluRay.DTS.x264.D-Z0N3.mkv +: title: Life Of Pi + year: 2012 + screen_size: 1080p + format: BluRay + video_codec: h264 + audio_codec: DTS + release_group: D-Z0N3 + +? The.Kings.Speech.2010.1080p.BluRay.DTS.x264.D Z0N3.mkv +: title: The Kings Speech + year: 2010 + screen_size: 1080p + format: BluRay + audio_codec: DTS + video_codec: h264 + release_group: D Z0N3 + +? Street.Kings.2008.BluRay.1080p.DTS.x264.dxva EuReKA.mkv +: title: Street Kings + year: 2008 + format: BluRay + screen_size: 1080p + audio_codec: DTS + video_codec: h264 + video_api: DXVA + release_group: EuReKA + +? 
2001.A.Space.Odyssey.1968.HDDVD.1080p.DTS.x264.dxva EuReKA.mkv +: title: 2001 A Space Odyssey + year: 1968 + format: HD-DVD + screen_size: 1080p + audio_codec: DTS + video_codec: h264 + video_api: DXVA + release_group: EuReKA + +? 2012.2009.720p.BluRay.x264.DTS WiKi.mkv +: title: "2012" + year: 2009 + screen_size: 720p + format: BluRay + video_codec: h264 + audio_codec: DTS + release_group: WiKi + +? /share/Download/movie/Dead Man Down (2013) BRRiP XViD DD5_1 Custom NLSubs =-_lt Q_o_Q gt-=_/XD607ebb-BRc59935-5155473f-1c5f49/XD607ebb-BRc59935-5155473f-1c5f49.avi +: title: Dead Man Down + year: 2013 + format: BluRay + video_codec: XviD + audio_channels: "5.1" + audio_codec: DolbyDigital + uuid: XD607ebb-BRc59935-5155473f-1c5f49 + +? Pacific.Rim.3D.2013.COMPLETE.BLURAY-PCH.avi +: title: Pacific Rim + year: 2013 + format: BluRay + other: + - Complete + - 3D + release_group: PCH + +? Immersion.French.2011.STV.READNFO.QC.FRENCH.ENGLISH.NTSC.DVDR.nfo +: title: Immersion French + year: 2011 + language: + - French + - English + format: DVD + other: NTSC + +? Immersion.French.2011.STV.READNFO.QC.FRENCH.NTSC.DVDR.nfo +: title: Immersion French + year: 2011 + language: French + format: DVD + other: NTSC + +? Immersion.French.2011.STV.READNFO.QC.NTSC.DVDR.nfo +: title: Immersion + language: French + year: 2011 + format: DVD + other: NTSC + +? French.Immersion.2011.STV.READNFO.QC.ENGLISH.NTSC.DVDR.nfo +: title: French Immersion + year: 2011 + language: ENGLISH + format: DVD + other: NTSC + +? Howl's_Moving_Castle_(2004)_[720p,HDTV,x264,DTS]-FlexGet.avi +: video_codec: h264 + format: HDTV + title: Howl's Moving Castle + screen_size: 720p + year: 2004 + audio_codec: DTS + release_group: FlexGet + +? Pirates de langkasuka.2008.FRENCH.1920X1080.h264.AVC.AsiaRa.mkv +: screen_size: 1080p + year: 2008 + language: French + video_codec: h264 + title: Pirates de langkasuka + release_group: AsiaRa + +? 
Masala (2013) Telugu Movie HD DVDScr XviD - Exclusive.avi +: year: 2013 + video_codec: XviD + title: Masala + format: HD-DVD + other: Screener + language: Telugu + release_group: Exclusive + +? Django Unchained 2012 DVDSCR X264 AAC-P2P.nfo +: year: 2012 + other: Screener + video_codec: h264 + title: Django Unchained + audio_codec: AAC + format: DVD + release_group: P2P + +? Ejecutiva.En.Apuros(2009).BLURAY.SCR.Xvid.Spanish.LanzamientosD.nfo +: year: 2009 + other: Screener + format: BluRay + video_codec: XviD + language: Spanish + title: Ejecutiva En Apuros + +? Die.Schluempfe.2.German.DL.1080p.BluRay.x264-EXQUiSiTE.mkv +: title: Die Schluempfe 2 + format: BluRay + language: + - Multiple languages + - German + video_codec: h264 + release_group: EXQUiSiTE + screen_size: 1080p + +? Rocky 1976 French SubForced BRRip x264 AC3-FUNKY.mkv +: title: Rocky + year: 1976 + subtitle_language: French + format: BluRay + video_codec: h264 + audio_codec: AC3 + release_group: FUNKY + +? REDLINE (BD 1080p H264 10bit FLAC) [3xR].mkv +: title: REDLINE + format: BluRay + video_codec: h264 + video_profile: 10bit + audio_codec: FLAC + screen_size: 1080p + +? The.Lizzie.McGuire.Movie.(2003).HR.DVDRiP.avi +: title: The Lizzie McGuire Movie + year: 2003 + format: DVD + other: HR + +? Hua.Mulan.BRRIP.MP4.x264.720p-HR.avi +: title: Hua Mulan + video_codec: h264 + format: BluRay + screen_size: 720p + other: HR + +? Dr.Seuss.The.Lorax.2012.DVDRip.LiNE.XviD.AC3.HQ.Hive-CM8.mp4 +: video_codec: XviD + title: Dr Seuss The Lorax + format: DVD + other: LiNE + year: 2012 + audio_codec: AC3 + audio_profile: HQ + release_group: Hive-CM8 + +? "Star Wars: Episode IV - A New Hope (2004) Special Edition.MKV" +: title: "Star Wars: Episode IV" + alternativeTitle: A New Hope + year: 2004 + edition: Special Edition + +? 
Dr.LiNE.The.Lorax.2012.DVDRip.LiNE.XviD.AC3.HQ.Hive-CM8.mp4 +: video_codec: XviD + title: Dr LiNE The Lorax + format: DVD + other: LiNE + year: 2012 + audio_codec: AC3 + audio_profile: HQ + release_group: Hive-CM8 + +? Dr.LiNE.The.Lorax.2012.DVDRip.XviD.AC3.HQ.Hive-CM8.mp4 +: video_codec: XviD + title: Dr LiNE The Lorax + format: DVD + year: 2012 + audio_codec: AC3 + audio_profile: HQ + release_group: Hive-CM8 + +? Perfect Child-2007-TRUEFRENCH-TVRip.Xvid-h@mster.avi +: release_group: h@mster + title: Perfect Child + video_codec: XviD + language: French + format: TV + year: 2007 + +? entre.ciel.et.terre.(1994).dvdrip.h264.aac-psypeon.avi +: audio_codec: AAC + format: DVD + release_group: psypeon + title: entre ciel et terre + video_codec: h264 + year: 1994 + +? Yves.Saint.Laurent.2013.FRENCH.DVDSCR.MD.XviD-ViVARiUM.avi +: format: DVD + language: French + other: + - MD + - Screener + release_group: ViVARiUM + title: Yves Saint Laurent + video_codec: XviD + year: 2013 + +? Echec et Mort - Hard to Kill - Steven Seagal Multi 1080p BluRay x264 CCATS.avi +: format: BluRay + language: Multiple languages + release_group: CCATS + screen_size: 1080p + title: Echec et Mort + alternativeTitle: + - Hard to Kill + - Steven Seagal + video_codec: h264 + +? Paparazzi - Timsit/Lindon (MKV 1080p tvripHD) +: options: -n + title: Paparazzi + alternativeTitle: + - Timsit + - Lindon + screen_size: 1080p + format: HDTV + +? some.movie.720p.bluray.x264-mind +: title: some movie + screen_size: 720p + video_codec: h264 + release_group: mind + format: BluRay + +? Dr LiNE The Lorax 720p h264 BluRay +: title: Dr LiNE The Lorax + screen_size: 720p + video_codec: h264 + format: BluRay + +#TODO: Camelcase implementation +#? BeatdownFrenchDVDRip.mkv +#: options: -c +# title: Beatdown +# language: French +# format: DVD + +#? 
YvesSaintLaurent2013FrenchDVDScrXvid.avi +#: options: -c +# format: DVD +# language: French +# other: Screener +# title: Yves saint laurent +# video_codec: XviD +# year: 2013 + + +? Elle.s.en.va.720p.mkv +: screen_size: 720p + title: Elle s en va + +? FooBar.7.PDTV-FlexGet +: format: DVB + release_group: FlexGet + title: FooBar 7 + +? h265 - HEVC Riddick Unrated Director Cut French 1080p DTS.mkv +: audio_codec: DTS + edition: Director's cut + language: fr + screen_size: 1080p + title: Riddick + other: Unrated + video_codec: h265 + +? "[h265 - HEVC] Riddick Unrated Director Cut French [1080p DTS].mkv" +: audio_codec: DTS + edition: Director's cut + language: fr + screen_size: 1080p + title: Riddick + other: Unrated + video_codec: h265 + +? Barbecue-2014-French-mHD-1080p +: language: fr + other: mHD + screen_size: 1080p + title: Barbecue + year: 2014 + +? Underworld Quadrilogie VO+VFF+VFQ 1080p HDlight.x264~Tonyk~Monde Infernal +: language: fr + other: + - HDLight + - OV + screen_size: 1080p + title: Underworld Quadrilogie + video_codec: h264 + +? A Bout Portant (The Killers).PAL.Multi.DVD-R-KZ +: format: DVD + language: mul + release_group: KZ + title: A Bout Portant + +? "Mise à Sac (Alain Cavalier, 1967) [Vhs.Rip.Vff]" +: format: VHS + language: fr + title: "Mise à Sac" + year: 1967 + +? A Bout Portant (The Killers).PAL.Multi.DVD-R-KZ +: format: DVD + other: PAL + language: mul + release_group: KZ + title: A Bout Portant + +? Youth.In.Revolt.(Be.Bad).2009.MULTI.1080p.LAME3*92-MEDIOZZ +: audio_codec: MP3 + language: mul + release_group: MEDIOZZ + screen_size: 1080p + title: Youth In Revolt + year: 2009 + +? La Defense Lincoln (The Lincoln Lawyer) 2011 [DVDRIP][Vostfr] +: format: DVD + subtitle_language: fr + title: La Defense Lincoln + year: 2011 + +? '[h265 - HEVC] Fight Club French 1080p DTS.' +: audio_codec: DTS + language: fr + screen_size: 1080p + title: Fight Club + video_codec: h265 + +? Love Gourou (Mike Myers) - FR +: language: fr + title: Love Gourou + +? 
'[h265 - hevc] transformers 2 1080p french ac3 6ch.' +: audio_channels: '5.1' + audio_codec: AC3 + language: fr + screen_size: 1080p + title: transformers 2 + video_codec: h265 + +? 1.Angry.Man.1957.mkv +: title: 1 Angry Man + year: 1957 + +? 12.Angry.Men.1957.mkv +: title: 12 Angry Men + year: 1957 + +? 123.Angry.Men.1957.mkv +: title: 123 Angry Men + year: 1957 + +? "Looney Tunes 1444x866 Porky's Last Stand.mkv" +: screen_size: 1444x866 + title: Looney Tunes diff --git a/lib/guessit2/test/rules/__init__.py b/lib/guessit2/test/rules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e5be370e4be5007b33fd87ec270e91eea041b66a --- /dev/null +++ b/lib/guessit2/test/rules/__init__.py @@ -0,0 +1,3 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# pylint: disable=no-self-use, pointless-statement, missing-docstring, invalid-name diff --git a/lib/guessit2/test/rules/audio_codec.yml b/lib/guessit2/test/rules/audio_codec.yml new file mode 100644 index 0000000000000000000000000000000000000000..afbfae04bdfd6131e7b35248478094ca655d7b17 --- /dev/null +++ b/lib/guessit2/test/rules/audio_codec.yml @@ -0,0 +1,77 @@ +# Multiple input strings having same expected results can be chained. +# Use $ marker to check inputs that should not match results. + + +? +MP3 +? +lame +? +lame3.12 +? +lame3.100 +: audio_codec: MP3 + +? +DolbyDigital +? +DD +? -Dolby Digital +: audio_codec: DolbyDigital + +? +AAC +: audio_codec: AAC + +? +AC3 +: audio_codec: AC3 + +? +Flac +: audio_codec: FLAC + +? +DTS +: audio_codec: DTS + +? +True-HD +? +trueHD +: audio_codec: TrueHD + +? +DTS-HD +: audio_codec: DTS + audio_profile: HD + +? +DTS-HDma +: audio_codec: DTS + audio_profile: HDMA + +? +AC3-hq +: audio_codec: AC3 + audio_profile: HQ + +? +AAC-HE +: audio_codec: AAC + audio_profile: HE + +? +AAC-LC +: audio_codec: AAC + audio_profile: LC + +? +AAC2.0 +: audio_codec: AAC + audio_channels: '2.0' + +? +7.1 +? +7ch +? +8ch +: audio_channels: '7.1' + +? +5.1 +? +5ch +? 
+6ch +: audio_channels: '5.1' + +? +2ch +? +2.0 +? +stereo +: audio_channels: '2.0' + +? +1ch +? +mono +: audio_channels: '1.0' + +? DD5.1 +: audio_codec: DolbyDigital + audio_channels: '5.1' diff --git a/lib/guessit2/test/rules/bonus.yml b/lib/guessit2/test/rules/bonus.yml new file mode 100644 index 0000000000000000000000000000000000000000..6ef6f5b254a02753caf30efa0bf769f107619b8b --- /dev/null +++ b/lib/guessit2/test/rules/bonus.yml @@ -0,0 +1,9 @@ +# Multiple input strings having same expected results can be chained. +# Use - marker to check inputs that should not match results. +? Movie Title-x01-Other Title.mkv +? Movie Title-x01-Other Title +? directory/Movie Title-x01-Other Title/file.mkv +: title: Movie Title + bonus_title: Other Title + bonus: 1 + diff --git a/lib/guessit2/test/rules/cds.yml b/lib/guessit2/test/rules/cds.yml new file mode 100644 index 0000000000000000000000000000000000000000..8bb4e98c6ce7262b46992f9c4cbe8fcbaf7510c4 --- /dev/null +++ b/lib/guessit2/test/rules/cds.yml @@ -0,0 +1,5 @@ +# Multiple input strings having same expected results can be chained. +# Use - marker to check inputs that should not match results. +? cd 1of3 +: cd: 1 + cd_count: 3 diff --git a/lib/guessit2/test/rules/country.yml b/lib/guessit2/test/rules/country.yml new file mode 100644 index 0000000000000000000000000000000000000000..f2da1b2057c7e1ceea7b62e87bfaf9b30edfdb05 --- /dev/null +++ b/lib/guessit2/test/rules/country.yml @@ -0,0 +1,10 @@ +# Multiple input strings having same expected results can be chained. +# Use $ marker to check inputs that should not match results. +? Us.this.is.title +? this.is.title.US +: country: US + title: this is title + +? 
This.is.us.title +: title: This is us title + diff --git a/lib/guessit2/test/rules/date.yml b/lib/guessit2/test/rules/date.yml new file mode 100644 index 0000000000000000000000000000000000000000..d7379f03c123498a087f76bd72581fc2ca5305f5 --- /dev/null +++ b/lib/guessit2/test/rules/date.yml @@ -0,0 +1,50 @@ +# Multiple input strings having same expected results can be chained. +# Use - marker to check inputs that should not match results. +? +09.03.08 +? +09.03.2008 +? +2008.03.09 +: date: 2008-03-09 + +? +31.01.15 +? +31.01.2015 +? +15.01.31 +? +2015.01.31 +: date: 2015-01-31 + +? +01.02.03 +: date: 2003-02-01 + +? +01.02.03 +: options: --date-year-first + date: 2001-02-03 + +? +01.02.03 +: options: --date-day-first + date: 2003-02-01 + +? 1919 +? 2030 +: !!map {} + +? 2029 +: year: 2029 + +? (1920) +: year: 1920 + +? 2012 +: year: 2012 + +? 2011 2013 (2012) (2015) # first marked year is guessed. +: title: "2011 2013" + year: 2012 + +? 2012 2009 S01E02 2015 # If no year is marked, the second one is guessed. +: title: "2012" + year: 2009 + episode_title: "2015" + +? Something 2 mar 2013) +: title: Something + date: 2013-03-02 + type: episode diff --git a/lib/guessit2/test/rules/edition.yml b/lib/guessit2/test/rules/edition.yml new file mode 100644 index 0000000000000000000000000000000000000000..bc35b85e6b1afd88b18ec630f8e2be9f0cd62860 --- /dev/null +++ b/lib/guessit2/test/rules/edition.yml @@ -0,0 +1,25 @@ +# Multiple input strings having same expected results can be chained. +# Use - marker to check inputs that should not match results. +? Director's cut +? Edition Director's cut +: edition: Director's cut + +? Collector +? Collector Edition +? Edition Collector +: edition: Collector Edition + +? Special Edition +? Edition Special +? -Special +: edition: Special Edition + +? Criterion Edition +? Edition Criterion +? -Criterion +: edition: Criterion Edition + +? Deluxe +? Deluxe Edition +? 
Edition Deluxe +: edition: Deluxe Edition diff --git a/lib/guessit2/test/rules/episodes.yml b/lib/guessit2/test/rules/episodes.yml new file mode 100644 index 0000000000000000000000000000000000000000..61eea5766cdf80a336a984a059f3953d74c0e2d1 --- /dev/null +++ b/lib/guessit2/test/rules/episodes.yml @@ -0,0 +1,119 @@ +# Multiple input strings having same expected results can be chained. +# Use $ marker to check inputs that should not match results. +? +2x5 +? +2X5 +? +02x05 +? +2X05 +? +02x5 +? S02E05 +? s02e05 +? s02e5 +? s2e05 +? -s03e05 +? -s02e06 +? -3x05 +? -2x06 +: season: 2 + episode: 5 + +? "+0102" +? "+102" +: season: 1 + episode: 2 + +? "0102 S03E04" +? "S03E04 102" +: season: 3 + episode: 4 + +? +serie Saison 2 other +? +serie Season 2 other +? +serie Saisons 2 other +? +serie Seasons 2 other +? +serie Serie 2 other +? +serie Series 2 other +? +serie Season Two other +? +serie Season II other +: season: 2 + +? Some Series.S02E01.Episode.title.mkv +? Some Series/Season 02/E01-Episode title.mkv +? Some Series/Season 02/Some Series-E01-Episode title.mkv +? Some Dummy Directory/Season 02/Some Series-E01-Episode title.mkv +? -Some Dummy Directory/Season 02/E01-Episode title.mkv +? Some Series/Unsafe Season 02/Some Series-E01-Episode title.mkv +? -Some Series/Unsafe Season 02/E01-Episode title.mkv +? Some Series/Season 02/E01-Episode title.mkv +? Some Series/ Season 02/E01-Episode title.mkv +? Some Dummy Directory/Some Series S02/E01-Episode title.mkv +? Some Dummy Directory/S02 Some Series/E01-Episode title.mkv +: title: Some Series + episode_title: Episode title + season: 2 + episode: 1 + +? Some Series.S02E01.mkv +? Some Series/Season 02/E01.mkv +? Some Series/Season 02/Some Series-E01.mkv +? Some Dummy Directory/Season 02/Some Series-E01.mkv +? -Some Dummy Directory/Season 02/E01.mkv +? Some Series/Unsafe Season 02/Some Series-E01.mkv +? -Some Series/Unsafe Season 02/E01.mkv +? Some Series/Season 02/E01.mkv +? Some Series/ Season 02/E01.mkv +? 
Some Dummy Directory/Some Series S02/E01-AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA.mkv +: title: Some Series + season: 2 + episode: 1 + +? Some Series S03E01E02 +: title: Some Series + season: 3 + episode: [1, 2] + +? Some Series S01S02S03 +? Some Series S01-02-03 +? Some Series S01 S02 S03 +? -Some Series S01 02 03 +: title: Some Series + season: [1, 2, 3] + +? Some Series E01E02E03 +? Some Series E01-02-03 +? Some Series E01-03 +? Some Series E01 E02 E03 +? Some Series E01 02 03 +: title: Some Series + episode: [1, 2, 3] + +? Some Series E01E02E04 +? Some Series E01 E02 E04 +? Some Series E01 02 04 +: title: Some Series + episode: [1, 2, 4] + +? Some Series E01-02-04 +? Some Series E01-04 +? Some Series E01-04 +: title: Some Series + episode: [1, 2, 3, 4] + +? Some Series E01-02-E04 +: title: Some Series + episode: [1, 2, 4] + +? Episode 3 +? -Episode III +: episode: 3 + +? Episode 3 +? Episode III +: options: -t episode + episode: 3 + +? -A very special movie +: episode_details: Special + +? A very special episode +: options: -t episode + episode_details: Special diff --git a/lib/guessit2/test/rules/film.yml b/lib/guessit2/test/rules/film.yml new file mode 100644 index 0000000000000000000000000000000000000000..1f77433185b8b169c3749f9e88895233a661f293 --- /dev/null +++ b/lib/guessit2/test/rules/film.yml @@ -0,0 +1,9 @@ +# Multiple input strings having same expected results can be chained. +# Use - marker to check inputs that should not match results. +? Film Title-f01-Series Title.mkv +? Film Title-f01-Series Title +? directory/Film Title-f01-Series Title/file.mkv +: title: Series Title + film_title: Film Title + film: 1 + diff --git a/lib/guessit2/test/rules/format.yml b/lib/guessit2/test/rules/format.yml new file mode 100644 index 0000000000000000000000000000000000000000..cf3dea921354a7c006185580335d6d3b1a05ab2c --- /dev/null +++ b/lib/guessit2/test/rules/format.yml @@ -0,0 +1,112 @@ +# Multiple input strings having same expected results can be chained. 
+# Use - marker to check inputs that should not match results. +? +VHS +? +VHSRip +? +VHS-Rip +? +VhS_rip +? +VHS.RIP +? -VHSAnythingElse +? -SomeVHS stuff +? -VH +? -VHx +? -VHxRip +: format: VHS + +? +Cam +? +CamRip +? +CaM Rip +? +Cam_Rip +? +cam.rip +: format: Cam + +? +Telesync +? +TS +? +HD TS +? -Hd.Ts # ts file extension +? -HD.TS # ts file extension +? +Hd-Ts +: format: Telesync + +? +Workprint +? +workPrint +? +WorkPrint +? +WP +? -Work Print +: format: Workprint + +? +Telecine +? +teleCine +? +TC +? -Tele Cine +: format: Telecine + +? +PPV +? +ppv-rip +: format: PPV + +? -TV +? +SDTV +? +SDTVRIP +? +Rip sd tv +? +TvRip +? +Rip TV +: format: TV + +? +DVB +? +DVB-Rip +? +DvBRiP +? +pdTV +? +Pd Tv +: format: DVB + +? +DVD +? +DVD-RIP +? +video ts +? +DVDR +? +DVD 9 +? +dvd 5 +? -dvd ts +: format: DVD + -format: ts + +? +HDTV +? +tv rip hd +? +HDtv Rip +? +HdRip +: format: HDTV + +? +VOD +? +VodRip +? +vod rip +: format: VOD + +? +webrip +? +Web Rip +: format: WEBRip + +? +webdl +? +Web DL +? +webHD +? +WEB hd +? +web +: format: WEB-DL + +? +HDDVD +? +hd dvd +? +hdDvdRip +: format: HD-DVD + +? +BluRay +? +BluRay rip +? +BD +? +BR +? +BDRip +? +BR rip +? +BD5 +? +BD9 +? +BD25 +? +bd50 +: format: BluRay + +? XVID.NTSC.DVDR.nfo +: format: DVD diff --git a/lib/guessit2/test/rules/language.yml b/lib/guessit2/test/rules/language.yml new file mode 100644 index 0000000000000000000000000000000000000000..7871898b6c6098d130df3470473eeec24faf6013 --- /dev/null +++ b/lib/guessit2/test/rules/language.yml @@ -0,0 +1,26 @@ +# Multiple input strings having same expected results can be chained. +# Use - marker to check inputs that should not match results. +? +English +? .ENG. +: language: English + +? +French +: language: French + +? +SubFrench +? +SubFr +? +STFr +? ST.FR +: subtitle_language: French + +? +ENG.-.sub.FR +? ENG.-.FR Sub +? +ENG.-.SubFR +? +ENG.-.FRSUB +: language: English + subtitle_language: French + +? "{Fr-Eng}.St{Fr-Eng}" +? 
"Le.Prestige[x264.{Fr-Eng}.St{Fr-Eng}.Chaps].mkv" +: language: [French, English] + subtitle_language: [French, English] \ No newline at end of file diff --git a/lib/guessit2/test/rules/other.yml b/lib/guessit2/test/rules/other.yml new file mode 100644 index 0000000000000000000000000000000000000000..cce8cbd05f1bbd6464394f336f50b4028b1b053b --- /dev/null +++ b/lib/guessit2/test/rules/other.yml @@ -0,0 +1,137 @@ +# Multiple input strings having same expected results can be chained. +# Use - marker to check inputs that should not match results. +? +DVDSCR +? +DVDScreener +? +DVD-SCR +? +DVD Screener +? +DVD AnythingElse Screener +? -DVD AnythingElse SCR +: other: Screener + +? +AudioFix +? +AudioFixed +? +Audio Fix +? +Audio Fixed +: other: AudioFix + +? +SyncFix +? +SyncFixed +? +Sync Fix +? +Sync Fixed +: other: SyncFix + +? +DualAudio +? +Dual Audio +: other: DualAudio + +? +ws +? +WideScreen +? +Wide Screen +: other: WideScreen + +? +NF +? +Netflix +: other: Netflix + +# Fix and Real must be surround by others properties to be matched. +? DVD.Real.XViD +? DVD.fix.XViD +? -DVD.Real +? -DVD.Fix +? -Real.XViD +? -Fix.XViD +: other: Proper + proper_count: 1 + +? -DVD.BlablaBla.Fix.Blablabla.XVID +? -DVD.BlablaBla.Fix.XVID +? -DVD.Fix.Blablabla.XVID +: other: Proper + proper_count: 1 + + +? DVD.Real.PROPER.REPACK +: other: Proper + proper_count: 3 + + +? Proper +? +Repack +? +Rerip +: other: Proper + proper_count: 1 + +? XViD.Fansub +: other: Fansub + +? XViD.Fastsub +: other: Fastsub + +? +Season Complete +? -Complete +: other: Complete + +? R5 +? RC +: other: R5 + +? PreAir +? Pre Air +: other: Preair + +? Screener +: other: Screener + +? Remux +: other: Remux + +? 3D +: other: 3D + +? HD +: other: HD + +? mHD # ?? +: other: mHD + +? HDLight +: other: HDLight + +? HQ +: other: HQ + +? ddc +: other: DDC + +? hr +: other: HR + +? PAL +: other: PAL + +? SECAM +: other: SECAM + +? NTSC +: other: NTSC + +? CC +: other: CC + +? LD +: other: LD + +? MD +: other: MD + +? 
-The complete movie +: other: Complete + +? +The complete movie +: title: The complete movie + +? +AC3-HQ +: audio_profile: HQ + +? Other-HQ +: other: HQ diff --git a/lib/guessit2/test/rules/part.yml b/lib/guessit2/test/rules/part.yml new file mode 100644 index 0000000000000000000000000000000000000000..72f3d98a8343e6267d5567f26d03c0c29847b624 --- /dev/null +++ b/lib/guessit2/test/rules/part.yml @@ -0,0 +1,18 @@ +# Multiple input strings having same expected results can be chained. +# Use - marker to check inputs that should not match results. +? Filename Part 3.mkv +? Filename Part III.mkv +? Filename Part Three.mkv +? Filename Part Trois.mkv +: title: Filename + part: 3 + +? Part 3 +? Part III +? Part Three +? Part Trois +? Part3 +: part: 3 + +? -Something.Apt.1 +: part: 1 \ No newline at end of file diff --git a/lib/guessit2/test/rules/processors.yml b/lib/guessit2/test/rules/processors.yml new file mode 100644 index 0000000000000000000000000000000000000000..ee906b2c3f7dd3cbbded8859e276c1730844bd05 --- /dev/null +++ b/lib/guessit2/test/rules/processors.yml @@ -0,0 +1,8 @@ +# Multiple input strings having same expected results can be chained. +# Use - marker to check inputs that should not match results. + +# Prefer information for last path. +? Some movie (2000)/Some movie (2001).mkv +? Some movie (2001)/Some movie.mkv +: year: 2001 + container: mkv diff --git a/lib/guessit2/test/rules/release_group.yml b/lib/guessit2/test/rules/release_group.yml new file mode 100644 index 0000000000000000000000000000000000000000..8f1d9e9316ca5afb3070c66df1b60e0b475e16c6 --- /dev/null +++ b/lib/guessit2/test/rules/release_group.yml @@ -0,0 +1,33 @@ +# Multiple input strings having same expected results can be chained. +# Use - marker to check inputs that should not match results. +? Some.Title.XViD-ReleaseGroup +? Some.Title.XViD-ReleaseGroup.mkv +: release_group: ReleaseGroup + +? Some.Title.XViD-by.Artik[SEDG].avi +: release_group: Artik[SEDG] + +? 
"[ABC] Some.Title.XViD.avi" +? some/folder/[ABC]Some.Title.avi +: release_group: ABC + +? Some.Title.XViD-S2E02.NoReleaseGroup.avi +: release_group: !!null + +? Test.S01E01-FooBar-Group +: options: -G group -G xxxx + episode: 1 + episode_title: FooBar + release_group: Group + season: 1 + title: Test + type: episode + +? Test.S01E01-FooBar-Group +: options: -G re:gr.?up -G xxxx + episode: 1 + episode_title: FooBar + release_group: Group + season: 1 + title: Test + type: episode diff --git a/lib/guessit2/test/rules/screen_size.yml b/lib/guessit2/test/rules/screen_size.yml new file mode 100644 index 0000000000000000000000000000000000000000..b7de201c30d820bdbcaa490bd65f3d4debc74c29 --- /dev/null +++ b/lib/guessit2/test/rules/screen_size.yml @@ -0,0 +1,65 @@ +# Multiple input strings having same expected results can be chained. +# Use - marker to check inputs that should not match results. +? +360p +? +360px +? +360i +? "+360" +? +500x360 +: screen_size: 360p + +? +368p +? +368px +? +368i +? "+368" +? +500x368 +: screen_size: 368p + +? +480p +? +480px +? +480i +? "+480" +? +500x480 +: screen_size: 480p + +? +576p +? +576px +? +576i +? "+576" +? +500x576 +: screen_size: 576p + +? +720p +? +720px +? +720i +? "+720" +? +500x720 +: screen_size: 720p + +? +900p +? +900px +? +900i +? "+900" +? +500x900 +: screen_size: 900p + +? +1080p +? +1080px +? -1080i +? "+1080" +? +500x1080 +: screen_size: 1080p + +? +1080i +? -1080p +: screen_size: 1080i + +? +2160p +? +2160px +? +2160i +? "+2160" +? +4096x2160 +: screen_size: 4K + +? Test.File.720hd.bluray +? Test.File.720p50 +: screen_size: 720p diff --git a/lib/guessit2/test/rules/title.yml b/lib/guessit2/test/rules/title.yml new file mode 100644 index 0000000000000000000000000000000000000000..fffaf8a259d4422477369d498a79784c8cd9d31b --- /dev/null +++ b/lib/guessit2/test/rules/title.yml @@ -0,0 +1,32 @@ +# Multiple input strings having same expected results can be chained. 
+# Use - marker to check inputs that should not match results. +? Title Only +? -Title XViD 720p Only +? sub/folder/Title Only +? -sub/folder/Title XViD 720p Only +? Title Only.mkv +? Title Only.avi +: title: Title Only + +? Title Only/title_only.mkv +: title: Title Only + +? title_only.mkv +: title: title only + +? Some Title/some.title.mkv +? some.title/Some.Title.mkv +: title: Some Title + +? SOME TITLE/Some.title.mkv +? Some.title/SOME TITLE.mkv +: title: Some title + +? some title/Some.title.mkv +? Some.title/some title.mkv +: title: Some title + +? Some other title/Some.Other.title.mkv +? Some.Other title/Some other title.mkv +: title: Some Other title + diff --git a/lib/guessit2/test/rules/video_codec.yml b/lib/guessit2/test/rules/video_codec.yml new file mode 100644 index 0000000000000000000000000000000000000000..d195eaafe9e97313c44c2638d3133556c104ebbd --- /dev/null +++ b/lib/guessit2/test/rules/video_codec.yml @@ -0,0 +1,54 @@ +# Multiple input strings having same expected results can be chained. +# Use - marker to check inputs that should not match results. +? rv10 +? rv13 +? RV20 +? Rv30 +? rv40 +? -xrv40 +: video_codec: Real + +? mpeg2 +? MPEG2 +? -mpeg +? -mpeg 2 # Not sure if we should ignore this one ... +? -xmpeg2 +? -mpeg2x +: video_codec: Mpeg2 + +? DivX +? -div X +? divx +? dvdivx +? DVDivX +: video_codec: DivX + +? XviD +? xvid +? -x vid +: video_codec: XviD + +? h264 +? x264 +? h.264 +? x.264 +? mpeg4-AVC +? -MPEG-4 +? -mpeg4 +? -mpeg +? -h 265 +? -x265 +: video_codec: h264 + +? h265 +? x265 +? h.265 +? x.265 +? hevc +? -h 264 +? -x264 +: video_codec: h265 + +? 
h265-HP +: video_codec: h265 + video_profile: HP \ No newline at end of file diff --git a/lib/guessit2/test/rules/website.yml b/lib/guessit2/test/rules/website.yml new file mode 100644 index 0000000000000000000000000000000000000000..552386576214f69f4df9aa50037603f1f6ff35cf --- /dev/null +++ b/lib/guessit2/test/rules/website.yml @@ -0,0 +1,15 @@ +# Multiple input strings having same expected results can be chained. +# Use - marker to check inputs that should not match results. +? +tvu.org.ru +? -tvu.unsafe.ru +: website: tvu.org.ru + +? +www.nimp.na +? -somewww.nimp.na +? -www.nimp.nawouak +? -nimp.na +: website: www.nimp.na + +? +wawa.co.uk +? -wawa.uk +: website: wawa.co.uk diff --git a/lib/guessit2/test/test-input-file.txt b/lib/guessit2/test/test-input-file.txt new file mode 100644 index 0000000000000000000000000000000000000000..656bc9317ddb3b44bfcf1e7784b2cc840299885e --- /dev/null +++ b/lib/guessit2/test/test-input-file.txt @@ -0,0 +1,2 @@ +Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv +SecondFile.avi \ No newline at end of file diff --git a/lib/guessit2/test/test_api.py b/lib/guessit2/test/test_api.py new file mode 100644 index 0000000000000000000000000000000000000000..05ed9a43123f306448794beab3468dc6bc44cd69 --- /dev/null +++ b/lib/guessit2/test/test_api.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# pylint: disable=no-self-use, pointless-statement, missing-docstring, invalid-name + +import os + +import pytest + +from ..api import guessit, properties + +__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) + + +def test_default(): + ret = guessit(u'Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv') + assert ret and 'title' in ret + + +def test_unicode(): + ret = guessit(u'[阿维达].Avida.2006.FRENCH.DVDRiP.XViD-PROD.avi') + assert ret and 'title' in ret + + +def test_main_non_unicode(): + with pytest.raises(TypeError): + 
guessit(b'Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv') + + +def test_properties(): + props = properties() + assert 'video_codec' in props.keys() diff --git a/lib/guessit2/test/test_benchmark.py b/lib/guessit2/test/test_benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..7638fe2d646dcbb9ab8af414cf1206227a154e5e --- /dev/null +++ b/lib/guessit2/test/test_benchmark.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# pylint: disable=no-self-use,pointless-statement,missing-docstring,invalid-name,line-too-long +import time + +import pytest + +from ..api import guessit + + +def case1(): + return guessit(u'Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv') + + +def case2(): + return guessit(u'Movies/Fantastic Mr Fox/Fantastic.Mr.Fox.2009.DVDRip.{x264+LC-AAC.5.1}{Fr-Eng}{Sub.Fr-Eng}-™.[sharethefiles.com].mkv') + + +def case3(): + return guessit(u'Series/dexter/Dexter.5x02.Hello,.Bandit.ENG.-.sub.FR.HDTV.XviD-AlFleNi-TeaM.[tvu.org.ru].avi') + + +def case4(): + return guessit(u'Movies/The Doors (1991)/09.03.08.The.Doors.(1991).BDRip.720p.AC3.X264-HiS@SiLUHD-English.[sharethefiles.com].mkv') + + +@pytest.mark.benchmark( + group="Performance Tests", + min_time=1, + max_time=2, + min_rounds=5, + timer=time.time, + disable_gc=True, + warmup=False +) +@pytest.mark.skipif(True, reason="Disabled") +class TestBenchmark(object): + def test_case1(self, benchmark): + ret = benchmark(case1) + assert ret + + def test_case2(self, benchmark): + ret = benchmark(case2) + assert ret + + def test_case3(self, benchmark): + ret = benchmark(case3) + assert ret + + def test_case4(self, benchmark): + ret = benchmark(case4) + assert ret + diff --git a/lib/guessit2/test/test_main.py b/lib/guessit2/test/test_main.py new file mode 100644 index 0000000000000000000000000000000000000000..5c76064a60198bc7cf4b2b128fe019c91ef7adc5 --- /dev/null +++ b/lib/guessit2/test/test_main.py @@ -0,0 +1,72 @@ 
+#!/usr/bin/env python +# -*- coding: utf-8 -*- +# pylint: disable=no-self-use, pointless-statement, missing-docstring, invalid-name + +import os + +import pytest + +from ..__main__ import main + +__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) + + +def test_main_no_args(): + main([]) + + +def test_main(): + main([u'Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv']) + + +def test_main_unicode(): + main([u'[阿维达].Avida.2006.FRENCH.DVDRiP.XViD-PROD.avi']) + + +def test_main_non_unicode(): + main(['Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv']) + + +def test_main_verbose(): + main([u'Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv', '--verbose']) + + +def test_main_yaml(): + main([u'Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv', '--yaml']) + + +def test_main_json(): + main([u'Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv', '--json']) + + +def test_main_show_property(): + main([u'Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv', '-P', 'title']) + + +def test_main_advanced(): + main([u'Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv', '-a']) + + +def test_main_input(): + main(['--input', os.path.join(__location__, 'test-input-file.txt')]) + + +def test_main_properties(): + main(['-p']) + main(['-p', '--json']) + main(['-p', '--yaml']) + + +def test_main_values(): + main(['-V']) + main(['-V', '--json']) + main(['-V', '--yaml']) + + +def test_main_help(): + with pytest.raises(SystemExit): + main(['--help']) + + +def test_main_version(): + main(['--version']) diff --git a/lib/guessit2/test/test_yml.py b/lib/guessit2/test/test_yml.py new file mode 100644 index 0000000000000000000000000000000000000000..ccd78e15d8d3e7113a8bc4b2c6c9103b0e32b2b6 --- /dev/null +++ b/lib/guessit2/test/test_yml.py @@ -0,0 +1,274 @@ +#!/usr/bin/env python +# -*- 
coding: utf-8 -*- +# pylint: disable=no-self-use, pointless-statement, missing-docstring, invalid-name +import logging + +# io.open supports encoding= in python 2.7 +from io import open # pylint: disable=redefined-builtin +import os +import yaml + +import six + +import regex as re + +import babelfish +import pytest + +from rebulk.utils import is_iterable + +from guessit.options import parse_options +from ..yamlutils import OrderedDictYAMLLoader +from .. import guessit + + +logger = logging.getLogger(__name__) + +__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) + +filename_predicate = None +string_predicate = None + + +# filename_predicate = lambda filename: 'episode_title' in filename +# string_predicate = lambda string: '-DVD.BlablaBla.Fix.Blablabla.XVID' in string + + + +class EntryResult(object): + def __init__(self, string, negates=False): + self.string = string + self.negates = negates + self.valid = [] + self.missing = [] + self.different = [] + self.extra = [] + self.others = [] + + @property + def ok(self): + if self.negates: + return self.missing or self.different + return not self.missing and not self.different and not self.extra and not self.others + + @property + def warning(self): + if self.negates: + return False + return not self.missing and not self.different and self.extra + + @property + def error(self): + if self.negates: + return not self.missing and not self.different and not self.others + return self.missing or self.different or self.others + + def __repr__(self): + if self.ok: + return self.string + ': OK!' + elif self.warning: + return '%s%s: WARNING! (valid=%i, extra=%i)' % ('-' if self.negates else '', self.string, len(self.valid), + len(self.extra)) + elif self.error: + return '%s%s: ERROR! 
(valid=%i, missing=%i, different=%i, extra=%i, others=%i)' % \ + ('-' if self.negates else '', self.string, len(self.valid), len(self.missing), len(self.different), + len(self.extra), len(self.others)) + else: + return '%s%s: UNKNOWN! (valid=%i, missing=%i, different=%i, extra=%i, others=%i)' % \ + ('-' if self.negates else '', self.string, len(self.valid), len(self.missing), len(self.different), + len(self.extra), len(self.others)) + + @property + def details(self): + ret = [] + if self.valid: + ret.append('valid=' + str(len(self.valid))) + for valid in self.valid: + ret.append(' ' * 4 + str(valid)) + if self.missing: + ret.append('missing=' + str(len(self.missing))) + for missing in self.missing: + ret.append(' ' * 4 + str(missing)) + if self.different: + ret.append('different=' + str(len(self.different))) + for different in self.different: + ret.append(' ' * 4 + str(different)) + if self.extra: + ret.append('extra=' + str(len(self.extra))) + for extra in self.extra: + ret.append(' ' * 4 + str(extra)) + if self.others: + ret.append('others=' + str(len(self.others))) + for other in self.others: + ret.append(' ' * 4 + str(other)) + return ret + + +class Results(list): + def assert_ok(self): + errors = [entry for entry in self if entry.error] + assert not errors + + +def files_and_ids(predicate=None): + files = [] + ids = [] + + for (dirpath, _, filenames) in os.walk(__location__): + if dirpath == __location__: + dirpath_rel = '' + else: + dirpath_rel = os.path.relpath(dirpath, __location__) + for filename in filenames: + name, ext = os.path.splitext(filename) + filepath = os.path.join(dirpath_rel, filename) + if ext == '.yml' and (not predicate or predicate(filepath)): + files.append(filepath) + ids.append(os.path.join(dirpath_rel, name)) + + return files, ids + + +class TestYml(object): + """ + Run tests from yaml files. + Multiple input strings having same expected results can be chained. + Use - marker to check inputs that should not match results. 
+ """ + + options_re = re.compile(r'^([ \+-]+)(.*)') + + files, ids = files_and_ids(filename_predicate) + + @staticmethod + def set_default(expected, default): + if default: + for k, v in default.items(): + if k not in expected: + expected[k] = v + + @pytest.mark.parametrize('filename', files, ids=ids) + def test(self, filename, caplog): + caplog.setLevel(logging.INFO) + with open(os.path.join(__location__, filename), 'r', encoding='utf-8') as infile: + data = yaml.load(infile, OrderedDictYAMLLoader) + entries = Results() + + last_expected = None + for string, expected in reversed(list(data.items())): + if expected is None: + data[string] = last_expected + else: + last_expected = expected + + default = None + try: + default = data['__default__'] + del data['__default__'] + except KeyError: + pass + + for string, expected in data.items(): + TestYml.set_default(expected, default) + if not isinstance(string, six.text_type): + string = six.text_type(string) + if not string_predicate or string_predicate(string): # pylint: disable=not-callable + entry = self.check(string, expected) + if entry.ok: + logger.debug(u'[' + filename + '] ' + six.text_type(entry)) + elif entry.warning: + logger.warning(u'[' + filename + '] ' + six.text_type(entry)) + elif entry.error: + logger.error(u'[' + filename + '] ' + six.text_type(entry)) + for line in entry.details: + logger.error(u'[' + filename + '] ' + ' ' * 4 + line) + entries.append(entry) + entries.assert_ok() + + def check(self, string, expected): + negates, global_, string = self.parse_token_options(string) + + options = expected.get('options') + if options is None: + options = {} + if not isinstance(options, dict): + options = parse_options(options) + if 'implicit' not in options: + options['implicit'] = True + try: + result = guessit(string, options) + except Exception as exc: + logger.error('[' + string + '] Exception: ' + str(exc)) + raise exc + + entry = EntryResult(string, negates) + + if global_: + 
self.check_global(string, result, entry) + + self.check_expected(result, expected, entry) + + return entry + + def parse_token_options(self, string): + matches = self.options_re.search(string) + negates = False + global_ = False + if matches: + string = matches.group(2) + for opt in matches.group(1): + if '-' in opt: + negates = True + if '+' in opt: + global_ = True + return negates, global_, string + + def check_global(self, string, result, entry): + global_span = [] + for result_matches in result.matches.values(): + for result_match in result_matches: + if not global_span: + global_span = list(result_match.span) + else: + if global_span[0] > result_match.span[0]: + global_span[0] = result_match.span[0] + if global_span[1] < result_match.span[1]: + global_span[1] = result_match.span[1] + if global_span and global_span[1] - global_span[0] < len(string): + entry.others.append("Match is not global") + + def is_same(self, value, expected): + values = set(value) if is_iterable(value) else set((value,)) + expecteds = set(expected) if is_iterable(expected) else set((expected,)) + if len(values) != len(expecteds): + return False + if isinstance(next(iter(values)), babelfish.Language): + # pylint: disable=no-member + expecteds = set([babelfish.Language.fromguessit(expected) for expected in expecteds]) + elif isinstance(next(iter(values)), babelfish.Country): + # pylint: disable=no-member + expecteds = set([babelfish.Country.fromguessit(expected) for expected in expecteds]) + return values == expecteds + + def check_expected(self, result, expected, entry): + if expected: + for expected_key, expected_value in expected.items(): + if expected_key and expected_key != 'options' and expected_value is not None: + negates_key, _, result_key = self.parse_token_options(expected_key) + if result_key in result.keys(): + if not self.is_same(result[result_key], expected_value): + if negates_key: + entry.valid.append((expected_key, expected_value)) + else: + 
entry.different.append((expected_key, expected_value, result[result_key])) + else: + if negates_key: + entry.different.append((expected_key, expected_value, result[result_key])) + else: + entry.valid.append((expected_key, expected_value)) + elif not negates_key: + entry.missing.append((expected_key, expected_value)) + + for result_key, result_value in result.items(): + if result_key not in expected.keys(): + entry.extra.append((result_key, result_value)) diff --git a/lib/guessit2/test/various.yml b/lib/guessit2/test/various.yml new file mode 100644 index 0000000000000000000000000000000000000000..1d22a4eb916843aaeea7cc9dccf6f6828289189d --- /dev/null +++ b/lib/guessit2/test/various.yml @@ -0,0 +1,545 @@ +? Movies/Fear and Loathing in Las Vegas (1998)/Fear.and.Loathing.in.Las.Vegas.720p.HDDVD.DTS.x264-ESiR.mkv +: type: movie + title: Fear and Loathing in Las Vegas + year: 1998 + screen_size: 720p + format: HD-DVD + audio_codec: DTS + video_codec: h264 + release_group: ESiR + +? Series/Duckman/Duckman - 101 (01) - 20021107 - I, Duckman.avi +: type: episode + title: Duckman + season: 1 + episode: 1 + episode_title: I, Duckman + date: 2002-11-07 + +? Series/Neverwhere/Neverwhere.05.Down.Street.[tvu.org.ru].avi +: type: episode + title: Neverwhere + episode: 5 + episode_title: Down Street + website: tvu.org.ru + +? Neverwhere.05.Down.Street.[tvu.org.ru].avi +: type: episode + title: Neverwhere + episode: 5 + episode_title: Down Street + website: tvu.org.ru + +? Series/Breaking Bad/Minisodes/Breaking.Bad.(Minisodes).01.Good.Cop.Bad.Cop.WEBRip.XviD.avi +: type: episode + title: Breaking Bad + episode_format: Minisode + episode: 1 + episode_title: Good Cop Bad Cop + format: WEBRip + video_codec: XviD + +? Series/Kaamelott/Kaamelott - Livre V - Ep 23 - Le Forfait.avi +: type: episode + title: Kaamelott + episode: 23 + episode_title: Le Forfait + +? 
Movies/The Doors (1991)/09.03.08.The.Doors.(1991).BDRip.720p.AC3.X264-HiS@SiLUHD-English.[sharethefiles.com].mkv +: type: movie + title: The Doors + year: 1991 + date: 2008-03-09 + format: BluRay + screen_size: 720p + audio_codec: AC3 + video_codec: h264 + release_group: HiS@SiLUHD + language: english + website: sharethefiles.com + +? Movies/M.A.S.H. (1970)/MASH.(1970).[Divx.5.02][Dual-Subtitulos][DVDRip].ogm +: type: movie + title: MASH + year: 1970 + video_codec: DivX + format: DVD + +? the.mentalist.501.hdtv-lol.mp4 +: type: episode + title: the mentalist + season: 5 + episode: 1 + format: HDTV + release_group: lol + +? the.simpsons.2401.hdtv-lol.mp4 +: type: episode + title: the simpsons + season: 24 + episode: 1 + format: HDTV + release_group: lol + +? Homeland.S02E01.HDTV.x264-EVOLVE.mp4 +: type: episode + title: Homeland + season: 2 + episode: 1 + format: HDTV + video_codec: h264 + release_group: EVOLVE + +? /media/Band_of_Brothers-e01-Currahee.mkv +: type: episode + title: Band of Brothers + episode: 1 + episode_title: Currahee + +? /media/Band_of_Brothers-x02-We_Stand_Alone_Together.mkv +: type: episode + title: Band of Brothers + bonus: 2 + bonus_title: We Stand Alone Together + +? /movies/James_Bond-f21-Casino_Royale-x02-Stunts.mkv +: type: movie + title: Casino Royale + film_title: James Bond + film: 21 + bonus: 2 + bonus_title: Stunts + +? /TV Shows/new.girl.117.hdtv-lol.mp4 +: type: episode + title: new girl + season: 1 + episode: 17 + format: HDTV + release_group: lol + +? The.Office.(US).1x03.Health.Care.HDTV.XviD-LOL.avi +: type: episode + title: The Office + country: US + season: 1 + episode: 3 + episode_title: Health Care + format: HDTV + video_codec: XviD + release_group: LOL + +? The_Insider-(1999)-x02-60_Minutes_Interview-1996.mp4 +: type: movie + title: The Insider + year: 1999 + bonus: 2 + bonus_title: 60 Minutes Interview-1996 + +? 
OSS_117--Cairo,_Nest_of_Spies.mkv +: type: movie + title: OSS 117 + alternativeTitle: Cairo, Nest of Spies + +? Rush.._Beyond_The_Lighted_Stage-x09-Between_Sun_and_Moon-2002_Hartford.mkv +: type: movie + title: Rush Beyond The Lighted Stage + bonus: 9 + bonus_title: Between Sun and Moon + year: 2002 + +? House.Hunters.International.S56E06.720p.hdtv.x264.mp4 +: type: episode + title: House Hunters International + season: 56 + episode: 6 + screen_size: 720p + format: HDTV + video_codec: h264 + +? White.House.Down.2013.1080p.BluRay.DTS-HD.MA.5.1.x264-PublicHD.mkv +: type: movie + title: White House Down + year: 2013 + screen_size: 1080p + format: BluRay + audio_codec: DTS + audio_profile: HDMA + video_codec: h264 + release_group: PublicHD + audio_channels: "5.1" + +? White.House.Down.2013.1080p.BluRay.DTSHD.MA.5.1.x264-PublicHD.mkv +: type: movie + title: White House Down + year: 2013 + screen_size: 1080p + format: BluRay + audio_codec: DTS + audio_profile: HDMA + video_codec: h264 + release_group: PublicHD + audio_channels: "5.1" + +? Hostages.S01E01.Pilot.for.Air.720p.WEB-DL.DD5.1.H.264-NTb.nfo +: type: episode + title: Hostages + episode_title: Pilot for Air + season: 1 + episode: 1 + screen_size: 720p + format: WEB-DL + audio_channels: "5.1" + video_codec: h264 + audio_codec: DolbyDigital + release_group: NTb + +? Despicable.Me.2.2013.1080p.BluRay.x264-VeDeTT.nfo +: type: movie + title: Despicable Me 2 + year: 2013 + screen_size: 1080p + format: BluRay + video_codec: h264 + release_group: VeDeTT + +? Le Cinquieme Commando 1971 SUBFORCED FRENCH DVDRiP XViD AC3 Bandix.mkv +: type: movie + audio_codec: AC3 + format: DVD + release_group: Bandix + subtitle_language: French + title: Le Cinquieme Commando + video_codec: XviD + year: 1971 + +? Le Seigneur des Anneaux - La Communauté de l'Anneau - Version Longue - BDRip.mkv +: type: movie + format: BluRay + title: Le Seigneur des Anneaux + +? 
La petite bande (Michel Deville - 1983) VF PAL MP4 x264 AAC.mkv +: type: movie + audio_codec: AAC + language: French + title: La petite bande + video_codec: h264 + year: 1983 + other: PAL + +? Retour de Flammes (Gregor Schnitzler 2003) FULL DVD.iso +: type: movie + format: DVD + title: Retour de Flammes + type: movie + year: 2003 + +? A.Common.Title.Special.2014.avi +: type: movie + year: 2014 + title: A Common Title Special + +? A.Common.Title.2014.Special.avi +: type: episode + year: 2014 + title: A Common Title + episode_title: Special + episode_details: Special + +? A.Common.Title.2014.Special.Edition.avi +: type: movie + year: 2014 + title: A Common Title + edition: Special Edition + +? Downton.Abbey.2013.Christmas.Special.HDTV.x264-FoV.mp4 +: type: episode + year: 2013 + title: Downton Abbey + episode_title: Christmas Special + video_codec: h264 + release_group: FoV + format: HDTV + episode_details: Special + +? Doctor_Who_2013_Christmas_Special.The_Time_of_The_Doctor.HD +: type: episode + title: Doctor Who + other: HD + episode_details: Special + episode_title: Christmas Special The Time of The Doctor + year: 2013 + +? Doctor Who 2005 50th Anniversary Special The Day of the Doctor 3.avi +: type: episode + title: Doctor Who + episode_details: Special + episode_title: 50th Anniversary Special The Day of the Doctor 3 + year: 2005 + +? Robot Chicken S06-Born Again Virgin Christmas Special HDTV x264.avi +: type: episode + title: Robot Chicken + format: HDTV + season: 6 + episode_title: Born Again Virgin Christmas Special + video_codec: h264 + episode_details: Special + +? Wicked.Tuna.S03E00.Head.To.Tail.Special.HDTV.x264-YesTV +: type: episode + title: Wicked Tuna + episode_title: Head To Tail Special + release_group: YesTV + season: 3 + episode: 0 + video_codec: h264 + format: HDTV + episode_details: Special + +? 
The.Voice.UK.S03E12.HDTV.x264-C4TV +: episode: 12 + video_codec: h264 + format: HDTV + title: The Voice + release_group: C4TV + season: 3 + country: United Kingdom + type: episode + +? /tmp/star.trek.9/star.trek.9.mkv +: type: movie + title: star trek 9 + +? star.trek.9.mkv +: type: movie + title: star trek 9 + +? FlexGet.S01E02.TheName.HDTV.xvid +: episode: 2 + format: HDTV + season: 1 + title: FlexGet + episode_title: TheName + type: episode + video_codec: XviD + +? FlexGet.S01E02.TheName.HDTV.xvid +: episode: 2 + format: HDTV + season: 1 + title: FlexGet + episode_title: TheName + type: episode + video_codec: XviD + +? some.series.S03E14.Title.Here.720p +: episode: 14 + screen_size: 720p + season: 3 + title: some series + episode_title: Title Here + type: episode + +? '[the.group] Some.Series.S03E15.Title.Two.720p' +: episode: 15 + release_group: the.group + screen_size: 720p + season: 3 + title: Some Series + episode_title: Title Two + type: episode + +? 'HD 720p: Some series.S03E16.Title.Three' +: episode: 16 + other: HD + screen_size: 720p + season: 3 + title: Some series + episode_title: Title Three + type: episode + +? Something.Season.2.1of4.Ep.Title.HDTV.torrent +: episode_count: 4 + episode: 1 + format: HDTV + season: 2 + title: Something + episode_title: Title + type: episode + container: torrent + +? Show-A (US) - Episode Title S02E09 hdtv +: country: US + episode: 9 + format: HDTV + season: 2 + title: Show-A + type: episode + +? Jack's.Show.S03E01.blah.1080p +: episode: 1 + screen_size: 1080p + season: 3 + title: Jack's Show + episode_title: blah + type: episode + +? FlexGet.epic +: title: FlexGet epic + type: movie + +? FlexGet.Apt.1 +: title: FlexGet Apt 1 + type: movie + +? FlexGet.aptitude +: title: FlexGet aptitude + type: movie + +? FlexGet.Step1 +: title: FlexGet Step1 + type: movie + +? 
Movies/El Bosque Animado (1987)/El.Bosque.Animado.[Jose.Luis.Cuerda.1987].[Xvid-Dvdrip-720 * 432].avi +: format: DVD + screen_size: 720x432 + title: El Bosque Animado + video_codec: XviD + year: 1987 + type: movie + +? Movies/El Bosque Animado (1987)/El.Bosque.Animado.[Jose.Luis.Cuerda.1987].[Xvid-Dvdrip-720x432].avi +: format: DVD + screen_size: 720x432 + title: El Bosque Animado + video_codec: XviD + year: 1987 + type: movie + +? 2009.shoot.fruit.chan.multi.dvd9.pal +: format: DVD + language: mul + other: PAL + title: shoot fruit chan + type: movie + year: 2009 + +? 2009.shoot.fruit.chan.multi.dvd5.pal +: format: DVD + language: mul + other: PAL + title: shoot fruit chan + type: movie + year: 2009 + +? The.Flash.2014.S01E01.PREAIR.WEBRip.XviD-EVO.avi +: episode: 1 + format: WEBRip + other: Preair + release_group: EVO + season: 1 + title: The Flash + type: episode + video_codec: XviD + year: 2014 + +? Ice.Lake.Rebels.S01E06.Ice.Lake.Games.720p.HDTV.x264-DHD +: episode: 6 + format: HDTV + release_group: DHD + screen_size: 720p + season: 1 + title: Ice Lake Rebels + episode_title: Ice Lake Games + type: episode + video_codec: h264 + +? The League - S06E10 - Epi Sexy.mkv +: episode: 10 + season: 6 + title: The League + episode_title: Epi Sexy + type: episode + +? Stay (2005) [1080p]/Stay.2005.1080p.BluRay.x264.YIFY.mp4 +: format: BluRay + release_group: YIFY + screen_size: 1080p + title: Stay + type: movie + video_codec: h264 + year: 2005 + +? /media/live/A/Anger.Management.S02E82.720p.HDTV.X264-DIMENSION.mkv +: format: HDTV + release_group: DIMENSION + screen_size: 720p + title: Anger Management + type: episode + season: 2 + episode: 82 + video_codec: h264 + +? "[Figmentos] Monster 34 - At the End of Darkness [781219F1].mkv" +: type: episode + release_group: Figmentos + title: Monster + episode: 34 + episode_title: At the End of Darkness + crc32: 781219F1 + +? 
Game.of.Thrones.S05E07.720p.HDTV-KILLERS.mkv +: type: episode + episode: 7 + format: HDTV + release_group: KILLERS + screen_size: 720p + season: 5 + title: Game of Thrones + +? Game.of.Thrones.S05E07.HDTV.720p-KILLERS.mkv +: type: episode + episode: 7 + format: HDTV + release_group: KILLERS + screen_size: 720p + season: 5 + title: Game of Thrones + +? Parks and Recreation - [04x12] - Ad Campaign.avi +: type: episode + title: Parks and Recreation + season: 4 + episode: 12 + episode_title: Ad Campaign + +? Star Trek Into Darkness (2013)/star.trek.into.darkness.2013.720p.web-dl.h264-publichd.mkv +: type: movie + title: Star Trek Into Darkness + year: 2013 + screen_size: 720p + format: WEB-DL + video_codec: h264 + release_group: publichd + +? /var/medias/series/The Originals/Season 02/The.Originals.S02E15.720p.HDTV.X264-DIMENSION.mkv +: type: episode + title: The Originals + season: 2 + episode: 15 + screen_size: 720p + format: HDTV + video_codec: h264 + release_group: DIMENSION + +? Test.S01E01E07-FooBar-Group.avi +: container: avi + episode: + - 1 + - 7 + episode_title: FooBar-Group # Make sure it doesn't conflict with uuid + mimetype: video/x-msvideo + season: 1 + title: Test + type: episode diff --git a/lib/guessit2/tlds-alpha-by-domain.txt b/lib/guessit2/tlds-alpha-by-domain.txt new file mode 100644 index 0000000000000000000000000000000000000000..280c794c5471bfad0e77f7ab8b9574b5c736826c --- /dev/null +++ b/lib/guessit2/tlds-alpha-by-domain.txt @@ -0,0 +1,341 @@ +# Version 2013112900, Last Updated Fri Nov 29 07:07:01 2013 UTC +AC +AD +AE +AERO +AF +AG +AI +AL +AM +AN +AO +AQ +AR +ARPA +AS +ASIA +AT +AU +AW +AX +AZ +BA +BB +BD +BE +BF +BG +BH +BI +BIKE +BIZ +BJ +BM +BN +BO +BR +BS +BT +BV +BW +BY +BZ +CA +CAMERA +CAT +CC +CD +CF +CG +CH +CI +CK +CL +CLOTHING +CM +CN +CO +COM +CONSTRUCTION +CONTRACTORS +COOP +CR +CU +CV +CW +CX +CY +CZ +DE +DIAMONDS +DIRECTORY +DJ +DK +DM +DO +DZ +EC +EDU +EE +EG +ENTERPRISES +EQUIPMENT +ER +ES +ESTATE +ET +EU +FI +FJ +FK +FM +FO +FR 
+GA +GALLERY +GB +GD +GE +GF +GG +GH +GI +GL +GM +GN +GOV +GP +GQ +GR +GRAPHICS +GS +GT +GU +GURU +GW +GY +HK +HM +HN +HOLDINGS +HR +HT +HU +ID +IE +IL +IM +IN +INFO +INT +IO +IQ +IR +IS +IT +JE +JM +JO +JOBS +JP +KE +KG +KH +KI +KITCHEN +KM +KN +KP +KR +KW +KY +KZ +LA +LAND +LB +LC +LI +LIGHTING +LK +LR +LS +LT +LU +LV +LY +MA +MC +MD +ME +MG +MH +MIL +MK +ML +MM +MN +MO +MOBI +MP +MQ +MR +MS +MT +MU +MUSEUM +MV +MW +MX +MY +MZ +NA +NAME +NC +NE +NET +NF +NG +NI +NL +NO +NP +NR +NU +NZ +OM +ORG +PA +PE +PF +PG +PH +PHOTOGRAPHY +PK +PL +PLUMBING +PM +PN +POST +PR +PRO +PS +PT +PW +PY +QA +RE +RO +RS +RU +RW +SA +SB +SC +SD +SE +SEXY +SG +SH +SI +SINGLES +SJ +SK +SL +SM +SN +SO +SR +ST +SU +SV +SX +SY +SZ +TATTOO +TC +TD +TECHNOLOGY +TEL +TF +TG +TH +TIPS +TJ +TK +TL +TM +TN +TO +TODAY +TP +TR +TRAVEL +TT +TV +TW +TZ +UA +UG +UK +US +UY +UZ +VA +VC +VE +VENTURES +VG +VI +VN +VOYAGE +VU +WF +WS +XN--3E0B707E +XN--45BRJ9C +XN--80AO21A +XN--80ASEHDB +XN--80ASWG +XN--90A3AC +XN--CLCHC0EA0B2G2A9GCD +XN--FIQS8S +XN--FIQZ9S +XN--FPCRJ9C3D +XN--FZC2C9E2C +XN--GECRJ9C +XN--H2BRJ9C +XN--J1AMH +XN--J6W193G +XN--KPRW13D +XN--KPRY57D +XN--L1ACC +XN--LGBBAT1AD8J +XN--MGB9AWBF +XN--MGBA3A4F16A +XN--MGBAAM7A8H +XN--MGBAYH7GPA +XN--MGBBH1A71E +XN--MGBC0A9AZCG +XN--MGBERP4A5D4AR +XN--MGBX4CD0AB +XN--NGBC5AZD +XN--O3CW4H +XN--OGBPF8FL +XN--P1AI +XN--PGBS0DH +XN--Q9JYB4C +XN--S9BRJ9C +XN--UNUP4Y +XN--WGBH1C +XN--WGBL6A +XN--XKC2AL3HYE2A +XN--XKC2DL3A5EE0H +XN--YFRO4I67O +XN--YGBI2AMMX +XXX +YE +YT +ZA +ZM +ZW diff --git a/lib/guessit2/yamlutils.py b/lib/guessit2/yamlutils.py new file mode 100644 index 0000000000000000000000000000000000000000..2824575dafbe138e4ee8cb6ac252adef03fd004e --- /dev/null +++ b/lib/guessit2/yamlutils.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Options +""" +try: + from collections import OrderedDict +except ImportError: # pragma: no-cover + from ordereddict import OrderedDict # pylint:disable=import-error +import babelfish + 
+import yaml + + +class OrderedDictYAMLLoader(yaml.Loader): + """ + A YAML loader that loads mappings into ordered dictionaries. + From https://gist.github.com/enaeseth/844388 + """ + + def __init__(self, *args, **kwargs): + yaml.Loader.__init__(self, *args, **kwargs) + + self.add_constructor(u'tag:yaml.org,2002:map', type(self).construct_yaml_map) + self.add_constructor(u'tag:yaml.org,2002:omap', type(self).construct_yaml_map) + + def construct_yaml_map(self, node): + data = OrderedDict() + yield data + value = self.construct_mapping(node) + data.update(value) + + def construct_mapping(self, node, deep=False): + if isinstance(node, yaml.MappingNode): + self.flatten_mapping(node) + else: # pragma: no cover + raise yaml.constructor.ConstructorError(None, None, + 'expected a mapping node, but found %s' % node.id, node.start_mark) + + mapping = OrderedDict() + for key_node, value_node in node.value: + key = self.construct_object(key_node, deep=deep) + try: + hash(key) + except TypeError as exc: # pragma: no cover + raise yaml.constructor.ConstructorError('while constructing a mapping', + node.start_mark, 'found unacceptable key (%s)' + % exc, key_node.start_mark) + value = self.construct_object(value_node, deep=deep) + mapping[key] = value + return mapping + + +class CustomDumper(yaml.SafeDumper): + """ + Custom YAML Dumper. 
+ """ + pass + + +def default_representer(dumper, data): + """Default representer""" + return dumper.represent_str(str(data)) +CustomDumper.add_representer(babelfish.Language, default_representer) +CustomDumper.add_representer(babelfish.Country, default_representer) + + +def ordered_dict_representer(dumper, data): + """OrderedDict representer""" + return dumper.represent_dict(data) +CustomDumper.add_representer(OrderedDict, ordered_dict_representer) diff --git a/lib/rebulk/__init__.py b/lib/rebulk/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..93d5e4774a801e332ca99d7c8fb6170ebc66e0fe --- /dev/null +++ b/lib/rebulk/__init__.py @@ -0,0 +1,10 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Define simple search patterns in bulk to perform advanced matching on any string. +""" +# pylint:disable=import-self +from .rebulk import Rebulk +from .rules import Rule, CustomRule, AppendMatch, RemoveMatch, RenameMatch, AppendTags, RemoveTags +from .processors import ConflictSolver, PrivateRemover, POST_PROCESS, PRE_PROCESS +from .pattern import REGEX_AVAILABLE diff --git a/lib/rebulk/__version__.py b/lib/rebulk/__version__.py new file mode 100644 index 0000000000000000000000000000000000000000..59489449b4399e614506150ac863c988c7c79de2 --- /dev/null +++ b/lib/rebulk/__version__.py @@ -0,0 +1,7 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Version module +""" +# pragma: no cover +__version__ = '0.6.5.dev0' diff --git a/lib/rebulk/debug.py b/lib/rebulk/debug.py new file mode 100644 index 0000000000000000000000000000000000000000..2384b26ef28ceb99ae1846734117f82f78f7e851 --- /dev/null +++ b/lib/rebulk/debug.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Debug tools. + +Can be configured by changing values of those variable. + +DEBUG = False +Enable this variable to activate debug features (like defined_at parameters). It can slow down Rebulk + +LOG_LEVEL = 0 +Default log level of generated rebulk logs. 
+""" + +import inspect +import logging +import os +from collections import namedtuple + + +DEBUG = False +LOG_LEVEL = logging.DEBUG + + +class Frame(namedtuple('Frame', ['lineno', 'package', 'name', 'filename'])): + """ + Stack frame representation. + """ + __slots__ = () + + def __repr__(self): + return "%s#L%s" % (os.path.basename(self.filename), self.lineno) + + +def defined_at(): + """ + Get definition location of a pattern or a match (outside of rebulk package). + :return: + :rtype: + """ + if DEBUG: + frame = inspect.currentframe() + while frame: + try: + if frame.f_globals['__package__'] != __package__: + break + except KeyError: # pragma:no cover + # If package is missing, consider we are in. Workaround for python 3.3. + break + frame = frame.f_back + ret = Frame(frame.f_lineno, + frame.f_globals.get('__package__'), + frame.f_globals.get('__name__'), + frame.f_code.co_filename) + del frame + return ret diff --git a/lib/rebulk/formatters.py b/lib/rebulk/formatters.py new file mode 100644 index 0000000000000000000000000000000000000000..470469426e64d9a206d57368d49905ac8476a5e1 --- /dev/null +++ b/lib/rebulk/formatters.py @@ -0,0 +1,23 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Formatter functions to use in patterns. + +All those function have last argument as match.value (str). +""" + + +def formatters(*chained_formatters): + """ + Chain formatter functions. 
+ :param chained_formatters: + :type chained_formatters: + :return: + :rtype: + """ + def formatters_chain(input_string): # pylint:disable=missing-docstring + for chained_formatter in chained_formatters: + input_string = chained_formatter(input_string) + return input_string + + return formatters_chain diff --git a/lib/rebulk/introspector.py b/lib/rebulk/introspector.py new file mode 100644 index 0000000000000000000000000000000000000000..64b9836f0c5232841f53984d1185b8f9a3b8665a --- /dev/null +++ b/lib/rebulk/introspector.py @@ -0,0 +1,126 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Introspect rebulk object to retrieve capabilities. +""" +from abc import ABCMeta, abstractproperty +from collections import defaultdict + +import six +from .pattern import StringPattern, RePattern, FunctionalPattern +from .utils import extend_safe + + +@six.add_metaclass(ABCMeta) +class Description(object): + """ + Abstract class for a description. + """ + @abstractproperty + def properties(self): # pragma: no cover + """ + Properties of described object. + :return: all properties that described object can generate grouped by name. + :rtype: dict + """ + pass + + +class PatternDescription(Description): + """ + Description of a pattern. 
+ """ + def __init__(self, pattern): # pylint:disable=too-many-branches + self.pattern = pattern + self._properties = defaultdict(list) + + if pattern.properties: + for key, values in pattern.properties.items(): + extend_safe(self._properties[key], values) + elif 'value' in pattern.match_options: + self._properties[pattern.name].append(pattern.match_options['value']) + elif isinstance(pattern, StringPattern): + extend_safe(self._properties[pattern.name], pattern.patterns) + elif isinstance(pattern, RePattern): + if pattern.name and pattern.name not in pattern.private_names: + extend_safe(self._properties[pattern.name], [None]) + if not pattern.private_children: + for regex_pattern in pattern.patterns: + for group_name, values in regex_pattern.groupindex.items(): + if group_name not in pattern.private_names: + extend_safe(self._properties[group_name], [None]) + elif isinstance(pattern, FunctionalPattern): + if pattern.name and pattern.name not in pattern.private_names: + extend_safe(self._properties[pattern.name], [None]) + + + @property + def properties(self): + """ + Properties for this rule. + :return: + :rtype: dict + """ + return self._properties + + +class RuleDescription(Description): + """ + Description of a rule. + """ + def __init__(self, rule): + self.rule = rule + + self._properties = defaultdict(list) + + if rule.properties: + for key, values in rule.properties.items(): + extend_safe(self._properties[key], values) + + @property + def properties(self): + """ + Properties for this rule. + :return: + :rtype: dict + """ + return self._properties + + +class Introspection(Description): + """ + Introspection results. 
+ """ + def __init__(self, rebulk, context=None): + self.patterns = [PatternDescription(pattern) for pattern in rebulk.effective_patterns(context) + if not pattern.private and not pattern.marker] + self.rules = [RuleDescription(rule) for rule in rebulk.effective_rules(context)] + + @property + def properties(self): + """ + Properties for Introspection results. + :return: + :rtype: + """ + properties = defaultdict(list) + for pattern in self.patterns: + for key, values in pattern.properties.items(): + extend_safe(properties[key], values) + for rule in self.rules: + for key, values in rule.properties.items(): + extend_safe(properties[key], values) + return properties + + +def introspect(rebulk, context=None): + """ + Introspect a Rebulk instance to grab defined objects and properties that can be generated. + :param rebulk: + :type rebulk: Rebulk + :param context: + :type context: + :return: Introspection instance + :rtype: Introspection + """ + return Introspection(rebulk, context) diff --git a/lib/rebulk/loose.py b/lib/rebulk/loose.py new file mode 100644 index 0000000000000000000000000000000000000000..79e1a1f126e4c7ac7e94d92b751bbdfb6e4a86f9 --- /dev/null +++ b/lib/rebulk/loose.py @@ -0,0 +1,194 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Various utilities functions +""" +import inspect +import sys +from .utils import is_iterable + +if sys.version_info < (3, 4, 0): # pragma: no cover + def _constructor(class_): + """ + Retrieves constructor from given class + + :param class_: + :type class_: class + :return: constructor from given class + :rtype: callable + """ + return class_.__init__ +else: # pragma: no cover + def _constructor(class_): + """ + Retrieves constructor from given class + + :param class_: + :type class_: class + :return: constructor from given class + :rtype: callable + """ + return class_ + + +def call(function, *args, **kwargs): + """ + Call a function or constructor with given args and kwargs after removing args and kwargs that 
doesn't match + function or constructor signature + + :param function: Function or constructor to call + :type function: callable + :param args: + :type args: + :param kwargs: + :type kwargs: + :return: sale vakye as default function call + :rtype: object + """ + func = constructor_args if inspect.isclass(function) else function_args + call_args, call_kwargs = func(function, *args, **kwargs) + return function(*call_args, **call_kwargs) + + +def function_args(callable_, *args, **kwargs): + """ + Return (args, kwargs) matching the function signature + + :param callable: callable to inspect + :type callable: callable + :param args: + :type args: + :param kwargs: + :type kwargs: + :return: (args, kwargs) matching the function signature + :rtype: tuple + """ + argspec = inspect.getargspec(callable_) # pylint:disable=deprecated-method + return argspec_args(argspec, False, *args, **kwargs) + + +def constructor_args(class_, *args, **kwargs): + """ + Return (args, kwargs) matching the function signature + + :param callable: callable to inspect + :type callable: Callable + :param args: + :type args: + :param kwargs: + :type kwargs: + :return: (args, kwargs) matching the function signature + :rtype: tuple + """ + argspec = inspect.getargspec(_constructor(class_)) # pylint:disable=deprecated-method + return argspec_args(argspec, True, *args, **kwargs) + + +def argspec_args(argspec, constructor, *args, **kwargs): + """ + Return (args, kwargs) matching the argspec object + + :param argspec: argspec to use + :type argspec: argspec + :param constructor: is it a constructor ? 
+ :type constructor: bool + :param args: + :type args: + :param kwargs: + :type kwargs: + :return: (args, kwargs) matching the function signature + :rtype: tuple + """ + if argspec.keywords: + call_kwarg = kwargs + else: + call_kwarg = dict((k, kwargs[k]) for k in kwargs if k in argspec.args) # Python 2.6 dict comprehension + if argspec.varargs: + call_args = args + else: + call_args = args[:len(argspec.args) - (1 if constructor else 0)] + return call_args, call_kwarg + + +def ensure_list(param): + """ + Retrieves a list from given parameter. + + :param param: + :type param: + :return: + :rtype: + """ + if not param: + param = [] + elif not is_iterable(param): + param = [param] + return param + + +def ensure_dict(param, default_value, default_key=None): + """ + Retrieves a dict and a default value from given parameter. + + if parameter is not a dict, it will be promoted as the default value. + + :param param: + :type param: + :param default_value: + :type default_value: + :param default_key: + :type default_key: + :return: + :rtype: + """ + if not param: + param = default_value + if not isinstance(param, dict): + if param: + default_value = param + return {default_key: param}, default_value + return param, default_value + + +def filter_index(collection, predicate=None, index=None): + """ + Filter collection with predicate function and index. + + If index is not found, returns None. 
+ :param collection: + :type collection: collection supporting iteration and slicing + :param predicate: function to filter the collection with + :type predicate: function + :param index: position of a single element to retrieve + :type index: int + :return: filtered list, or single element of filtered list if index is defined + :rtype: list or object + """ + if index is None and isinstance(predicate, int): + index = predicate + predicate = None + if predicate: + collection = collection.__class__(filter(predicate, collection)) + if index is not None: + try: + collection = collection[index] + except IndexError: + collection = None + return collection + + +def set_defaults(defaults, kwargs): + """ + Set defaults from defaults dict to kwargs dict + :param defaults: + :type defaults: + :param kwargs: + :type kwargs: + :return: + :rtype: + """ + for key, value in defaults.items(): + if key not in kwargs: + kwargs[key] = value + elif isinstance(value, list) and isinstance(kwargs[key], list): + kwargs[key] = list(value) + kwargs[key] diff --git a/lib/rebulk/match.py b/lib/rebulk/match.py new file mode 100644 index 0000000000000000000000000000000000000000..86034eb25f23bb0a3fb5eba3ddd8fc51a2368bd5 --- /dev/null +++ b/lib/rebulk/match.py @@ -0,0 +1,781 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Classes and functions related to matches +""" +from collections import defaultdict, MutableSequence +import copy +try: + from collections import OrderedDict # pylint:disable=ungrouped-imports +except ImportError: # pragma: no cover + from ordereddict import OrderedDict # pylint:disable=import-error +import six + +from .loose import ensure_list, filter_index +from .utils import is_iterable +from .debug import defined_at + + +class MatchesDict(OrderedDict): + """ + A custom dict with matches property. 
+ """ + def __init__(self): + super(MatchesDict, self).__init__() + self.matches = defaultdict(list) + self.values_list = defaultdict(list) + + +class _BaseMatches(MutableSequence): + """ + A custom list[Match] that automatically maintains name, tag, start and end lookup structures. + """ + _base = list + _base_add = _base.append + _base_remove = _base.remove + + def __init__(self, matches=None, input_string=None): + self.input_string = input_string + self._max_end = 0 + self._delegate = [] + self._name_dict = defaultdict(_BaseMatches._base) + self._tag_dict = defaultdict(_BaseMatches._base) + self._start_dict = defaultdict(_BaseMatches._base) + self._end_dict = defaultdict(_BaseMatches._base) + self._index_dict = defaultdict(_BaseMatches._base) + if matches: + self.extend(matches) + + def _add_match(self, match): + """ + Add a match + :param match: + :type match: Match + """ + if match.name: + _BaseMatches._base_add(self._name_dict[match.name], (match)) + for tag in match.tags: + _BaseMatches._base_add(self._tag_dict[tag], match) + _BaseMatches._base_add(self._start_dict[match.start], match) + _BaseMatches._base_add(self._end_dict[match.end], match) + for index in range(*match.span): + _BaseMatches._base_add(self._index_dict[index], match) + if match.end > self._max_end: + self._max_end = match.end + + def _remove_match(self, match): + """ + Remove a match + :param match: + :type match: Match + """ + if match.name: + _BaseMatches._base_remove(self._name_dict[match.name], match) + for tag in match.tags: + _BaseMatches._base_remove(self._tag_dict[tag], match) + _BaseMatches._base_remove(self._start_dict[match.start], match) + _BaseMatches._base_remove(self._end_dict[match.end], match) + for index in range(*match.span): + _BaseMatches._base_remove(self._index_dict[index], match) + if match.end >= self._max_end and not self._end_dict[match.end]: + self._max_end = max(self._end_dict.keys()) + + def previous(self, match, predicate=None, index=None): + """ + Retrieves 
the nearest previous matches. + :param match: + :type match: + :param predicate: + :type predicate: + :param index: + :type index: int + :return: + :rtype: + """ + current = match.start + while current > -1: + previous_matches = self.ending(current) + if previous_matches: + return filter_index(previous_matches, predicate, index) + current -= 1 + return filter_index(_BaseMatches._base(), predicate, index) + + def next(self, match, predicate=None, index=None): + """ + Retrieves the nearest next matches. + :param match: + :type match: + :param predicate: + :type predicate: + :param index: + :type index: int + :return: + :rtype: + """ + current = match.start + 1 + while current <= self._max_end: + next_matches = self.starting(current) + if next_matches: + return filter_index(next_matches, predicate, index) + current += 1 + return filter_index(_BaseMatches._base(), predicate, index) + + def named(self, name, predicate=None, index=None): + """ + Retrieves a set of Match objects that have the given name. + :param name: + :type name: str + :param predicate: + :type predicate: + :param index: + :type index: int + :return: set of matches + :rtype: set[Match] + """ + return filter_index(_BaseMatches._base(self._name_dict[name]), predicate, index) + + def tagged(self, tag, predicate=None, index=None): + """ + Retrieves a set of Match objects that have the given tag defined. + :param tag: + :type tag: str + :param predicate: + :type predicate: + :param index: + :type index: int + :return: set of matches + :rtype: set[Match] + """ + return filter_index(_BaseMatches._base(self._tag_dict[tag]), predicate, index) + + def starting(self, start, predicate=None, index=None): + """ + Retrieves a set of Match objects that starts at given index. 
+ :param start: the starting index + :type start: int + :param predicate: + :type predicate: + :param index: + :type index: int + :return: set of matches + :rtype: set[Match] + """ + return filter_index(_BaseMatches._base(self._start_dict[start]), predicate, index) + + def ending(self, end, predicate=None, index=None): + """ + Retrieves a set of Match objects that ends at given index. + :param end: the ending index + :type end: int + :param predicate: + :type predicate: + :return: set of matches + :rtype: set[Match] + """ + return filter_index(_BaseMatches._base(self._end_dict[end]), predicate, index) + + def range(self, start=0, end=None, predicate=None, index=None): + """ + Retrieves a set of Match objects that are available in given range, sorted from start to end. + :param start: the starting index + :type start: int + :param end: the ending index + :type end: int + :param predicate: + :type predicate: + :param index: + :type index: int + :return: set of matches + :rtype: set[Match] + """ + if end is None: + end = self.max_end + else: + end = min(self.max_end, end) + ret = _BaseMatches._base() + for match in sorted(self): + if match.start < end and match.end > start: + ret.append(match) + return filter_index(ret, predicate, index) + + def chain_before(self, position, seps, start=0, predicate=None, index=None): + """ + Retrieves a list of chained matches, before position, matching predicate and separated by characters from seps + only. 
+ :param position: + :type position: + :param seps: + :type seps: + :param start: + :type start: + :param predicate: + :type predicate: + :param index: + :type index: + :return: + :rtype: + """ + if hasattr(position, 'start'): + position = position.start + + chain = _BaseMatches._base() + position = min(self.max_end, position) + + for i in reversed(range(start, position)): + index_matches = self.at_index(i) + filtered_matches = [index_match for index_match in index_matches if not predicate or predicate(index_match)] + if filtered_matches: + for chain_match in filtered_matches: + if chain_match not in chain: + chain.append(chain_match) + elif self.input_string[i] not in seps: + break + + return filter_index(chain, predicate, index) + + def chain_after(self, position, seps, end=None, predicate=None, index=None): + """ + Retrieves a list of chained matches, after position, matching predicate and separated by characters from seps + only. + :param position: + :type position: + :param seps: + :type seps: + :param end: + :type end: + :param predicate: + :type predicate: + :param index: + :type index: + :return: + :rtype: + """ + if hasattr(position, 'end'): + position = position.end + chain = _BaseMatches._base() + + if end is None: + end = self.max_end + else: + end = min(self.max_end, end) + + for i in range(position, end): + index_matches = self.at_index(i) + filtered_matches = [index_match for index_match in index_matches if not predicate or predicate(index_match)] + if filtered_matches: + for chain_match in filtered_matches: + if chain_match not in chain: + chain.append(chain_match) + elif self.input_string[i] not in seps: + break + + return filter_index(chain, predicate, index) + + @property + def max_end(self): + """ + Retrieves the maximum index. + :return: + """ + return max(len(self.input_string), self._max_end) if self.input_string else self._max_end + + def _hole_start(self, position, ignore=None): + """ + Retrieves the start of hole index from position. 
+ :param position: + :type position: + :param ignore: + :type ignore: + :return: + :rtype: + """ + for lindex in reversed(range(0, position)): + for starting in self.starting(lindex): + if not ignore or not ignore(starting): + return lindex + return 0 + + def _hole_end(self, position, ignore=None): + """ + Retrieves the end of hole index from position. + :param position: + :type position: + :param ignore: + :type ignore: + :return: + :rtype: + """ + for rindex in range(position, self.max_end): + for starting in self.starting(rindex): + if not ignore or not ignore(starting): + return rindex + return self.max_end + + def holes(self, start=0, end=None, formatter=None, ignore=None, seps=None, predicate=None, index=None): # pylint: disable=too-many-branches,too-many-locals + """ + Retrieves a set of Match objects that are not defined in given range. + :param start: + :type start: + :param end: + :type end: + :param formatter: + :type formatter: + :param ignore: + :type ignore: + :param seps: + :type seps: + :param predicate: + :type predicate: + :param index: + :type index: + :return: + :rtype: + """ + assert self.input_string if seps else True, "input_string must be defined when using seps parameter" + if end is None: + end = self.max_end + else: + end = min(self.max_end, end) + ret = _BaseMatches._base() + hole = False + rindex = start + + loop_start = self._hole_start(start, ignore) + + for rindex in range(loop_start, end): + current = [] + for at_index in self.at_index(rindex): + if not ignore or not ignore(at_index): + current.append(at_index) + + if seps and hole and self.input_string and self.input_string[rindex] in seps: + hole = False + ret[-1].end = rindex + else: + if not current and not hole: + # Open a new hole match + hole = True + ret.append(Match(max(rindex, start), None, input_string=self.input_string, formatter=formatter)) + elif current and hole: + # Close current hole match + hole = False + ret[-1].end = rindex + + if ret and hole: + # go the the 
next starting element ... + ret[-1].end = min(self._hole_end(rindex, ignore), end) + return filter_index(ret, predicate, index) + + def conflicting(self, match, predicate=None, index=None): + """ + Retrieves a list of ``Match`` objects that conflicts with given match. + :param match: + :type match: + :param predicate: + :type predicate: + :param index: + :type index: + :return: + :rtype: + """ + ret = _BaseMatches._base() + + for i in range(*match.span): + for at_match in self.at_index(i): + if at_match not in ret: + ret.append(at_match) + + ret.remove(match) + + return filter_index(ret, predicate, index) + + def at_match(self, match, predicate=None, index=None): + """ + Retrieves a list of matches from given match. + """ + return self.at_span(match.span, predicate, index) + + def at_span(self, span, predicate=None, index=None): + """ + Retrieves a list of matches from given (start, end) tuple. + """ + starting = self._index_dict[span[0]] + ending = self._index_dict[span[1] - 1] + + merged = list(starting) + for marker in ending: + if marker not in merged: + merged.append(marker) + + return filter_index(merged, predicate, index) + + def at_index(self, pos, predicate=None, index=None): + """ + Retrieves a list of matches from given position + """ + return filter_index(self._index_dict[pos], predicate, index) + + @property + def names(self): + """ + Retrieve all names. + :return: + """ + return self._name_dict.keys() + + @property + def tags(self): + """ + Retrieve all tags. + :return: + """ + return self._tag_dict.keys() + + def to_dict(self, details=False, implicit=False): + """ + Converts matches to a dict object. + :param details if True, values will be complete Match object, else it will be only string Match.value property + :type details: bool + :param implicit if True, multiple values will be set as a list in the dict. Else, only the first value + will be kept. 
+ :type implicit: bool + :return: + :rtype: dict + """ + ret = MatchesDict() + for match in sorted(self): + value = match if details else match.value + ret.matches[match.name].append(match) + if value not in ret.values_list[match.name]: + ret.values_list[match.name].append(value) + if match.name in ret.keys(): + if implicit: + if not isinstance(ret[match.name], list): + if ret[match.name] == value: + continue + ret[match.name] = [ret[match.name]] + else: + if value in ret[match.name]: + continue + ret[match.name].append(value) + else: + ret[match.name] = value + return ret + + if six.PY2: # pragma: no cover + def clear(self): + """ + Python 3 backport + """ + del self[:] + + def __len__(self): + return len(self._delegate) + + def __getitem__(self, index): + ret = self._delegate[index] + if isinstance(ret, list): + return Matches(ret) + return ret + + def __setitem__(self, index, match): + self._delegate[index] = match + if isinstance(index, slice): + for match_item in match: + self._add_match(match_item) + return + self._add_match(match) + + def __delitem__(self, index): + match = self._delegate[index] + del self._delegate[index] + if isinstance(match, list): + # if index is a slice, we has a match list + for match_item in match: + self._remove_match(match_item) + else: + self._remove_match(match) + + def __repr__(self): + return self._delegate.__repr__() + + def insert(self, index, match): + self._delegate.insert(index, match) + self._add_match(match) + + +class Matches(_BaseMatches): + """ + A custom list[Match] contains matches list. + """ + def __init__(self, matches=None, input_string=None): + self.markers = Markers(input_string=input_string) + super(Matches, self).__init__(matches=matches, input_string=input_string) + + def _add_match(self, match): + assert not match.marker, "A marker match should not be added to <Matches> object" + super(Matches, self)._add_match(match) + + +class Markers(_BaseMatches): + """ + A custom list[Match] containing markers list. 
class Match(object):
    """
    Object storing values related to a single match.

    A match covers the ``[start, end)`` span of ``input_string``. Its value is either hardcoded or computed from the
    raw matched text, optionally through ``formatter``.
    """
    def __init__(self, start, end, value=None, name=None, tags=None, marker=None, parent=None, private=None,
                 pattern=None, input_string=None, formatter=None, conflict_solver=None):
        """
        :param start: start index of the match
        :param end: end index of the match
        :param value: hardcoded value; when None the value is computed from the raw text
        :param name: name of the match
        :param tags: tags of the match, normalized with ensure_list
        :param marker: truthy if this match is a marker
        :param parent: parent match, if any
        :param private: truthy if this match is private
        :param pattern: pattern that generated this match; its defined_at is reused when given
        :param input_string: string the indices refer to
        :param formatter: function(raw) used to compute the value from the raw text
        :param conflict_solver: conflict solver attached to this match
        """
        self.start = start
        self.end = end
        self.name = name
        self._value = value
        self.tags = ensure_list(tags)
        self.marker = marker
        self.parent = parent
        self.input_string = input_string
        self.formatter = formatter
        self.pattern = pattern
        self.private = private
        self.conflict_solver = conflict_solver
        self.children = []
        self._raw_start = None
        self._raw_end = None
        self.defined_at = pattern.defined_at if pattern else defined_at()

    @property
    def span(self):
        """
        2-tuple with start and end indices of the match
        """
        return self.start, self.end

    @property
    def value(self):
        """
        Get the value of the match, using formatter if defined.

        :return: the hardcoded value when one was set (even a falsy one such as 0), else the formatted raw text,
        else the raw text.
        """
        # Compare against None: a hardcoded 0, False or '' is a legitimate value and must not be
        # replaced by the formatted raw text.
        if self._value is not None:
            return self._value
        if self.formatter:
            return self.formatter(self.raw)
        return self.raw

    @value.setter
    def value(self, value):
        """
        Set the value (hardcode)

        :param value: value to return instead of the computed one; None restores the computed value.
        """
        self._value = value  # pylint: disable=attribute-defined-outside-init

    @property
    def names(self):
        """
        Get all names of children

        :return: names of the leaf matches below this match, or this match's own name if it has no children
        :rtype: set
        """
        if not self.children:
            return set([self.name])
        ret = set()
        for child in self.children:
            ret.update(child.names)
        return ret

    @property
    def raw_start(self):
        """
        start index of raw value

        :return: the explicit raw start if one was set, else ``start``
        """
        if self._raw_start is None:
            return self.start
        return self._raw_start

    @raw_start.setter
    def raw_start(self, value):
        """
        Set start index of raw value
        """
        self._raw_start = value

    @property
    def raw_end(self):
        """
        end index of raw value

        :return: the explicit raw end if one was set, else ``end``
        """
        if self._raw_end is None:
            return self.end
        return self._raw_end

    @raw_end.setter
    def raw_end(self, value):
        """
        Set end index of raw value
        """
        self._raw_end = value

    @property
    def raw(self):
        """
        Get the raw value of the match, without using hardcoded value nor formatter.

        :return: the slice of input_string between raw_start and raw_end, or None if input_string is falsy
        """
        if self.input_string:
            return self.input_string[self.raw_start:self.raw_end]
        return None

    @property
    def initiator(self):
        """
        Retrieve the initiator parent of a match

        :return: the topmost ancestor reached by following ``parent``; self when there is no parent
        :rtype: Match
        """
        match = self
        while match.parent:
            match = match.parent
        return match

    def crop(self, crops, predicate=None, index=None):
        """
        crop the match with given Match objects or spans tuples

        The match itself is left untouched: the work is done on deep copies.

        :param crops: a single crop or several; each is a 2-tuple (start, end) or an object with a ``span``
        :type crops: list or object
        :param predicate: passed to filter_index
        :param index: passed to filter_index
        :return: the remaining pieces, filtered through filter_index
        :rtype: list[Match]
        """
        # A lone (start, end) tuple is iterable too: detect it so it is not mistaken for a list of crops.
        if not is_iterable(crops) or len(crops) == 2 and isinstance(crops[0], int):
            crops = [crops]
        initial = copy.deepcopy(self)
        ret = [initial]
        for crop in crops:
            if hasattr(crop, 'span'):
                start, end = crop.span
            else:
                start, end = crop
            # Iterate over a snapshot, ret is modified inside the loop.
            for current in list(ret):
                if start <= current.start and end >= current.end:
                    # current is included in crop, remove current ...
                    ret.remove(current)
                elif start >= current.start and end <= current.end:
                    # crop is included in current, split current ...
                    right = copy.deepcopy(current)
                    current.end = start
                    if len(current) <= 0:
                        ret.remove(current)
                    right.start = end
                    if len(right) > 0:
                        ret.append(right)
                elif end <= current.end and end > current.start:
                    # crop overlaps the beginning of current
                    current.start = end
                elif start >= current.start and start < current.end:
                    # crop overlaps the end of current
                    current.end = start
        return filter_index(ret, predicate, index)

    def split(self, seps, predicate=None, index=None):
        """
        Split this match in multiple matches using given separators.

        :param seps: separator characters
        :type seps: string containing separator characters
        :param predicate: passed to filter_index
        :param index: passed to filter_index
        :return: list of new Match objects, filtered through filter_index
        :rtype: list
        """
        split_match = copy.deepcopy(self)
        current_match = split_match
        ret = []

        # self is never modified below (only copies are), so the raw text can be sliced once.
        raw = self.raw
        for i, char in enumerate(raw):
            # NOTE: the truthiness tests on split_match rely on __len__ (an empty match is falsy).
            if char in seps:
                if not split_match:
                    split_match = copy.deepcopy(current_match)
                    current_match.end = self.start + i

            else:
                if split_match:
                    split_match.start = self.start + i
                    current_match = split_match
                    ret.append(split_match)
                    split_match = None

        return filter_index(ret, predicate, index)

    def __len__(self):
        return self.end - self.start

    def __hash__(self):
        return hash(Match) + hash(self.start) + hash(self.end) + hash(self.value)

    def __eq__(self, other):
        if isinstance(other, Match):
            return self.span == other.span and self.value == other.value and self.name == other.name and \
                self.parent == other.parent
        return NotImplemented

    def __ne__(self, other):
        if isinstance(other, Match):
            return self.span != other.span or self.value != other.value or self.name != other.name or \
                self.parent != other.parent
        return NotImplemented

    # Ordering only considers the span.
    def __lt__(self, other):
        if isinstance(other, Match):
            return self.span < other.span
        return NotImplemented

    def __gt__(self, other):
        if isinstance(other, Match):
            return self.span > other.span
        return NotImplemented

    def __le__(self, other):
        if isinstance(other, Match):
            return self.span <= other.span
        return NotImplemented

    def __ge__(self, other):
        if isinstance(other, Match):
            return self.span >= other.span
        return NotImplemented

    def __repr__(self):
        flags = ""
        name = ""
        tags = ""
        defined = ""
        if self.private:
            flags += '+private'
        if self.name:
            # Convert explicitly so a non-string name cannot break repr().
            name = "+name=" + six.text_type(self.name)
        if self.tags:
            tags = "+tags=" + six.text_type(self.tags)
        if self.defined_at:
            defined += "@" + six.text_type(self.defined_at)
        return "<%s:%s%s%s%s%s>" % (self.value, self.span, flags, name, tags, defined)
@six.add_metaclass(ABCMeta)
class Pattern(object):
    """
    Definition of a particular pattern to search for.

    Subclasses provide ``patterns``, ``match_options`` and ``_match``. ``matches`` runs ``_match`` for each base
    pattern, then applies formatters, validators and private flags to the generated matches.
    """

    def __init__(self, name=None, tags=None, formatter=None, validator=None, children=False, every=False,
                 private_parent=False, private_children=False, private=False, private_names=None, marker=False,
                 format_all=False, validate_all=False, disabled=lambda context: False, log_level=None, properties=None):
        """
        :param name: Name of this pattern
        :type name: str
        :param tags: List of tags related to this pattern
        :type tags: list[str]
        :param formatter: dict (name, func) of formatter to use with this pattern. name is the match name to support,
        and func a function(input_string) that returns the formatted string. A single formatter function can also be
        passed as a shortcut for {None: formatter}. The returned formatted string will be set in Match.value property.
        :type formatter: dict[str, func] || func
        :param validator: dict (name, func) of validator to use with this pattern. name is the match name to support,
        and func a function(match) that returns a boolean. A single validator function can also be
        passed as a shortcut for {None: validator}. If return value is False, match will be ignored.
        :param children: generates children instead of parent
        :type children: bool
        :param every: generates both parent and children.
        :type every: bool
        :param private: flag this pattern as being private.
        :type private: bool
        :param private_parent: force return of parent and flag parent matches as private.
        :type private_parent: bool
        :param private_children: force return of children and flag children matches as private.
        :type private_children: bool
        :param private_names: names of the matches to flag as private.
        :type private_names: list[str]
        :param marker: flag this pattern as being a marker.
        :type marker: bool
        :param format_all: if True, pattern will format every match in the hierarchy (even match not yield).
        :type format_all: bool
        :param validate_all: if True, pattern will validate every match in the hierarchy (even match not yield).
        :type validate_all: bool
        :param disabled: if True, this pattern is disabled. Can also be a function(context).
        :type disabled: bool|function
        :param log_level: Log level associated to this pattern
        :type log_level: int
        :param properties: value returned by the ``properties`` property when truthy.
        """
        # pylint:disable=too-many-locals
        self.name = name
        self.tags = ensure_list(tags)
        self.formatters, self._default_formatter = ensure_dict(formatter, lambda x: x)
        self.validators, self._default_validator = ensure_dict(validator, lambda match: True)
        self.every = every
        self.children = children
        self.private = private
        self.private_names = private_names if private_names else []
        self.private_parent = private_parent
        self.private_children = private_children
        self.marker = marker
        self.format_all = format_all
        self.validate_all = validate_all
        # Always expose disabled as a function(context), even when a plain value was given.
        if not callable(disabled):
            self.disabled = lambda context: disabled
        else:
            self.disabled = disabled
        self._log_level = log_level
        self._properties = properties
        self.defined_at = debug.defined_at()

    @property
    def log_level(self):
        """
        Log level for this pattern.

        :return: the level given to the constructor, or debug.LOG_LEVEL when none was given.
        """
        return self._log_level if self._log_level is not None else debug.LOG_LEVEL

    def _yield_children(self, match):
        """
        Does this match yield its children?

        :param match: match to check
        :type match: Match
        :return: truthy if match has children and this pattern is configured with ``children`` or ``every``.
        """
        return match.children and (self.children or self.every)

    def _yield_parent(self):
        """
        Does this pattern yield parent matches?

        :return: True unless ``children`` is set without ``every``.
        :rtype: bool
        """
        return not self.children or self.every

    def _match_parent(self, match, yield_parent):
        """
        Handle a parent match: assign its formatter and run its validator.

        :param match: the parent match
        :type match: Match
        :param yield_parent: whether the parent match is going to be yielded
        :type yield_parent: bool
        :return: False if the validator rejected the match, True otherwise
        :rtype: bool
        """
        # Lookup order: match name, then the '__parent__' entry, then the default.
        if yield_parent or self.format_all:
            match.formatter = self.formatters.get(match.name,
                                                  self.formatters.get('__parent__', self._default_formatter))
        if yield_parent or self.validate_all:
            validator = self.validators.get(match.name, self.validators.get('__parent__', self._default_validator))
            if not validator(match):
                return False
        return True

    def _match_child(self, child, yield_children):
        """
        Handle a child match: assign its formatter and run its validator.

        :param child: the child match
        :type child: Match
        :param yield_children: whether the children are going to be yielded
        :type yield_children: bool
        :return: False if the validator rejected the child, True otherwise
        :rtype: bool
        """
        # Lookup order: child name, then the '__children__' entry, then the default.
        if yield_children or self.format_all:
            child.formatter = self.formatters.get(child.name,
                                                  self.formatters.get('__children__', self._default_formatter))
        if yield_children or self.validate_all:
            validator = self.validators.get(child.name, self.validators.get('__children__', self._default_validator))
            if not validator(child):
                return False
        return True

    def matches(self, input_string, context=None):
        """
        Computes all matches for a given input

        :param input_string: the string to parse
        :type input_string: str
        :param context: the context
        :type context: dict
        :return: matches based on input_string for this pattern
        :rtype: list[Match]
        """

        ret = []
        for pattern in self.patterns:
            yield_parent = self._yield_parent()
            for match in self._match(pattern, input_string, context):
                yield_children = self._yield_children(match)
                if not self._match_parent(match, yield_parent):
                    continue
                # A single rejected child discards the whole match, parent included.
                validated = True
                for child in match.children:
                    if not self._match_child(child, yield_children):
                        validated = False
                        break
                if validated:
                    if self.private_parent:
                        match.private = True
                    if self.private_children:
                        for child in match.children:
                            child.private = True
                    # private_parent / private_children force the matches to be returned (flagged private).
                    if yield_parent or self.private_parent:
                        ret.append(match)
                    if yield_children or self.private_children:
                        for child in match.children:
                            ret.append(child)
        self._matches_privatize(ret)
        return ret

    def _matches_privatize(self, matches):
        """
        Mark matches included in private_names with private flag.

        :param matches: matches to update in place
        :type matches: list[Match]
        """
        if self.private_names:
            for child in matches:
                if child.name in self.private_names:
                    child.private = True

    @abstractproperty
    def patterns(self):  # pragma: no cover
        """
        List of base patterns defined

        :return: A list of base patterns
        :rtype: list
        """
        pass

    @property
    def properties(self):
        """
        Properties names and values that can be retrieved by this pattern.

        :return: the ``properties`` given to the constructor when truthy, else an empty dict.
        """
        if self._properties:
            return self._properties
        return {}

    @abstractproperty
    def match_options(self):  # pragma: no cover
        """
        dict of default options for generated Match objects

        :return: **options to pass to Match constructor
        :rtype: dict
        """
        pass

    @abstractmethod
    def _match(self, pattern, input_string, context=None):  # pragma: no cover
        """
        Computes all matches for a given pattern and input

        :param pattern: the pattern to use
        :param input_string: the string to parse
        :type input_string: str
        :param context: the context
        :type context: dict
        :return: matches based on input_string for this pattern
        :rtype: iterator[Match]
        """
        pass

    def __repr__(self):
        defined = ""
        if self.defined_at:
            defined = "@" + six.text_type(self.defined_at)
        return "<%s%s:%s>" % (self.__class__.__name__, defined, self.patterns)
class RePattern(Pattern):
    """
    Definition of one or many regular expression pattern to search for.

    Each capturing group of a regular expression generates a child match of the main match.
    """

    def __init__(self, *patterns, **kwargs):
        """
        :param patterns: regular expressions, each given as a string, a dict of re.compile keyword arguments,
        an iterable of re.compile positional arguments, or an already compiled pattern.
        :param kwargs: pattern and match options. ``repeated_captures`` (defaults to REGEX_AVAILABLE) generates one
        child per capture of a repeated group; ``abbreviations`` is a list of (key, replacement) pairs
        substituted in the pattern strings before compilation.
        :raise NotImplementedError: if repeated_captures is requested without the regex module.
        """
        call(super(RePattern, self).__init__, **kwargs)
        self.repeated_captures = REGEX_AVAILABLE
        if 'repeated_captures' in kwargs:
            self.repeated_captures = kwargs.get('repeated_captures')
        if self.repeated_captures and not REGEX_AVAILABLE:  # pragma: no cover
            raise NotImplementedError("repeated_capture is available only with regex module.")
        self.abbreviations = kwargs.get('abbreviations', [])
        self._kwargs = kwargs
        self._match_kwargs = _filter_match_kwargs(kwargs)
        self._children_match_kwargs = _filter_match_kwargs(kwargs, children=True)
        self._patterns = []
        for pattern in patterns:
            if isinstance(pattern, six.string_types):
                if self.abbreviations and pattern:
                    for key, replacement in self.abbreviations:
                        pattern = pattern.replace(key, replacement)
                pattern = call(re.compile, pattern, **self._kwargs)
            elif isinstance(pattern, dict):
                if self.abbreviations and 'pattern' in pattern:
                    # Work on a copy: the dict belongs to the caller and may be reused.
                    pattern = dict(pattern)
                    for key, replacement in self.abbreviations:
                        pattern['pattern'] = pattern['pattern'].replace(key, replacement)
                pattern = re.compile(**pattern)
            elif hasattr(pattern, '__iter__'):
                pattern = re.compile(*pattern)
            self._patterns.append(pattern)

    @property
    def patterns(self):
        """
        Compiled regular expressions of this pattern.
        """
        return self._patterns

    @property
    def match_options(self):
        """
        Options passed to the Match constructor for main matches.
        """
        return self._match_kwargs

    def _match(self, pattern, input_string, context=None):
        """
        Yield one main match per occurrence of pattern, with one child match per captured group.

        :param pattern: compiled regular expression
        :param input_string: the string to parse
        :type input_string: str
        :param context: the context (unused here)
        :return: iterator of main matches
        :rtype: iterator[Match]
        """
        # Map group index -> group name; unnamed groups fall back to the main match name.
        names = dict((index, name) for name, index in pattern.groupindex.items())
        for match_object in pattern.finditer(input_string):
            main_match = call(Match, match_object.start(), match_object.end(), pattern=self,
                              input_string=input_string, **self._match_kwargs)

            for i in range(1, pattern.groups + 1):
                name = names.get(i, main_match.name)
                if self.repeated_captures:
                    spans = match_object.spans(i)
                else:
                    span = match_object.span(i)
                    # A group that did not take part in the match has span (-1, -1):
                    # it must not generate a child.
                    spans = [span] if span[0] > -1 and span[1] > -1 else []
                for child_start, child_end in spans:
                    child_match = call(Match, child_start, child_end, name=name, parent=main_match, pattern=self,
                                       input_string=input_string, **self._children_match_kwargs)
                    main_match.children.append(child_match)

            yield main_match
+ """ + + def __init__(self, *patterns, **kwargs): + call(super(FunctionalPattern, self).__init__, **kwargs) + self._patterns = patterns + self._kwargs = kwargs + self._match_kwargs = _filter_match_kwargs(kwargs) + + @property + def patterns(self): + return self._patterns + + @property + def match_options(self): + return self._match_kwargs + + def _match(self, pattern, input_string, context=None): + ret = call(pattern, input_string, context, **self._kwargs) + if ret: + if not is_iterable(ret) or isinstance(ret, dict) \ + or (is_iterable(ret) and hasattr(ret, '__getitem__') and isinstance(ret[0], int)): + args_iterable = [ret] + else: + args_iterable = ret + for args in args_iterable: + if isinstance(args, dict): + options = args + options.pop('input_string', None) + options.pop('pattern', None) + if self._match_kwargs: + options = self._match_kwargs.copy() + options.update(args) + yield call(Match, pattern=self, input_string=input_string, **options) + else: + kwargs = self._match_kwargs + if isinstance(args[-1], dict): + kwargs = dict(kwargs) + kwargs.update(args[-1]) + args = args[:-1] + yield call(Match, *args, pattern=self, input_string=input_string, **kwargs) + + +def _filter_match_kwargs(kwargs, children=False): + """ + Filters out kwargs for Match construction + + :param kwargs: + :type kwargs: dict + :param children: + :type children: Flag to filter children matches + :return: A filtered dict + :rtype: dict + """ + kwargs = kwargs.copy() + for key in ('pattern', 'start', 'end', 'parent', 'formatter'): + if key in kwargs: + del kwargs[key] + if children: + for key in ('name',): + if key in kwargs: + del kwargs[key] + return kwargs diff --git a/lib/rebulk/processors.py b/lib/rebulk/processors.py new file mode 100644 index 0000000000000000000000000000000000000000..b0a69fc8e88e89a03e940a121541445fd73054bd --- /dev/null +++ b/lib/rebulk/processors.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Processor functions +""" +from logging 
import getLogger + +from .utils import IdentitySet + +from .rules import Rule, RemoveMatch + +log = getLogger(__name__).log + +DEFAULT = '__default__' + +POST_PROCESS = -2048 +PRE_PROCESS = 2048 + + +def _default_conflict_solver(match, conflicting_match): + """ + Default conflict solver for matches, shorter matches if they conflicts with longer ones + + :param conflicting_match: + :type conflicting_match: + :param match: + :type match: + :return: + :rtype: + """ + if len(conflicting_match.initiator) < len(match.initiator): + return conflicting_match + elif len(match.initiator) < len(conflicting_match.initiator): + return match + return None + + +class ConflictSolver(Rule): + """ + Remove conflicting matches. + """ + priority = PRE_PROCESS + + consequence = RemoveMatch + + @property + def default_conflict_solver(self): # pylint:disable=no-self-use + """ + Default conflict solver to use. + """ + return _default_conflict_solver + + def when(self, matches, context): + to_remove_matches = IdentitySet() + for match in filter(lambda match: not match.private, matches): + conflicting_matches = matches.conflicting(match) + + if conflicting_matches: + # keep the match only if it's the longest + for conflicting_match in filter(lambda match: not match.private, conflicting_matches): + reverse = False + conflict_solvers = [(self.default_conflict_solver, False)] + + if match.conflict_solver: + conflict_solvers.append((match.conflict_solver, False)) + if conflicting_match.conflict_solver: + conflict_solvers.append((conflicting_match.conflict_solver, True)) + + for conflict_solver, reverse in reversed(conflict_solvers): + if reverse: + to_remove = conflict_solver(conflicting_match, match) + else: + to_remove = conflict_solver(match, conflicting_match) + if to_remove == DEFAULT: + continue + if to_remove and to_remove not in to_remove_matches: + to_remove_matches.add(to_remove) + break + return to_remove_matches + + +class PrivateRemover(Rule): + """ + Removes private matches rule. 
class Rebulk(object):
    r"""
    Container of regular expression, string and function based patterns. ``string``, ``regex`` and ``functional``
    methods return the ``Rebulk`` object itself, so pattern declarations can be chained.

    .. code-block:: python

        >>> from rebulk import Rebulk
        >>> bulk = Rebulk().string('brown').regex(r'qu\w+').functional(lambda s: (20, 25))

    Once configured, call ``matches`` with an input string to retrieve all ``Match`` objects found by the
    registered patterns.

    .. code-block:: python

        >>> bulk.matches("The quick brown fox jumps over the lazy dog")
        [<brown:(10, 15)>, <quick:(4, 9)>, <jumps:(20, 25)>]

    If multiple ``Match`` objects are found at the same position, only the longer one is kept.

    .. code-block:: python

        >>> bulk = Rebulk().string('lakers').string('la')
        >>> bulk.matches("the lakers are from la")
        [<lakers:(4, 10)>, <la:(20, 22)>]
    """
    # pylint:disable=protected-access

    def __init__(self, disabled=lambda context: False, default_rules=True):
        """
        Creates a new Rebulk object.

        :param disabled: if True, this rebulk is disabled. Can also be a function(context).
        :type disabled: bool|function
        :param default_rules: if truthy, ConflictSolver and PrivateRemover rules are registered
        :type default_rules: bool
        """
        # disabled is always stored as a function(context)
        self.disabled = disabled if callable(disabled) else (lambda context: disabled)
        self._patterns = []
        self._rules = Rules()
        if default_rules:
            self.rules(ConflictSolver, PrivateRemover)
        self._defaults = {}
        self._regex_defaults = {}
        self._string_defaults = {}
        self._functional_defaults = {}
        self._rebulks = []

    def pattern(self, *pattern):
        """
        Register pattern objects.

        :param pattern: patterns to add
        :type pattern: rebulk.pattern.Pattern
        :return: self
        :rtype: Rebulk
        """
        self._patterns.extend(pattern)
        return self

    def defaults(self, **kwargs):
        """
        Set the default keyword arguments shared by all kinds of patterns.

        :param kwargs: default keyword arguments
        :return: self
        :rtype: Rebulk
        """
        self._defaults = kwargs
        return self

    def regex_defaults(self, **kwargs):
        """
        Set the default keyword arguments of regex patterns.

        :param kwargs: default keyword arguments
        :return: self
        :rtype: Rebulk
        """
        self._regex_defaults = kwargs
        return self

    def regex(self, *pattern, **kwargs):
        """
        Register a regular expression pattern.

        :param pattern: arguments of RePattern
        :param kwargs: keyword arguments of RePattern, completed with regex defaults then global defaults
        :return: self
        :rtype: Rebulk
        """
        for defaults in (self._regex_defaults, self._defaults):
            set_defaults(defaults, kwargs)
        self.pattern(RePattern(*pattern, **kwargs))
        return self

    def string_defaults(self, **kwargs):
        """
        Set the default keyword arguments of string patterns.

        :param kwargs: default keyword arguments
        :return: self
        :rtype: Rebulk
        """
        self._string_defaults = kwargs
        return self

    def string(self, *pattern, **kwargs):
        """
        Register a string pattern.

        :param pattern: arguments of StringPattern
        :param kwargs: keyword arguments of StringPattern, completed with string defaults then global defaults
        :return: self
        :rtype: Rebulk
        """
        for defaults in (self._string_defaults, self._defaults):
            set_defaults(defaults, kwargs)
        self.pattern(StringPattern(*pattern, **kwargs))
        return self

    def functional_defaults(self, **kwargs):
        """
        Set the default keyword arguments of functional patterns.

        :param kwargs: default keyword arguments
        :return: self
        :rtype: Rebulk
        """
        self._functional_defaults = kwargs
        return self

    def functional(self, *pattern, **kwargs):
        """
        Register a functional pattern.

        :param pattern: arguments of FunctionalPattern
        :param kwargs: keyword arguments of FunctionalPattern, completed with functional defaults then global
        defaults
        :return: self
        :rtype: Rebulk
        """
        for defaults in (self._functional_defaults, self._defaults):
            set_defaults(defaults, kwargs)
        self.pattern(FunctionalPattern(*pattern, **kwargs))
        return self

    def rules(self, *rules):
        """
        Register rules given as modules, classes or instances.

        :param rules: rules to load
        :type rules: list[Rule]
        :return: self
        :rtype: Rebulk
        """
        self._rules.load(*rules)
        return self

    def rebulk(self, *rebulks):
        """
        Register children rebulk objects.

        :param rebulks: children to add
        :type rebulks: Rebulk
        :return: self
        :rtype: Rebulk
        """
        self._rebulks.extend(rebulks)
        return self

    def matches(self, string, context=None):
        """
        Search for all matches with current configuration against input_string

        :param string: string to search into
        :type string: str
        :param context: context to use, an empty dict when None
        :type context: dict
        :return: A custom list of matches
        :rtype: Matches
        """
        found = Matches(input_string=string)
        effective_context = {} if context is None else context

        self._matches_patterns(found, effective_context)
        self._execute_rules(found, effective_context)

        return found

    def effective_rules(self, context=None):
        """
        Get effective rules for this rebulk object and its enabled children.

        :param context: context given to the ``disabled`` function of each child
        :return: a new Rules object
        :rtype: Rules
        """
        effective = Rules()
        effective.extend(self._rules)
        for child in self._rebulks:
            if child.disabled(context):
                continue
            extend_safe(effective, child._rules)
        return effective

    def _execute_rules(self, matches, context):
        """
        Execute rules for this rebulk and children, unless this rebulk is disabled.

        :param matches: matches the rules work on
        :type matches: Matches
        :param context: context to use
        :type context: dict
        """
        if self.disabled(context):
            return
        self.effective_rules(context).execute_all_rules(matches, context)

    def effective_patterns(self, context=None):
        """
        Get effective patterns for this rebulk object and its enabled children.

        :param context: context given to the ``disabled`` function of each child
        :return: a new list of patterns
        :rtype: list
        """
        effective = list(self._patterns)
        for child in self._rebulks:
            if child.disabled(context):
                continue
            extend_safe(effective, child._patterns)
        return effective

    def _matches_patterns(self, matches, context):
        """
        Search for all matches with current patterns against input_string

        :param matches: matches list, filled in place (markers go to ``matches.markers``)
        :type matches: Matches
        :param context: context to use
        :type context: dict
        """
        if self.disabled(context):
            return

        for pattern in self.effective_patterns(context):
            level = pattern.log_level
            if pattern.disabled(context):
                log(level, "Pattern is disabled. (%s)", pattern)
                continue

            pattern_matches = pattern.matches(matches.input_string, context)
            if pattern_matches:
                log(level, "Pattern has %s match(es). (%s)", len(pattern_matches), pattern)

            for match in pattern_matches:
                if match.marker:
                    log(level, "Marker found. (%s)", match)
                    matches.markers.append(match)
                else:
                    log(level, "Match found. (%s)", match)
                    matches.append(match)
@six.add_metaclass(ABCMeta)
class CustomRule(Condition, Consequence):
    """
    Definition of a rule to apply: a condition (``when``) together with its consequence (``then``).

    Rules are ordered by descending ``priority``, and two rules are equal when they have the same class.
    """
    # pylint: disable=no-self-use, unused-argument, abstract-method
    priority = 0
    name = None
    dependency = None
    properties = {}

    def __init__(self, log_level=None):
        """
        :param log_level: log level of this rule. When None, the ``log_level`` attribute already available
        (for instance defined on the class) is kept, falling back to debug.LOG_LEVEL.
        :type log_level: int
        """
        self.defined_at = debug.defined_at()
        if log_level is not None:
            # An explicit level always wins; it used to be silently dropped.
            self.log_level = log_level
        elif not hasattr(self, 'log_level'):
            self.log_level = debug.LOG_LEVEL

    def enabled(self, context):
        """
        Tell whether this rule is enabled. Always True here, subclasses can override it.

        :param context: context in use
        :return: True if rule is enabled, False if disabled
        :rtype: bool
        """
        return True

    def __lt__(self, other):
        # Reversed on purpose: sorting rules puts the highest priority first.
        return self.priority > other.priority

    def __repr__(self):
        defined = ""
        if self.defined_at:
            defined = "@" + six.text_type(self.defined_at)
        return "<%s%s>" % (self.name if self.name else self.__class__.__name__, defined)

    def __eq__(self, other):
        return self.__class__ == other.__class__

    def __hash__(self):
        return hash(self.__class__)
class AppendMatch(Consequence):  # pylint: disable=abstract-method
    """
    Consequence appending the matches returned by the condition to the matches list.
    """
    def __init__(self, match_name=None):
        """
        :param match_name: when truthy, name assigned to the matches
        :type match_name: str
        """
        self.match_name = match_name

    def then(self, matches, when_response, context):
        """
        Append when_response (a single match or an iterable of matches) to matches.

        :param matches: matches list to fill
        :param when_response: match or matches returned by the condition
        :param context: context in use
        :return: the list of matches actually appended for an iterable response, else the response itself
        """
        if not is_iterable(when_response):
            if self.match_name:
                when_response.name = self.match_name
            if when_response not in matches:
                matches.append(when_response)
            return when_response

        appended = []
        for candidate in list(when_response):
            # Membership is tested before renaming, as the name takes part in match equality.
            if candidate in matches:
                continue
            if self.match_name:
                candidate.name = self.match_name
            matches.append(candidate)
            appended.append(candidate)
        return appended
class Rules(list):
    """
    list of rules ready to execute.
    """

    def __init__(self, *rules):
        """
        :param rules: rule modules, classes or instances to load
        """
        super(Rules, self).__init__()
        self.load(*rules)

    def load(self, *rules):
        """
        Load rules from a Rule module, class or instance

        :param rules: modules (every class they define is loaded), classes (instantiated without argument)
        or any other object (appended as is)
        """
        for rule in rules:
            if inspect.ismodule(rule):
                self.load_module(rule)
            elif inspect.isclass(rule):
                self.load_class(rule)
            else:
                self.append(rule)

    def load_module(self, module):
        """
        Load a rules module: every class defined in the module itself is instantiated and appended.

        :param module: module to inspect
        :type module: module
        """
        def defined_in_module(member):
            # inspect.isclass has to be called: the bare function object is always truthy, which let
            # functions defined in the module go through load_class and be called.
            return inspect.isclass(member) and getattr(member, '__module__', None) == module.__name__

        for _, obj in inspect.getmembers(module, defined_in_module):
            self.load_class(obj)

    def load_class(self, class_):
        """
        Load a Rule class.

        :param class_: class to instantiate without argument and append
        :type class_: type
        """
        self.append(class_())

    def execute_all_rules(self, matches, context):
        """
        Execute all rules from this rules list. Rules are grouped by priority, then by dependency level
        (toposort); inside a group they run in their initial order.

        :param matches: matches the rules work on
        :param context: context in use
        :return: list of (rule, when_response) for each rule that returned a response other than None
        :rtype: list
        """
        ret = []
        # sorted() relies on the rules' own ordering (by priority), as required by groupby.
        for priority, priority_rules in groupby(sorted(self), lambda rule: rule.priority):
            sorted_rules = toposort_rules(list(priority_rules))  # Group by dependency graph toposort
            for rules_group in sorted_rules:
                rules_group = sorted(rules_group, key=self.index)  # Sort rules group based on initial ordering.
                # The group is logged at the highest log level of its rules.
                group_log_level = None
                for rule in rules_group:
                    if group_log_level is None or group_log_level < rule.log_level:
                        group_log_level = rule.log_level
                log(group_log_level, "%s independent rule(s) at priority %s.", len(rules_group), priority)
                for rule in rules_group:
                    when_response = execute_rule(rule, matches, context)
                    if when_response is not None:
                        ret.append((rule, when_response))

        return ret
+ :param rules: + :type rules: + :return: + :rtype: + """ + graph = {} + class_dict = {} + for rule in rules: + if rule.__class__ in class_dict: + raise ValueError("Duplicate class rules are not allowed: %s" % rule.__class__) + class_dict[rule.__class__] = rule + for rule in rules: + if not is_iterable(rule.dependency) and rule.dependency: + rule_dependencies = [rule.dependency] + else: + rule_dependencies = rule.dependency + dependencies = set() + if rule_dependencies: + for dependency in rule_dependencies: + if inspect.isclass(dependency): + dependency = class_dict.get(dependency) + if dependency: + dependencies.add(dependency) + graph[rule] = dependencies + return toposort(graph) + + + diff --git a/lib/rebulk/test/__init__.py b/lib/rebulk/test/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0ab48c94bc06fc5d0cbe9808646069afd5406c19 --- /dev/null +++ b/lib/rebulk/test/__init__.py @@ -0,0 +1,3 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# pylint: disable=no-self-use, pointless-statement, missing-docstring diff --git a/lib/rebulk/test/default_rules_module.py b/lib/rebulk/test/default_rules_module.py new file mode 100644 index 0000000000000000000000000000000000000000..533752fc60fd544056a9ded47e846456a130663d --- /dev/null +++ b/lib/rebulk/test/default_rules_module.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# pylint: disable=no-self-use, pointless-statement, missing-docstring, invalid-name +from ..match import Match +from ..rules import Rule, RemoveMatch, AppendMatch, RenameMatch, AppendTags, RemoveTags + + +class RuleRemove0(Rule): + consequence = RemoveMatch + def when(self, matches, context): + return matches[0] + + +class RuleAppend0(Rule): + consequence = AppendMatch() + def when(self, matches, context): + return Match(5, 10) + +class RuleRename0(Rule): + consequence = [RenameMatch('renamed')] + def when(self, matches, context): + return [Match(5, 10, name="original")] + +class RuleRemove1(Rule): 
+ consequence = [RemoveMatch()] + def when(self, matches, context): + return [matches[0]] + +class RuleAppend1(Rule): + consequence = [AppendMatch] + def when(self, matches, context): + return [Match(5, 10)] + +class RuleRename1(Rule): + consequence = RenameMatch('renamed') + def when(self, matches, context): + return [Match(5, 10, name="original")] + +class RuleAppend2(Rule): + consequence = [AppendMatch('renamed')] + properties = {'renamed': [None]} + def when(self, matches, context): + return [Match(5, 10)] + +class RuleRename2(Rule): + consequence = RenameMatch('renamed') + def when(self, matches, context): + return Match(5, 10, name="original") + +class RuleAppend3(Rule): + consequence = AppendMatch('renamed') + properties = {'renamed': [None]} + def when(self, matches, context): + return [Match(5, 10)] + +class RuleRename3(Rule): + consequence = [RenameMatch('renamed')] + def when(self, matches, context): + return Match(5, 10, name="original") + +class RuleAppendTags0(Rule): + consequence = AppendTags(['new-tag']) + def when(self, matches, context): + return matches.named('tags', 0) + +class RuleRemoveTags0(Rule): + consequence = RemoveTags(['new-tag']) + def when(self, matches, context): + return matches.named('tags', 0) + +class RuleAppendTags1(Rule): + consequence = AppendTags(['new-tag']) + def when(self, matches, context): + return matches.named('tags') + +class RuleRemoveTags1(Rule): + consequence = RemoveTags(['new-tag']) + def when(self, matches, context): + return matches.named('tags') + diff --git a/lib/rebulk/test/rebulk_rules_module.py b/lib/rebulk/test/rebulk_rules_module.py new file mode 100644 index 0000000000000000000000000000000000000000..0bd5ef33a18e8e95fe4fb989f9e14df39692d57a --- /dev/null +++ b/lib/rebulk/test/rebulk_rules_module.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# pylint: disable=no-self-use, pointless-statement, missing-docstring, invalid-name +from rebulk.rules import Rule, RemoveMatch, CustomRule + + 
+class RemoveAllButLastYear(Rule): + consequence = RemoveMatch + def when(self, matches, context): + entries = matches.named('year') + return entries[:-1] + + +class PrefixedSuffixedYear(CustomRule): + def when(self, matches, context): + toRemove = [] + years = matches.named('year') + for year in years: + if not matches.previous(year, lambda p: p.name == 'yearPrefix') and \ + not matches.next(year, lambda n: n.name == 'yearSuffix'): + toRemove.append(year) + return toRemove + + def then(self, matches, when_response, context): + for to_remove in when_response: + matches.remove(to_remove) + + +class PrefixedSuffixedYearNoLambda(Rule): + consequence = RemoveMatch + def when(self, matches, context): + toRemove = [] + years = matches.named('year') + for year in years: + if not [m for m in matches.previous(year) if m.name == 'yearPrefix'] and \ + not [m for m in matches.next(year) if m.name == 'yearSuffix']: + toRemove.append(year) + return toRemove diff --git a/lib/rebulk/test/rules_module.py b/lib/rebulk/test/rules_module.py new file mode 100644 index 0000000000000000000000000000000000000000..887b81da8637c39f0acd303490b1d750a08dd930 --- /dev/null +++ b/lib/rebulk/test/rules_module.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# pylint: disable=no-self-use, pointless-statement, missing-docstring, invalid-name +from ..match import Match +from ..rules import Rule + + +class Rule3(Rule): + def when(self, matches, context): + return context.get('when') + + def then(self, matches, when_response, context): + assert when_response in [True, False] + matches.append(Match(3, 4)) + + +class Rule2(Rule): + dependency = Rule3 + + def when(self, matches, context): + return True + + def then(self, matches, when_response, context): + assert when_response + matches.append(Match(3, 4)) + + +class Rule1(Rule): + dependency = Rule2 + + def when(self, matches, context): + return True + + def then(self, matches, when_response, context): + assert when_response + 
matches.clear() + + +class Rule0(Rule): + dependency = Rule1 + + def when(self, matches, context): + return True + + def then(self, matches, when_response, context): + assert when_response + matches.append(Match(3, 4)) + + +class Rule1Disabled(Rule1): + name = "Disabled Rule1" + + def enabled(self, context): + return False diff --git a/lib/rebulk/test/test_debug.py b/lib/rebulk/test/test_debug.py new file mode 100644 index 0000000000000000000000000000000000000000..a35f95fdf345afc04d7c63baf21d2a3e15430e44 --- /dev/null +++ b/lib/rebulk/test/test_debug.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# pylint: disable=no-self-use, pointless-statement, missing-docstring, protected-access, invalid-name + +from ..pattern import StringPattern +from ..rebulk import Rebulk +from ..match import Match +from .. import debug +from .default_rules_module import RuleRemove0 + + +class TestDebug(object): + + + #request.addfinalizer(disable_debug) + + + + debug.DEBUG = True + pattern = StringPattern(1, 3, value="es") + + match = Match(1, 3, value="es") + rule = RuleRemove0() + + input_string = "This is a debug test" + rebulk = Rebulk().string("debug") \ + .string("is") + + matches = rebulk.matches(input_string) + debug.DEBUG = False + + @classmethod + def setup_class(cls): + debug.DEBUG = True + + @classmethod + def teardown_class(cls): + debug.DEBUG = False + + def test_pattern(self): + assert self.pattern.defined_at.lineno == 20 + assert self.pattern.defined_at.name == 'rebulk.test.test_debug' + assert self.pattern.defined_at.filename.endswith('test_debug.py') + + assert str(self.pattern.defined_at) == 'test_debug.py#L20' + assert repr(self.pattern) == '<StringPattern@test_debug.py#L20:(1, 3)>' + + def test_match(self): + assert self.match.defined_at.lineno == 22 + assert self.match.defined_at.name == 'rebulk.test.test_debug' + assert self.match.defined_at.filename.endswith('test_debug.py') + + assert str(self.match.defined_at) == 'test_debug.py#L22' + + def 
test_rule(self): + assert self.rule.defined_at.lineno == 23 + assert self.rule.defined_at.name == 'rebulk.test.test_debug' + assert self.rule.defined_at.filename.endswith('test_debug.py') + + assert str(self.rule.defined_at) == 'test_debug.py#L23' + assert repr(self.rule) == '<RuleRemove0@test_debug.py#L23>' + + def test_rebulk(self): + """ + This test fails on travis CI, can't find out why there's 1 line offset ... + """ + assert self.rebulk._patterns[0].defined_at.lineno in [26, 27] + assert self.rebulk._patterns[0].defined_at.name == 'rebulk.test.test_debug' + assert self.rebulk._patterns[0].defined_at.filename.endswith('test_debug.py') + + assert str(self.rebulk._patterns[0].defined_at) in ['test_debug.py#L26', 'test_debug.py#L27'] + + assert self.rebulk._patterns[1].defined_at.lineno in [27, 28] + assert self.rebulk._patterns[1].defined_at.name == 'rebulk.test.test_debug' + assert self.rebulk._patterns[1].defined_at.filename.endswith('test_debug.py') + + assert str(self.rebulk._patterns[1].defined_at) in ['test_debug.py#L27', 'test_debug.py#L28'] + + assert self.matches[0].defined_at == self.rebulk._patterns[0].defined_at + assert self.matches[1].defined_at == self.rebulk._patterns[1].defined_at + + def test_repr(self): + str(self.matches) diff --git a/lib/rebulk/test/test_introspector.py b/lib/rebulk/test/test_introspector.py new file mode 100644 index 0000000000000000000000000000000000000000..24c0c5001a2e9f9692af28bf5a16bde8948b2332 --- /dev/null +++ b/lib/rebulk/test/test_introspector.py @@ -0,0 +1,138 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Introspector tests +""" +# pylint: disable=no-self-use,pointless-statement,missing-docstring,protected-access,invalid-name +from ..rebulk import Rebulk +from .. 
import introspector +from .default_rules_module import RuleAppend2, RuleAppend3 + + +def test_string_introspector(): + rebulk = Rebulk().string('One', 'Two', 'Three', name='first').string('1', '2', '3', name='second') + + introspected = introspector.introspect(rebulk, None) + + assert len(introspected.patterns) == 2 + + first_properties = introspected.patterns[0].properties + assert len(first_properties) == 1 + first_properties['first'] == ['One', 'Two', 'Three'] + + second_properties = introspected.patterns[1].properties + assert len(second_properties) == 1 + second_properties['second'] == ['1', '2', '3'] + + properties = introspected.properties + assert len(properties) == 2 + assert properties['first'] == first_properties['first'] + assert properties['second'] == second_properties['second'] + + +def test_string_properties(): + rebulk = Rebulk()\ + .string('One', 'Two', 'Three', name='first', properties={'custom': ['One']})\ + .string('1', '2', '3', name='second', properties={'custom': [1]}) + + introspected = introspector.introspect(rebulk, None) + + assert len(introspected.patterns) == 2 + assert len(introspected.rules) == 2 + + first_properties = introspected.patterns[0].properties + assert len(first_properties) == 1 + first_properties['custom'] == ['One'] + + second_properties = introspected.patterns[1].properties + assert len(second_properties) == 1 + second_properties['custom'] == [1] + + properties = introspected.properties + assert len(properties) == 1 + assert properties['custom'] == ['One', 1] + + +def test_various_pattern(): + rebulk = Rebulk()\ + .regex('One', 'Two', 'Three', name='first', value="string") \ + .string('1', '2', '3', name='second', value="digit") \ + .string('4', '5', '6', name='third') \ + .string('private', private=True) \ + .functional(lambda string: (0, 5), name='func', value='test') \ + .regex('One', 'Two', 'Three', name='regex_name') \ + .regex('(?P<one>One)(?P<two>Two)(?P<three>Three)') \ + .functional(lambda string: (6, 10), 
name='func2') \ + .string('7', name='third') + + introspected = introspector.introspect(rebulk, None) + + assert len(introspected.patterns) == 8 + assert len(introspected.rules) == 2 + + first_properties = introspected.patterns[0].properties + assert len(first_properties) == 1 + first_properties['first'] == ['string'] + + second_properties = introspected.patterns[1].properties + assert len(second_properties) == 1 + second_properties['second'] == ['digit'] + + third_properties = introspected.patterns[2].properties + assert len(third_properties) == 1 + third_properties['third'] == ['4', '5', '6'] + + func_properties = introspected.patterns[3].properties + assert len(func_properties) == 1 + func_properties['func'] == ['test'] + + regex_name_properties = introspected.patterns[4].properties + assert len(regex_name_properties) == 1 + regex_name_properties['regex_name'] == [None] + + regex_groups_properties = introspected.patterns[5].properties + assert len(regex_groups_properties) == 3 + regex_groups_properties['one'] == [None] + regex_groups_properties['two'] == [None] + regex_groups_properties['three'] == [None] + + func2_properties = introspected.patterns[6].properties + assert len(func2_properties) == 1 + func2_properties['func2'] == [None] + + append_third_properties = introspected.patterns[7].properties + assert len(append_third_properties) == 1 + append_third_properties['third'] == [None] + + properties = introspected.properties + assert len(properties) == 9 + assert properties['first'] == first_properties['first'] + assert properties['second'] == second_properties['second'] + assert properties['third'] == third_properties['third'] + append_third_properties['third'] + assert properties['func'] == func_properties['func'] + assert properties['regex_name'] == regex_name_properties['regex_name'] + assert properties['one'] == regex_groups_properties['one'] + assert properties['two'] == regex_groups_properties['two'] + assert properties['three'] == 
regex_groups_properties['three'] + assert properties['func2'] == func2_properties['func2'] + + +def test_rule_properties(): + rebulk = Rebulk(default_rules=False).rules(RuleAppend2, RuleAppend3) + + introspected = introspector.introspect(rebulk, None) + + assert len(introspected.rules) == 2 + assert len(introspected.patterns) == 0 + + rule_properties = introspected.rules[0].properties + assert len(rule_properties) == 1 + assert rule_properties['renamed'] == [None] + + rule_properties = introspected.rules[1].properties + assert len(rule_properties) == 1 + assert rule_properties['renamed'] == [None] + + properties = introspected.properties + assert len(properties) == 1 + assert properties['renamed'] == [None] diff --git a/lib/rebulk/test/test_loose.py b/lib/rebulk/test/test_loose.py new file mode 100644 index 0000000000000000000000000000000000000000..bc0c6bca121de1c51624ac0d015da1bf53ec0fa5 --- /dev/null +++ b/lib/rebulk/test/test_loose.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# pylint: disable=no-self-use, pointless-statement, missing-docstring, invalid-name + +from ..loose import call + + +def test_loose_function(): + + def func(v1, v2, v3=3, v4=4): + return v1 + v2 + v3 + v4 + + assert call(func, 1, 2) == func(1, 2) + assert call(func, 1, 2, 3, 5) == func(1, 2, 3, 5) + assert call(func, 1, 2, v3=4, v4=5) == func(1, 2, v3=4, v4=5) + assert call(func, 1, 2, 3, 4, 5) == func(1, 2, 3, 4) + assert call(func, 1, 2, 3, 4, more=5) == func(1, 2, 3, 4) + + +def test_loose_varargs_function(): + def func(v1, v2, *args): + return v1 + v2 + args[0] if len(args) > 0 else 3 + args[1] if len(args) > 1 else 4 + + assert call(func, 1, 2) == func(1, 2) + assert call(func, 1, 2, 3, 5) == func(1, 2, 3, 5) + assert call(func, 1, 2, 3, 4, 5) == func(1, 2, 3, 4) + + +def test_loose_kwargs_function(): + def func(v1, v2, **kwargs): + return v1 + v2 + kwargs.get('v3', 3) + kwargs.get('v4', 4) + + assert call(func, v1=1, v2=2) == func(v1=1, v2=2) + assert 
call(func, v1=1, v2=2, v3=3, v4=5) == func(v1=1, v2=2, v3=3, v4=5) + + +def test_loose_class(): + class Dummy(object): + def __init__(self, v1, v2, v3=3, v4=4): + self.v1 = v1 + self.v2 = v2 + self.v3 = v3 + self.v4 = v4 + + def call(self): + return self.v1 + self.v2 + self.v3 + self.v4 + + assert call(Dummy, 1, 2).call() == Dummy(1, 2).call() + assert call(Dummy, 1, 2, 3, 5).call() == Dummy(1, 2, 3, 5).call() + assert call(Dummy, 1, 2, v3=4, v4=5).call() == Dummy(1, 2, v3=4, v4=5).call() + assert call(Dummy, 1, 2, 3, 4, 5).call() == Dummy(1, 2, 3, 4).call() + assert call(Dummy, 1, 2, 3, 4, more=5).call() == Dummy(1, 2, 3, 4).call() + + +def test_loose_varargs_class(): + class Dummy(object): + def __init__(self, v1, v2, *args): + self.v1 = v1 + self.v2 = v2 + self.v3 = args[0] if len(args) > 0 else 3 + self.v4 = args[1] if len(args) > 1 else 4 + + def call(self): + return self.v1 + self.v2 + self.v3 + self.v4 + + assert call(Dummy, 1, 2).call() == Dummy(1, 2).call() + assert call(Dummy, 1, 2, 3, 5).call() == Dummy(1, 2, 3, 5).call() + assert call(Dummy, 1, 2, 3, 4, 5).call() == Dummy(1, 2, 3, 4).call() + + +def test_loose_kwargs_class(): + class Dummy(object): + def __init__(self, v1, v2, **kwargs): + self.v1 = v1 + self.v2 = v2 + self.v3 = kwargs.get('v3', 3) + self.v4 = kwargs.get('v4', 4) + + def call(self): + return self.v1 + self.v2 + self.v3 + self.v4 + + assert call(Dummy, v1=1, v2=2).call() == Dummy(v1=1, v2=2).call() + assert call(Dummy, v1=1, v2=2, v3=3, v4=5).call() == Dummy(v1=1, v2=2, v3=3, v4=5).call() diff --git a/lib/rebulk/test/test_match.py b/lib/rebulk/test/test_match.py new file mode 100644 index 0000000000000000000000000000000000000000..c87311d1a6a81bf35bb6f59a536220d426426518 --- /dev/null +++ b/lib/rebulk/test/test_match.py @@ -0,0 +1,571 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# pylint: disable=no-self-use, pointless-statement, missing-docstring, unneeded-not + +import pytest +import six + +from ..match import Match, Matches 
+from ..pattern import StringPattern, RePattern +from ..formatters import formatters + + +class TestMatchClass(object): + def test_repr(self): + match1 = Match(1, 3, value="es") + + assert repr(match1) == '<es:(1, 3)>' + + match2 = Match(0, 4, value="test", private=True, name="abc", tags=['one', 'two']) + + assert repr(match2) == '<test:(0, 4)+private+name=abc+tags=[\'one\', \'two\']>' + + def test_names(self): + parent = Match(0, 10, name="test") + parent.children.append(Match(0, 10, name="child1", parent=parent)) + parent.children.append(Match(0, 10, name="child2", parent=parent)) + + assert set(parent.names) == set(["child1", "child2"]) + + def test_equality(self): + match1 = Match(1, 3, value="es") + match2 = Match(1, 3, value="es") + + other = object() + + assert hash(match1) == hash(match2) + assert hash(match1) != hash(other) + + assert match1 == match2 + assert not match1 == other + + def test_inequality(self): + match1 = Match(0, 2, value="te") + match2 = Match(2, 4, value="st") + match3 = Match(0, 2, value="other") + + other = object() + + assert hash(match1) != hash(match2) + assert hash(match1) != hash(match3) + + assert match1 != other + assert match1 != match2 + assert match1 != match3 + + def test_length(self): + match1 = Match(0, 4, value="test") + match2 = Match(0, 2, value="spanIsUsed") + + assert len(match1) == 4 + assert len(match2) == 2 + + def test_compare(self): + match1 = Match(0, 2, value="te") + match2 = Match(2, 4, value="st") + + other = object() + + assert match1 < match2 + assert match1 <= match2 + + assert match2 > match1 + assert match2 >= match1 + + if six.PY3: + with pytest.raises(TypeError): + match1 < other + + with pytest.raises(TypeError): + match1 <= other + + with pytest.raises(TypeError): + match1 > other + + with pytest.raises(TypeError): + match1 >= other + else: + assert match1 < other + assert match1 <= other + assert not match1 > other + assert not match1 >= other + + def test_value(self): + match1 = Match(1, 3) + 
match1.value = "test" + + assert match1.value == "test" + + +class TestMatchesClass(object): + match1 = Match(0, 2, value="te", name="start") + match2 = Match(2, 3, value="s", tags="tag1") + match3 = Match(3, 4, value="t", tags=["tag1", "tag2"]) + match4 = Match(2, 4, value="st", name="end") + + def test_tag(self): + matches = Matches() + matches.append(self.match1) + matches.append(self.match2) + matches.append(self.match3) + matches.append(self.match4) + + assert "start" in matches.names + assert "end" in matches.names + + assert "tag1" in matches.tags + assert "tag2" in matches.tags + + tag1 = matches.tagged("tag1") + assert len(tag1) == 2 + assert tag1[0] == self.match2 + assert tag1[1] == self.match3 + + tag2 = matches.tagged("tag2") + assert len(tag2) == 1 + assert tag2[0] == self.match3 + + start = matches.named("start") + assert len(start) == 1 + assert start[0] == self.match1 + + end = matches.named("end") + assert len(end) == 1 + assert end[0] == self.match4 + + def test_base(self): + matches = Matches() + matches.append(self.match1) + + assert len(matches) == 1 + assert repr(matches) == repr([self.match1]) + assert list(matches.starting(0)) == [self.match1] + assert list(matches.ending(2)) == [self.match1] + + matches.append(self.match2) + matches.append(self.match3) + matches.append(self.match4) + + assert len(matches) == 4 + assert list(matches.starting(2)) == [self.match2, self.match4] + assert list(matches.starting(3)) == [self.match3] + assert list(matches.ending(3)) == [self.match2] + assert list(matches.ending(4)) == [self.match3, self.match4] + assert list(matches.range()) == [self.match1, self.match2, self.match4, self.match3] + assert list(matches.range(0)) == [self.match1, self.match2, self.match4, self.match3] + assert list(matches.range(0, 3)) == [self.match1, self.match2, self.match4] + assert list(matches.range(2, 3)) == [self.match2, self.match4] + assert list(matches.range(3, 4)) == [self.match4, self.match3] + + 
matches.remove(self.match1) + assert len(matches) == 3 + assert len(matches.starting(0)) == 0 + assert len(matches.ending(2)) == 0 + + matches.clear() + + assert len(matches) == 0 + assert len(matches.starting(0)) == 0 + assert len(matches.starting(2)) == 0 + assert len(matches.starting(3)) == 0 + assert len(matches.ending(2)) == 0 + assert len(matches.ending(3)) == 0 + assert len(matches.ending(4)) == 0 + + def test_get_slices(self): + matches = Matches() + matches.append(self.match1) + matches.append(self.match2) + matches.append(self.match3) + matches.append(self.match4) + + slice_matches = matches[1:3] + + assert isinstance(slice_matches, Matches) + + assert len(slice_matches) == 2 + assert slice_matches[0] == self.match2 + assert slice_matches[1] == self.match3 + + def test_remove_slices(self): + matches = Matches() + matches.append(self.match1) + matches.append(self.match2) + matches.append(self.match3) + matches.append(self.match4) + + del matches[1:3] + + assert len(matches) == 2 + assert matches[0] == self.match1 + assert matches[1] == self.match4 + + def test_set_slices(self): + matches = Matches() + matches.append(self.match1) + matches.append(self.match2) + matches.append(self.match3) + matches.append(self.match4) + + matches[1:3] = self.match1, self.match4 + + assert len(matches) == 4 + assert matches[0] == self.match1 + assert matches[1] == self.match1 + assert matches[2] == self.match4 + assert matches[3] == self.match4 + + def test_set_index(self): + matches = Matches() + matches.append(self.match1) + matches.append(self.match2) + matches.append(self.match3) + + matches[1] = self.match4 + + assert len(matches) == 3 + assert matches[0] == self.match1 + assert matches[1] == self.match4 + assert matches[2] == self.match3 + + def test_constructor(self): + matches = Matches([self.match1, self.match2, self.match3, self.match4]) + + assert len(matches) == 4 + assert list(matches.starting(0)) == [self.match1] + assert list(matches.ending(2)) == 
[self.match1] + assert list(matches.starting(2)) == [self.match2, self.match4] + assert list(matches.starting(3)) == [self.match3] + assert list(matches.ending(3)) == [self.match2] + assert list(matches.ending(4)) == [self.match3, self.match4] + + def test_constructor_kwargs(self): + matches = Matches([self.match1, self.match2, self.match3, self.match4], input_string="test") + + assert len(matches) == 4 + assert matches.input_string == "test" + assert list(matches.starting(0)) == [self.match1] + assert list(matches.ending(2)) == [self.match1] + assert list(matches.starting(2)) == [self.match2, self.match4] + assert list(matches.starting(3)) == [self.match3] + assert list(matches.ending(3)) == [self.match2] + assert list(matches.ending(4)) == [self.match3, self.match4] + + def test_crop(self): + input_string = "abcdefghijklmnopqrstuvwxyz" + + match1 = Match(1, 10, input_string=input_string) + match2 = Match(0, 2, input_string=input_string) + match3 = Match(8, 15, input_string=input_string) + + ret = match1.crop([match2, match3.span]) + + assert len(ret) == 1 + + assert ret[0].span == (2, 8) + assert ret[0].value == "cdefgh" + + ret = match1.crop((1, 10)) + assert len(ret) == 0 + + ret = match1.crop((1, 3)) + assert len(ret) == 1 + assert ret[0].span == (3, 10) + + ret = match1.crop((7, 10)) + assert len(ret) == 1 + assert ret[0].span == (1, 7) + + ret = match1.crop((0, 12)) + assert len(ret) == 0 + + ret = match1.crop((4, 6)) + assert len(ret) == 2 + + assert ret[0].span == (1, 4) + assert ret[1].span == (6, 10) + + ret = match1.crop([(3, 5), (7, 9)]) + assert len(ret) == 3 + + assert ret[0].span == (1, 3) + assert ret[1].span == (5, 7) + assert ret[2].span == (9, 10) + + def test_split(self): + input_string = "123 +word1 - word2 + word3 456" + match = Match(3, len(input_string) - 3, input_string=input_string) + splitted = match.split(" -+") + + assert len(splitted) == 3 + assert [split.value for split in splitted] == ["word1", "word2", "word3"] + + +class 
TestMaches(object): + def test_names(self): + input_string = "One Two Three" + + matches = Matches() + + matches.extend(StringPattern("One", name="1-str", tags=["One", "str"]).matches(input_string)) + matches.extend(RePattern("One", name="1-re", tags=["One", "re"]).matches(input_string)) + matches.extend(StringPattern("Two", name="2-str", tags=["Two", "str"]).matches(input_string)) + matches.extend(RePattern("Two", name="2-re", tags=["Two", "re"]).matches(input_string)) + matches.extend(StringPattern("Three", name="3-str", tags=["Three", "str"]).matches(input_string)) + matches.extend(RePattern("Three", name="3-re", tags=["Three", "re"]).matches(input_string)) + + assert set(matches.names) == set(["1-str", "1-re", "2-str", "2-re", "3-str", "3-re"]) + + def test_filters(self): + input_string = "One Two Three" + + matches = Matches() + + matches.extend(StringPattern("One", name="1-str", tags=["One", "str"]).matches(input_string)) + matches.extend(RePattern("One", name="1-re", tags=["One", "re"]).matches(input_string)) + matches.extend(StringPattern("Two", name="2-str", tags=["Two", "str"]).matches(input_string)) + matches.extend(RePattern("Two", name="2-re", tags=["Two", "re"]).matches(input_string)) + matches.extend(StringPattern("Three", name="3-str", tags=["Three", "str"]).matches(input_string)) + matches.extend(RePattern("Three", name="3-re", tags=["Three", "re"]).matches(input_string)) + + selection = matches.starting(0) + assert len(selection) == 2 + + selection = matches.starting(0, lambda m: "str" in m.tags) + assert len(selection) == 1 + assert selection[0].pattern.name == "1-str" + + selection = matches.ending(7, predicate=lambda m: "str" in m.tags) + assert len(selection) == 1 + assert selection[0].pattern.name == "2-str" + + selection = matches.previous(matches.named("2-str")[0]) + assert len(selection) == 2 + assert selection[0].pattern.name == "1-str" + assert selection[1].pattern.name == "1-re" + + selection = matches.previous(matches.named("2-str", 
0), lambda m: "str" in m.tags) + assert len(selection) == 1 + assert selection[0].pattern.name == "1-str" + + selection = matches.next(matches.named("2-str", 0)) + assert len(selection) == 2 + assert selection[0].pattern.name == "3-str" + assert selection[1].pattern.name == "3-re" + + selection = matches.next(matches.named("2-str", 0), index=0, predicate=lambda m: "re" in m.tags) + assert selection is not None + assert selection.pattern.name == "3-re" + + selection = matches.next(matches.named("2-str", index=0), lambda m: "re" in m.tags) + assert len(selection) == 1 + assert selection[0].pattern.name == "3-re" + + selection = matches.named("2-str", lambda m: "re" in m.tags) + assert len(selection) == 0 + + selection = matches.named("2-re", lambda m: "re" in m.tags, 0) + assert selection is not None + assert selection.name == "2-re" # pylint:disable=no-member + + selection = matches.named("2-re", lambda m: "re" in m.tags) + assert len(selection) == 1 + assert selection[0].name == "2-re" + + selection = matches.named("2-re", lambda m: "re" in m.tags, index=1000) + assert selection is None + + def test_raw(self): + input_string = "0123456789" + + match = Match(0, 10, input_string=input_string, formatter=lambda s: s*2) + + assert match.value == match.raw * 2 + assert match.raw == input_string + + match.raw_end = 9 + match.raw_start = 1 + + assert match.value == match.raw * 2 + assert match.raw == input_string[1:9] + + match.raw_end = None + match.raw_start = None + + assert match.value == match.raw * 2 + assert match.raw == input_string + + + def test_formatter_chain(self): + input_string = "100" + + match = Match(0, 3, input_string=input_string, formatter=formatters(int, lambda s: s*2, lambda s: s+10)) + + assert match.raw == input_string + assert match.value == 100 * 2 + 10 + + + def test_to_dict(self): + input_string = "One Two Two Three" + + matches = Matches() + + matches.extend(StringPattern("One", name="1", tags=["One", "str"]).matches(input_string)) + 
matches.extend(RePattern("One", name="1", tags=["One", "re"]).matches(input_string)) + matches.extend(StringPattern("Two", name="2", tags=["Two", "str"]).matches(input_string)) + matches.extend(RePattern("Two", name="2", tags=["Two", "re"]).matches(input_string)) + matches.extend(RePattern("Two", name="2", tags=["Two", "reBis"]).matches(input_string)) + matches.extend(StringPattern("Three", name="3", tags=["Three", "str"]).matches(input_string)) + matches.extend(RePattern("Three", name="3bis", tags=["Three", "re"]).matches(input_string)) + matches.extend(RePattern(r"(\w+)", name="words").matches(input_string)) + + kvalues = matches.to_dict() + assert kvalues == {"1": "One", + "2": "Two", + "3": "Three", + "3bis": "Three", + "words": "One"} + assert kvalues.values_list["words"] == ["One", "Two", "Three"] + + kvalues = matches.to_dict(details=True, implicit=True) + assert kvalues["1"].value == "One" + + assert len(kvalues["2"]) == 2 + assert kvalues["2"][0].value == "Two" + assert kvalues["2"][1].value == "Two" + + assert kvalues["3"].value == "Three" + assert kvalues["3bis"].value == "Three" + + assert len(kvalues["words"]) == 4 + assert kvalues["words"][0].value == "One" + assert kvalues["words"][1].value == "Two" + assert kvalues["words"][2].value == "Two" + assert kvalues["words"][3].value == "Three" + + kvalues = matches.to_dict(details=True) + assert kvalues["1"].value == "One" + + assert len(kvalues.values_list["2"]) == 2 + assert kvalues.values_list["2"][0].value == "Two" + assert kvalues.values_list["2"][1].value == "Two" + + assert kvalues["3"].value == "Three" + assert kvalues["3bis"].value == "Three" + + assert len(kvalues.values_list["words"]) == 4 + assert kvalues.values_list["words"][0].value == "One" + assert kvalues.values_list["words"][1].value == "Two" + assert kvalues.values_list["words"][2].value == "Two" + assert kvalues.values_list["words"][3].value == "Three" + + def test_chains(self): + input_string = "wordX 10 20 30 40 wordA, wordB, wordC 70 
80 wordX" + + matches = Matches(input_string=input_string) + + matches.extend(RePattern(r"\d+", name="digit").matches(input_string)) + matches.extend(RePattern("[a-zA-Z]+", name="word").matches(input_string)) + + assert len(matches) == 11 + + a_start = input_string.find('wordA') + + b_start = input_string.find('wordB') + b_end = b_start + len('wordB') + + c_start = input_string.find('wordC') + c_end = c_start + len('wordC') + + chain_before = matches.chain_before(b_start, " ,", predicate=lambda match: match.name == "word") + assert len(chain_before) == 1 + assert chain_before[0].value == 'wordA' + + chain_before = matches.chain_before(Match(b_start, b_start), " ,", predicate=lambda match: match.name == "word") + assert len(chain_before) == 1 + assert chain_before[0].value == 'wordA' + + chain_before = matches.chain_before(b_start, " ,", predicate=lambda match: match.name == "digit") + assert len(chain_before) == 0 + + chain_before = matches.chain_before(a_start, " ,", predicate=lambda match: match.name == "digit") + assert len(chain_before) == 4 + assert [match.value for match in chain_before] == ["40", "30", "20", "10"] + + chain_after = matches.chain_after(b_end, " ,", predicate=lambda match: match.name == "word") + assert len(chain_after) == 1 + assert chain_after[0].value == 'wordC' + + chain_after = matches.chain_after(Match(b_end, b_end), " ,", predicate=lambda match: match.name == "word") + assert len(chain_after) == 1 + assert chain_after[0].value == 'wordC' + + chain_after = matches.chain_after(b_end, " ,", predicate=lambda match: match.name == "digit") + assert len(chain_after) == 0 + + chain_after = matches.chain_after(c_end, " ,", predicate=lambda match: match.name == "digit") + assert len(chain_after) == 2 + assert [match.value for match in chain_after] == ["70", "80"] + + chain_after = matches.chain_after(c_end, " ,", end=10000, predicate=lambda match: match.name == "digit") + assert len(chain_after) == 2 + assert [match.value for match in 
chain_after] == ["70", "80"] + + def test_holes(self): + input_string = '1'*10+'2'*10+'3'*10+'4'*10+'5'*10+'6'*10+'7'*10 + + hole1 = Match(0, 10, input_string=input_string) + hole2 = Match(20, 30, input_string=input_string) + hole3 = Match(30, 40, input_string=input_string) + hole4 = Match(60, 70, input_string=input_string) + + matches = Matches([hole1, hole2], input_string=input_string) + matches.append(hole3) + matches.append(hole4) + + holes = list(matches.holes()) + assert len(holes) == 2 + assert holes[0].span == (10, 20) + assert holes[0].value == '2'*10 + assert holes[1].span == (40, 60) + assert holes[1].value == '5' * 10 + '6' * 10 + + holes = list(matches.holes(5, 15)) + assert len(holes) == 1 + assert holes[0].span == (10, 15) + assert holes[0].value == '2'*5 + + holes = list(matches.holes(5, 15, formatter=lambda value: "formatted")) + assert len(holes) == 1 + assert holes[0].span == (10, 15) + assert holes[0].value == "formatted" + + holes = list(matches.holes(5, 15, predicate=lambda hole: False)) + assert len(holes) == 0 + + def test_holes_empty(self): + input_string = "Test hole on empty matches" + matches = Matches(input_string=input_string) + holes = matches.holes() + assert len(holes) == 1 + assert holes[0].value == input_string + + def test_holes_seps(self): + input_string = "Test hole - with many separators + included" + match = StringPattern("many").matches(input_string) + + matches = Matches(match, input_string) + holes = matches.holes() + + assert len(holes) == 2 + + holes = matches.holes(seps="-+") + + assert len(holes) == 4 + assert [hole.value for hole in holes] == ["Test hole ", " with ", " separators ", " included"] + + + + + + diff --git a/lib/rebulk/test/test_pattern.py b/lib/rebulk/test/test_pattern.py new file mode 100644 index 0000000000000000000000000000000000000000..0316aabaf39c315a6ff90f1453fe2b97fd39fe3b --- /dev/null +++ b/lib/rebulk/test/test_pattern.py @@ -0,0 +1,757 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# 
pylint: disable=no-self-use, pointless-statement, missing-docstring, unbalanced-tuple-unpacking + +import re +import pytest + +from ..pattern import StringPattern, RePattern, FunctionalPattern, REGEX_AVAILABLE +from ..match import Match + +class TestStringPattern(object): + """ + Tests for StringPattern matching + """ + + input_string = "An Abyssinian fly playing a Celtic violin was annoyed by trashy flags on " \ + "which were the Hebrew letter qoph." + + def test_single(self): + pattern = StringPattern("Celtic") + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 + assert isinstance(matches[0], Match) + assert matches[0].pattern == pattern + assert matches[0].span == (28, 34) + assert matches[0].value == "Celtic" + + def test_repr(self): + pattern = StringPattern("Celtic") + + assert repr(pattern) == '<StringPattern:(\'Celtic\',)>' + + def test_ignore_case(self): + pattern = StringPattern("celtic", ignore_case=False) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 0 + + pattern = StringPattern("celtic", ignore_case=True) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 + assert matches[0].value == "Celtic" + + def test_private_names(self): + pattern = StringPattern("celtic", name="test", private_names=["test"], ignore_case=True) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 + assert matches[0].private + + def test_no_match(self): + pattern = StringPattern("Python") + + matches = list(pattern.matches(self.input_string)) + assert not matches + + def test_multiple_patterns(self): + pattern = StringPattern("playing", "annoyed", "Hebrew") + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 3 + + assert isinstance(matches[0], Match) + assert matches[0].pattern == pattern + assert matches[0].span == (18, 25) + assert matches[0].value == "playing" + + assert isinstance(matches[1], Match) + assert matches[1].pattern 
== pattern + assert matches[1].span == (46, 53) + assert matches[1].value == "annoyed" + + assert isinstance(matches[2], Match) + assert matches[2].pattern == pattern + assert matches[2].span == (88, 94) + assert matches[2].value == "Hebrew" + + def test_start_end_kwargs(self): + pattern = StringPattern("Abyssinian", start=20, end=40) + matches = list(pattern.matches(self.input_string)) + + assert len(matches) == 0 + + def test_matches_kwargs(self): + pattern = StringPattern("Abyssinian", name="test", value="AB") + matches = list(pattern.matches(self.input_string)) + + assert len(matches) == 1 + assert matches[0].name == "test" + assert matches[0].value == "AB" + + +class TestRePattern(object): + """ + Tests for RePattern matching + """ + + input_string = "An Abyssinian fly playing a Celtic violin was annoyed by trashy flags on " \ + "which were the Hebrew letter qoph." + + def test_single_compiled(self): + pattern = RePattern(re.compile("Celt.?c")) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 + assert isinstance(matches[0], Match) + assert matches[0].pattern == pattern + assert matches[0].span == (28, 34) + assert matches[0].value == "Celtic" + + def test_single_string(self): + pattern = RePattern("Celt.?c") + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 + assert isinstance(matches[0], Match) + assert matches[0].pattern == pattern + assert matches[0].span == (28, 34) + assert matches[0].value == "Celtic" + + def test_single_kwargs(self): + pattern = RePattern({"pattern": "celt.?c", "flags": re.IGNORECASE}) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 + assert isinstance(matches[0], Match) + assert matches[0].pattern == pattern + assert matches[0].span == (28, 34) + assert matches[0].value == "Celtic" + + def test_single_vargs(self): + pattern = RePattern(("celt.?c", re.IGNORECASE)) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) 
== 1 + assert isinstance(matches[0], Match) + assert matches[0].pattern == pattern + assert matches[0].span == (28, 34) + assert matches[0].value == "Celtic" + + def test_no_match(self): + pattern = RePattern("abc.?def") + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 0 + + def test_shortcuts(self): + pattern = RePattern("Celtic-violin", abbreviations=[("-", r"[\W_]+")]) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 + + pattern = RePattern({"pattern": "celtic-violin", "flags": re.IGNORECASE}, abbreviations=[("-", r"[\W_]+")]) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 + + def test_multiple_patterns(self): + pattern = RePattern("pla.?ing", "ann.?yed", "Heb.?ew") + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 3 + + assert isinstance(matches[0], Match) + assert matches[0].pattern == pattern + assert matches[0].span == (18, 25) + assert matches[0].value == "playing" + + assert isinstance(matches[1], Match) + assert matches[1].pattern == pattern + assert matches[1].span == (46, 53) + assert matches[1].value == "annoyed" + + assert isinstance(matches[2], Match) + assert matches[2].pattern == pattern + assert matches[2].span == (88, 94) + assert matches[2].value == "Hebrew" + + def test_unnamed_groups(self): + pattern = RePattern(r"(Celt.?c)\s+(\w+)") + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 + + parent = matches[0] + + assert isinstance(parent, Match) + assert parent.pattern == pattern + assert parent.span == (28, 41) + assert parent.name is None + assert parent.value == "Celtic violin" + + assert len(parent.children) == 2 + + group1, group2 = parent.children + + assert isinstance(group1, Match) + assert group1.pattern == pattern + assert group1.span == (28, 34) + assert group1.name is None + assert group1.value == "Celtic" + assert group1.parent == parent + + assert isinstance(group2, Match) + 
assert group2.pattern == pattern + assert group2.span == (35, 41) + assert group2.name is None + assert group2.value == "violin" + assert group2.parent == parent + + def test_named_groups(self): + pattern = RePattern(r"(?P<param1>Celt.?c)\s+(?P<param2>\w+)") + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 + + parent = matches[0] + + assert isinstance(parent, Match) + assert parent.pattern == pattern + assert parent.span == (28, 41) + assert parent.name is None + assert parent.value == "Celtic violin" + + assert len(parent.children) == 2 + group1, group2 = parent.children + + assert isinstance(group1, Match) + assert group1.pattern == pattern + assert group1.span == (28, 34) + assert group1.name == "param1" + assert group1.value == "Celtic" + assert group1.parent == parent + + assert isinstance(group2, Match) + assert group2.pattern == pattern + assert group2.span == (35, 41) + assert group2.name == "param2" + assert group2.value == "violin" + assert group2.parent == parent + + def test_children(self): + pattern = RePattern(r"(?P<param1>Celt.?c)\s+(?P<param2>\w+)", children=True) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 2 + group1, group2 = matches + + assert isinstance(group1, Match) + assert group1.pattern == pattern + assert group1.span == (28, 34) + assert group1.name == "param1" + assert group1.value == "Celtic" + + assert isinstance(group2, Match) + assert group2.pattern == pattern + assert group2.span == (35, 41) + assert group2.name == "param2" + assert group2.value == "violin" + + def test_children_parent_private(self): + pattern = RePattern(r"(?P<param1>Celt.?c)\s+(?P<param2>\w+)", children=True, private_parent=True) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 3 + parent, group1, group2 = matches + + assert isinstance(group1, Match) + assert parent.private + assert parent.pattern == pattern + assert parent.span == (28, 41) + assert parent.name is 
None + assert parent.value == "Celtic violin" + + assert isinstance(group1, Match) + assert not group1.private + assert group1.pattern == pattern + assert group1.span == (28, 34) + assert group1.name == "param1" + assert group1.value == "Celtic" + + assert isinstance(group2, Match) + assert not group2.private + assert group2.pattern == pattern + assert group2.span == (35, 41) + assert group2.name == "param2" + assert group2.value == "violin" + + def test_parent_children_private(self): + pattern = RePattern(r"(?P<param1>Celt.?c)\s+(?P<param2>\w+)", private_children=True) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 3 + parent, group1, group2 = matches + + assert isinstance(group1, Match) + assert not parent.private + assert parent.pattern == pattern + assert parent.span == (28, 41) + assert parent.name is None + assert parent.value == "Celtic violin" + + assert isinstance(group1, Match) + assert group1.private + assert group1.pattern == pattern + assert group1.span == (28, 34) + assert group1.name == "param1" + assert group1.value == "Celtic" + + assert isinstance(group2, Match) + assert group2.private + assert group2.pattern == pattern + assert group2.span == (35, 41) + assert group2.name == "param2" + assert group2.value == "violin" + + def test_every(self): + pattern = RePattern(r"(?P<param1>Celt.?c)\s+(?P<param2>\w+)", every=True) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 3 + parent, group1, group2 = matches + + assert isinstance(group1, Match) + assert not parent.private + assert parent.pattern == pattern + assert parent.span == (28, 41) + assert parent.name is None + assert parent.value == "Celtic violin" + + assert isinstance(group1, Match) + assert not group1.private + assert group1.pattern == pattern + assert group1.span == (28, 34) + assert group1.name == "param1" + assert group1.value == "Celtic" + + assert isinstance(group2, Match) + assert not group2.private + assert group2.pattern 
== pattern + assert group2.span == (35, 41) + assert group2.name == "param2" + assert group2.value == "violin" + + def test_matches_kwargs(self): + pattern = RePattern("He.rew", name="test", value="HE") + matches = list(pattern.matches(self.input_string)) + + assert len(matches) == 1 + assert matches[0].name == "test" + assert matches[0].value == "HE" + + pattern = RePattern("H(e.)(rew)", name="test", value="HE") + matches = list(pattern.matches(self.input_string)) + + assert len(matches) == 1 + assert matches[0].name == "test" + assert matches[0].value == "HE" + + children = matches[0].children + assert len(children) == 2 + assert children[0].name is "test" + assert children[0].value == "HE" + + assert children[1].name is "test" + assert children[1].value == "HE" + + pattern = RePattern("H(?P<first>e.)(?P<second>rew)", name="test", value="HE") + matches = list(pattern.matches(self.input_string)) + + assert len(matches) == 1 + assert matches[0].name == "test" + assert matches[0].value == "HE" + + children = matches[0].children + assert len(children) == 2 + assert children[0].name == "first" + assert children[0].value == "HE" + + assert children[1].name == "second" + assert children[1].value == "HE" + + +class TestFunctionalPattern(object): + """ + Tests for FunctionalPattern matching + """ + + input_string = "An Abyssinian fly playing a Celtic violin was annoyed by trashy flags on " \ + "which were the Hebrew letter qoph." 
+ + def test_single_vargs(self): + def func(input_string): + i = input_string.find("fly") + if i > -1: + return i, i + len("fly"), "fly", "functional" + + pattern = FunctionalPattern(func) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 + assert isinstance(matches[0], Match) + assert matches[0].pattern == pattern + assert matches[0].span == (14, 17) + assert matches[0].name == "functional" + assert matches[0].value == "fly" + + def test_single_kwargs(self): + def func(input_string): + i = input_string.find("fly") + if i > -1: + return {"start": i, "end": i + len("fly"), "name": "functional"} + + pattern = FunctionalPattern(func) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 + assert isinstance(matches[0], Match) + assert matches[0].pattern == pattern + assert matches[0].span == (14, 17) + assert matches[0].name == "functional" + assert matches[0].value == "fly" + + def test_multiple_objects(self): + def func(input_string): + i = input_string.find("fly") + matches = [] + if i > -1: + matches.append((i, i + len("fly"), {'name': "functional"})) + i = input_string.find("annoyed") + if i > -1: + matches.append((i, i + len("annoyed"))) + i = input_string.find("Hebrew") + if i > -1: + matches.append({"start": i, "end": i + len("Hebrew")}) + return matches + + pattern = FunctionalPattern(func) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 3 + assert isinstance(matches[0], Match) + assert matches[0].pattern == pattern + assert matches[0].span == (14, 17) + assert matches[0].name == "functional" + assert matches[0].value == "fly" + + assert isinstance(matches[1], Match) + assert matches[1].pattern == pattern + assert matches[1].span == (46, 53) + assert matches[1].value == "annoyed" + + assert isinstance(matches[2], Match) + assert matches[2].pattern == pattern + assert matches[2].span == (88, 94) + assert matches[2].value == "Hebrew" + + def test_multiple_generator(self): 
+ def func(input_string): + i = input_string.find("fly") + if i > -1: + yield (i, i + len("fly"), {'name': "functional"}) + i = input_string.find("annoyed") + if i > -1: + yield (i, i + len("annoyed")) + i = input_string.find("Hebrew") + if i > -1: + yield (i, {"end": i + len("Hebrew")}) + + pattern = FunctionalPattern(func) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 3 + assert isinstance(matches[0], Match) + assert matches[0].pattern == pattern + assert matches[0].span == (14, 17) + assert matches[0].name == "functional" + assert matches[0].value == "fly" + + assert isinstance(matches[1], Match) + assert matches[1].pattern == pattern + assert matches[1].span == (46, 53) + assert matches[1].value == "annoyed" + + assert isinstance(matches[2], Match) + assert matches[2].pattern == pattern + assert matches[2].span == (88, 94) + assert matches[2].value == "Hebrew" + + def test_no_match(self): + pattern = FunctionalPattern(lambda x: None) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 0 + + def test_multiple_patterns(self): + def playing(input_string): + i = input_string.find("playing") + if i > -1: + return i, i + len("playing") + + def annoyed(input_string): + i = input_string.find("annoyed") + if i > -1: + return i, i + len("annoyed") + + def hebrew(input_string): + i = input_string.find("Hebrew") + if i > -1: + return i, i + len("Hebrew") + + pattern = FunctionalPattern(playing, annoyed, hebrew) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 3 + + assert isinstance(matches[0], Match) + assert matches[0].pattern == pattern + assert matches[0].span == (18, 25) + assert matches[0].value == "playing" + + assert isinstance(matches[1], Match) + assert matches[1].pattern == pattern + assert matches[1].span == (46, 53) + assert matches[1].value == "annoyed" + + assert isinstance(matches[2], Match) + assert matches[2].pattern == pattern + assert matches[2].span == (88, 94) 
+ assert matches[2].value == "Hebrew" + + def test_matches_kwargs(self): + def playing(input_string): + i = input_string.find("playing") + if i > -1: + return i, i + len("playing") + + pattern = FunctionalPattern(playing, name="test", value="PLAY") + matches = list(pattern.matches(self.input_string)) + + assert len(matches) == 1 + assert matches[0].name == "test" + assert matches[0].value == "PLAY" + + +class TestFormatter(object): + """ + Tests for formatter option + """ + + input_string = "This string contains 1849 a number" + + def test_single_string(self): + pattern = StringPattern("1849", name="dummy", formatter=lambda x: int(x) / 2) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 + assert isinstance(matches[0], Match) + assert matches[0].pattern == pattern + assert matches[0].span == (21, 25) + assert matches[0].value == 1849 / 2 + + def test_single_re_no_group(self): + pattern = RePattern(r"\d+", formatter=lambda x: int(x) * 2) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 + assert isinstance(matches[0], Match) + assert matches[0].pattern == pattern + assert matches[0].span == (21, 25) + assert matches[0].value == 1849 * 2 + + def test_single_re_named_groups(self): + pattern = RePattern(r"(?P<strParam>cont.?ins)\s+(?P<intParam>\d+)", + formatter={'intParam': lambda x: int(x) * 2, + 'strParam': lambda x: "really " + x}, format_all=True) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 + + parent = matches[0] + assert len(parent.children) == 2 + + group1, group2 = parent.children + + assert isinstance(group1, Match) + assert group1.pattern == pattern + assert group1.span == (12, 20) + assert group1.value == "really contains" + + assert isinstance(group2, Match) + assert group2.pattern == pattern + assert group2.span == (21, 25) + assert group2.value == 1849 * 2 + + def test_repeated_captures_option(self): + pattern = RePattern(r"\[(\d+)\](?:-(\d+))*") + + 
matches = list(pattern.matches("[02]-03-04-05-06")) + assert len(matches) == 1 + + match = matches[0] + if REGEX_AVAILABLE: + assert len(match.children) == 5 + assert [child.value for child in match.children] == ["02", "03", "04", "05", "06"] + else: + assert len(match.children) == 2 + assert [child.value for child in match.children] == ["02", "06"] + + with pytest.raises(NotImplementedError): + RePattern(r"\[(\d+)\](?:-(\d+))*", repeated_captures=True) + + pattern = RePattern(r"\[(\d+)\](?:-(\d+))*", repeated_captures=False) + + matches = list(pattern.matches("[02]-03-04-05-06")) + assert len(matches) == 1 + + match = matches[0] + assert len(match.children) == 2 + assert [child.value for child in match.children] == ["02", "06"] + + def test_single_functional(self): + def digit(input_string): + i = input_string.find("1849") + if i > -1: + return i, i + len("1849") + + pattern = FunctionalPattern(digit, formatter=lambda x: int(x) * 3) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 + assert isinstance(matches[0], Match) + assert matches[0].pattern == pattern + assert matches[0].span == (21, 25) + assert matches[0].value == 1849 * 3 + + +class TestValidator(object): + """ + Tests for validator option + """ + + input_string = "This string contains 1849 a number" + + @staticmethod + def true_validator(match): + return int(match.value) < 1850 + + @staticmethod + def false_validator(match): + return int(match.value) >= 1850 + + def test_single_string(self): + pattern = StringPattern("1849", name="dummy", validator=self.false_validator) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 0 + + pattern = StringPattern("1849", validator=self.true_validator) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 + + def test_single_re_no_group(self): + pattern = RePattern(r"\d+", validator=self.false_validator) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) 
== 0 + + pattern = RePattern(r"\d+", validator=self.true_validator) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 + + def test_single_re_named_groups(self): + pattern = RePattern(r"(?P<strParam>cont.?ins)\s+(?P<intParam>\d+)", + validator={'intParam': self.false_validator}, validate_all=True) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 0 + + pattern = RePattern(r"(?P<strParam>cont.?ins)\s+(?P<intParam>\d+)", + validator={'intParam': self.true_validator}, validate_all=True) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 + + def test_validate_all(self): + pattern = RePattern(r"contains (?P<intParam>\d+)", formatter=int, validator=lambda match: match.value < 100, + children=True) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 0 + + pattern = RePattern(r"contains (?P<intParam>\d+)", formatter=int, validator=lambda match: match.value > 100, + children=True) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 + + def invalid_func(match): + if match.name == 'intParam': + return True + else: + return match.value.startswith('abc') + + pattern = RePattern(r"contains (?P<intParam>\d+)", formatter=int, validator=invalid_func, validate_all=True, + children=True) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 0 + + def func(match): + if match.name == 'intParam': + return True + else: + return match.value.startswith('contains') + + pattern = RePattern(r"contains (?P<intParam>\d+)", formatter=int, validator=func, validate_all=True, + children=True) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 + + def test_format_all(self): + pattern = RePattern(r"contains (?P<intParam>\d+)", formatter=int, + children=True) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 + for match in matches: + assert match.value is not None + 
+ with pytest.raises(ValueError): + pattern = RePattern(r"contains (?P<intParam>\d+)", formatter=int, format_all=True) + matches = list(pattern.matches(self.input_string)) + for match in matches: + assert match.value is not None + + def test_single_functional(self): + def digit(input_string): + i = input_string.find("1849") + if i > -1: + return i, i + len("1849") + + pattern = FunctionalPattern(digit, validator=self.false_validator) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 0 + + pattern = FunctionalPattern(digit, validator=self.true_validator) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 + diff --git a/lib/rebulk/test/test_processors.py b/lib/rebulk/test/test_processors.py new file mode 100644 index 0000000000000000000000000000000000000000..099cc47dc222b66edb1897756ecdc200d96dc056 --- /dev/null +++ b/lib/rebulk/test/test_processors.py @@ -0,0 +1,215 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# pylint: disable=no-self-use, pointless-statement, missing-docstring, no-member + +from ..pattern import StringPattern, RePattern +from ..processors import ConflictSolver +from ..rules import execute_rule +from rebulk.match import Matches + + +def test_conflict_1(): + input_string = "abcdefghijklmnopqrstuvwxyz" + + pattern = StringPattern("ijklmn", "kl", "abcdef", "ab", "ef", "yz") + matches = Matches(pattern.matches(input_string)) + + execute_rule(ConflictSolver(), matches, None) + + values = [x.value for x in matches] + + assert values == ["ijklmn", "abcdef", "yz"] + + +def test_conflict_2(): + input_string = "abcdefghijklmnopqrstuvwxyz" + + pattern = StringPattern("ijklmn", "jklmnopqrst") + matches = Matches(pattern.matches(input_string)) + + execute_rule(ConflictSolver(), matches, None) + + values = [x.value for x in matches] + + assert values == ["jklmnopqrst"] + + +def test_conflict_3(): + input_string = "abcdefghijklmnopqrstuvwxyz" + + pattern = StringPattern("ijklmnopqrst", 
"jklmnopqrst") + matches = Matches(pattern.matches(input_string)) + + execute_rule(ConflictSolver(), matches, None) + + values = [x.value for x in matches] + + assert values == ["ijklmnopqrst"] + + +def test_conflict_4(): + input_string = "123456789" + + pattern = StringPattern("123", "456789") + matches = Matches(pattern.matches(input_string)) + + execute_rule(ConflictSolver(), matches, None) + + values = [x.value for x in matches] + assert values == ["123", "456789"] + + +def test_conflict_5(): + input_string = "123456789" + + pattern = StringPattern("123456", "789") + matches = Matches(pattern.matches(input_string)) + + execute_rule(ConflictSolver(), matches, None) + + values = [x.value for x in matches] + assert values == ["123456", "789"] + + +def test_prefer_longer_parent(): + input_string = "xxx.1x02.xxx" + + re1 = RePattern("([0-9]+)x([0-9]+)", name='prefer', children=True, formatter=int) + re2 = RePattern("x([0-9]+)", name='skip', children=True) + + matches = Matches(re1.matches(input_string)) + matches.extend(re2.matches(input_string)) + + execute_rule(ConflictSolver(), matches, None) + assert len(matches) == 2 + assert matches[0].value == 1 + assert matches[1].value == 2 + + +def test_conflict_solver_1(): + input_string = "123456789" + + re1 = StringPattern("2345678", conflict_solver=lambda match, conflicting: '__default__') + re2 = StringPattern("34567") + + matches = Matches(re1.matches(input_string)) + matches.extend(re2.matches(input_string)) + + execute_rule(ConflictSolver(), matches, None) + assert len(matches) == 1 + assert matches[0].value == "2345678" + + +def test_conflict_solver_2(): + input_string = "123456789" + + re1 = StringPattern("2345678", conflict_solver=lambda match, conflicting: '__default__') + re2 = StringPattern("34567", conflict_solver=lambda match, conflicting: conflicting) + + matches = Matches(re1.matches(input_string)) + matches.extend(re2.matches(input_string)) + + execute_rule(ConflictSolver(), matches, None) + assert 
len(matches) == 1 + assert matches[0].value == "34567" + + +def test_conflict_solver_3(): + input_string = "123456789" + + re1 = StringPattern("2345678", conflict_solver=lambda match, conflicting: match) + re2 = StringPattern("34567") + + matches = Matches(re1.matches(input_string)) + matches.extend(re2.matches(input_string)) + + execute_rule(ConflictSolver(), matches, None) + assert len(matches) == 1 + assert matches[0].value == "34567" + + +def test_conflict_solver_4(): + input_string = "123456789" + + re1 = StringPattern("2345678") + re2 = StringPattern("34567", conflict_solver=lambda match, conflicting: conflicting) + + matches = Matches(re1.matches(input_string)) + matches.extend(re2.matches(input_string)) + + execute_rule(ConflictSolver(), matches, None) + assert len(matches) == 1 + assert matches[0].value == "34567" + + +def test_conflict_solver_5(): + input_string = "123456789" + + re1 = StringPattern("2345678", conflict_solver=lambda match, conflicting: conflicting) + re2 = StringPattern("34567") + + matches = Matches(re1.matches(input_string)) + matches.extend(re2.matches(input_string)) + + execute_rule(ConflictSolver(), matches, None) + assert len(matches) == 1 + assert matches[0].value == "2345678" + + +def test_conflict_solver_6(): + input_string = "123456789" + + re1 = StringPattern("2345678") + re2 = StringPattern("34567", conflict_solver=lambda match, conflicting: conflicting) + + matches = Matches(re1.matches(input_string)) + matches.extend(re2.matches(input_string)) + + execute_rule(ConflictSolver(), matches, None) + assert len(matches) == 1 + assert matches[0].value == "34567" + + +def test_conflict_solver_7(): + input_string = "102" + + re1 = StringPattern("102") + re2 = StringPattern("02") + + matches = Matches(re2.matches(input_string)) + matches.extend(re1.matches(input_string)) + + execute_rule(ConflictSolver(), matches, None) + assert len(matches) == 1 + assert matches[0].value == "102" + + +def test_unresolved(): + input_string = 
"123456789" + + re1 = StringPattern("23456") + re2 = StringPattern("34567") + + matches = Matches(re1.matches(input_string)) + matches.extend(re2.matches(input_string)) + + execute_rule(ConflictSolver(), matches, None) + assert len(matches) == 2 + + re1 = StringPattern("34567") + re2 = StringPattern("2345678", conflict_solver=lambda match, conflicting: None) + + matches = Matches(re1.matches(input_string)) + matches.extend(re2.matches(input_string)) + + execute_rule(ConflictSolver(), matches, None) + assert len(matches) == 2 + + re1 = StringPattern("34567", conflict_solver=lambda match, conflicting: None) + re2 = StringPattern("2345678") + + matches = Matches(re1.matches(input_string)) + matches.extend(re2.matches(input_string)) + + execute_rule(ConflictSolver(), matches, None) + assert len(matches) == 2 diff --git a/lib/rebulk/test/test_rebulk.py b/lib/rebulk/test/test_rebulk.py new file mode 100644 index 0000000000000000000000000000000000000000..a29361ca9dfc5cf6af177430a4dae294dd5a07d8 --- /dev/null +++ b/lib/rebulk/test/test_rebulk.py @@ -0,0 +1,408 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# pylint: disable=no-self-use, pointless-statement, missing-docstring, no-member + +from ..rebulk import Rebulk +from ..rules import Rule +import rebulk.test.rebulk_rules_module as rm + + +def test_rebulk_simple(): + rebulk = Rebulk() + + rebulk.string("quick") + rebulk.regex("f.x") + + def func(input_string): + i = input_string.find("over") + if i > -1: + return i, i + len("over") + + rebulk.functional(func) + + input_string = "The quick brown fox jumps over the lazy dog" + + matches = rebulk.matches(input_string) + assert len(matches) == 3 + + assert matches[0].value == "quick" + assert matches[1].value == "fox" + assert matches[2].value == "over" + + +def test_rebulk_composition(): + rebulk = Rebulk() + + rebulk.string("quick") + rebulk.rebulk(Rebulk().regex("f.x")) + + rebulk.rebulk(Rebulk(disabled=lambda context: True).functional(lambda string: None)) + + 
input_string = "The quick brown fox jumps over the lazy dog" + + matches = rebulk.matches(input_string) + assert len(matches) == 2 + + assert matches[0].value == "quick" + assert matches[1].value == "fox" + + +def test_rebulk_context(): + rebulk = Rebulk() + + context = {'nostring': True, 'word': 'lazy'} + + rebulk.string("quick", disabled=lambda context: context.get('nostring', False)) + rebulk.regex("f.x", disabled=lambda context: context.get('noregex', False)) + + def func(input_string, context): + word = context.get('word', 'over') + i = input_string.find(word) + if i > -1: + return i, i + len(word) + + rebulk.functional(func) + + input_string = "The quick brown fox jumps over the lazy dog" + + matches = rebulk.matches(input_string, context) + assert len(matches) == 2 + + assert matches[0].value == "fox" + assert matches[1].value == "lazy" + + +def test_rebulk_prefer_longer(): + input_string = "The quick brown fox jumps over the lazy dog" + + matches = Rebulk().string("quick").string("own").regex("br.{2}n").matches(input_string) + + assert len(matches) == 2 + + assert matches[0].value == "quick" + assert matches[1].value == "brown" + + +def test_rebulk_defaults(): + input_string = "The quick brown fox jumps over the lazy dog" + + def func(input_string): + i = input_string.find("fox") + if i > -1: + return i, i + len("fox") + + matches = Rebulk()\ + .string_defaults(name="string", tags=["a", "b"])\ + .regex_defaults(name="regex") \ + .functional_defaults(name="functional") \ + .string("quick", tags=["c"])\ + .functional(func)\ + .regex("br.{2}n") \ + .matches(input_string) + assert matches[0].name == "string" + assert matches[0].tags == ["a", "b", "c"] + assert matches[1].name == "functional" + assert matches[2].name == "regex" + + matches = Rebulk() \ + .defaults(name="default", tags=["0"])\ + .string_defaults(name="string", tags=["a", "b"]) \ + .functional_defaults(name="functional", tags=["1"]) \ + .string("quick", tags=["c"]) \ + .functional(func) \ + 
.regex("br.{2}n") \ + .matches(input_string) + assert matches[0].name == "string" + assert matches[0].tags == ["0", "a", "b", "c"] + assert matches[1].name == "functional" + assert matches[1].tags == ["0", "1"] + assert matches[2].name == "default" + assert matches[2].tags == ["0"] + + +def test_rebulk_rebulk(): + input_string = "The quick brown fox jumps over the lazy dog" + + base = Rebulk().string("quick") + child = Rebulk().string("own").regex("br.{2}n") + + matches = base.rebulk(child).matches(input_string) + + assert len(matches) == 2 + + assert matches[0].value == "quick" + assert matches[1].value == "brown" + + +def test_rebulk_no_default(): + input_string = "The quick brown fox jumps over the lazy dog" + + matches = Rebulk(default_rules=False).string("quick").string("own").regex("br.{2}n").matches(input_string) + + assert len(matches) == 3 + + assert matches[0].value == "quick" + assert matches[1].value == "own" + assert matches[2].value == "brown" + + +def test_rebulk_tags_names(): + rebulk = Rebulk() + + rebulk.string("quick", name="str", tags=["first", "other"]) + rebulk.regex("f.x", tags="other") + + def func(input_string): + i = input_string.find("over") + if i > -1: + return i, i + len("over"), {'tags': ['custom']} + + rebulk.functional(func, name="fn") + + def func2(input_string): + i = input_string.find("lazy") + if i > -1: + return {'start': i, 'end': i + len("lazy"), 'tags': ['custom']} + + rebulk.functional(func2, name="fn") + + input_string = "The quick brown fox jumps over the lazy dog" + + matches = rebulk.matches(input_string) + assert len(matches) == 4 + + assert len(matches.named("str")) == 1 + assert len(matches.named("fn")) == 2 + assert len(matches.named("false")) == 0 + assert len(matches.tagged("false")) == 0 + assert len(matches.tagged("first")) == 1 + assert len(matches.tagged("other")) == 2 + assert len(matches.tagged("custom")) == 2 + + +def test_rebulk_rules_1(): + rebulk = Rebulk() + + rebulk.regex(r'\d{4}', name="year") + 
rebulk.rules(rm.RemoveAllButLastYear) + + matches = rebulk.matches("1984 keep only last 1968 entry 1982 case") + assert len(matches) == 1 + assert matches[0].value == "1982" + + +def test_rebulk_rules_2(): + rebulk = Rebulk() + + rebulk.regex(r'\d{4}', name="year") + rebulk.string(r'year', name="yearPrefix", private=True) + rebulk.string(r'keep', name="yearSuffix", private=True) + rebulk.rules(rm.PrefixedSuffixedYear) + + matches = rebulk.matches("Keep suffix 1984 keep prefixed year 1968 and remove the rest 1982") + assert len(matches) == 2 + assert matches[0].value == "1984" + assert matches[1].value == "1968" + + +def test_rebulk_rules_3(): + rebulk = Rebulk() + + rebulk.regex(r'\d{4}', name="year") + rebulk.string(r'year', name="yearPrefix", private=True) + rebulk.string(r'keep', name="yearSuffix", private=True) + rebulk.rules(rm.PrefixedSuffixedYearNoLambda) + + matches = rebulk.matches("Keep suffix 1984 keep prefixed year 1968 and remove the rest 1982") + assert len(matches) == 2 + assert matches[0].value == "1984" + assert matches[1].value == "1968" + + +def test_rebulk_rules_4(): + class FirstOnlyRule(Rule): + def when(self, matches, context): + grabbed = matches.named("grabbed", 0) + if grabbed and matches.previous(grabbed): + return grabbed + + def then(self, matches, when_response, context): + matches.remove(when_response) + + rebulk = Rebulk() + + rebulk.regex("This match (.*?)grabbed", name="grabbed") + rebulk.regex("if it's (.*?)first match", private=True) + + rebulk.rules(FirstOnlyRule) + + matches = rebulk.matches("This match is grabbed only if it's the first match") + assert len(matches) == 1 + assert matches[0].value == "This match is grabbed" + + matches = rebulk.matches("if it's NOT the first match, This match is NOT grabbed") + assert len(matches) == 0 + + +class TestMarkers(object): + def test_one_marker(self): + class MarkerRule(Rule): + def when(self, matches, context): + word_match = matches.named("word", 0) + marker = 
matches.markers.at_match(word_match, lambda marker: marker.name == "mark1", 0) + if not marker: + return word_match + + def then(self, matches, when_response, context): + matches.remove(when_response) + + rebulk = Rebulk().regex(r'\(.*?\)', marker=True, name="mark1") \ + .regex(r'\[.*?\]', marker=True, name="mark2") \ + .string("word", name="word") \ + .rules(MarkerRule) + + matches = rebulk.matches("grab (word) only if it's in parenthesis") + + assert len(matches) == 1 + assert matches[0].value == "word" + + matches = rebulk.matches("don't grab [word] if it's in braket") + assert len(matches) == 0 + + matches = rebulk.matches("don't grab word at all") + assert len(matches) == 0 + + def test_multiple_marker(self): + class MarkerRule(Rule): + def when(self, matches, context): + word_match = matches.named("word", 0) + marker = matches.markers.at_match(word_match, + lambda marker: marker.name == "mark1" or marker.name == "mark2") + if len(marker) < 2: + return word_match + + def then(self, matches, when_response, context): + matches.remove(when_response) + + rebulk = Rebulk().regex(r'\(.*?\)', marker=True, name="mark1") \ + .regex(r'\[.*?\]', marker=True, name="mark2") \ + .regex("w.*?d", name="word") \ + .rules(MarkerRule) + + matches = rebulk.matches("[grab (word) only] if it's in parenthesis and brakets") + + assert len(matches) == 1 + assert matches[0].value == "word" + + matches = rebulk.matches("[don't grab](word)[if brakets are outside]") + assert len(matches) == 0 + + matches = rebulk.matches("(grab w[or)d even] if it's partially in parenthesis and brakets") + assert len(matches) == 1 + assert matches[0].value == "w[or)d" + + def test_at_index_marker(self): + class MarkerRule(Rule): + def when(self, matches, context): + word_match = matches.named("word", 0) + marker = matches.markers.at_index(word_match.start, + lambda marker: marker.name == "mark1", 0) + if not marker: + return word_match + + def then(self, matches, when_response, context): + 
matches.remove(when_response) + + rebulk = Rebulk().regex(r'\(.*?\)', marker=True, name="mark1") \ + .regex("w.*?d", name="word") \ + .rules(MarkerRule) + + matches = rebulk.matches("gr(ab wo)rd only if starting of match is inside parenthesis") + + assert len(matches) == 1 + assert matches[0].value == "wo)rd" + + matches = rebulk.matches("don't grab wo(rd if starting of match is not inside parenthesis") + + assert len(matches) == 0 + + def test_remove_marker(self): + class MarkerRule(Rule): + def when(self, matches, context): + marker = matches.markers.named("mark1", 0) + if marker: + return marker + + def then(self, matches, when_response, context): + matches.markers.remove(when_response) + + rebulk = Rebulk().regex(r'\(.*?\)', marker=True, name="mark1") \ + .regex("w.*?d", name="word") \ + .rules(MarkerRule) + + matches = rebulk.matches("grab word event (if it's not) inside parenthesis") + + assert len(matches) == 1 + assert matches[0].value == "word" + + assert not matches.markers + + +class TestUnicode(object): + def test_rebulk_simple(self): + input_string = u"敏捷的棕色狐狸跳過懶狗" + + rebulk = Rebulk() + + rebulk.string(u"敏") + rebulk.regex(u"捷") + + def func(input_string): + i = input_string.find(u"的") + if i > -1: + return i, i + len(u"的") + + rebulk.functional(func) + + matches = rebulk.matches(input_string) + assert len(matches) == 3 + + assert matches[0].value == u"敏" + assert matches[1].value == u"捷" + assert matches[2].value == u"的" + + +class TestImmutable(object): + def test_starting(self): + input_string = "The quick brown fox jumps over the lazy dog" + matches = Rebulk().string("quick").string("over").string("fox").matches(input_string) + + for i in range(0, len(input_string)): + starting = matches.starting(i) + for match in list(starting): + starting.remove(match) + + assert len(matches) == 3 + + def test_ending(self): + input_string = "The quick brown fox jumps over the lazy dog" + matches = 
Rebulk().string("quick").string("over").string("fox").matches(input_string) + + for i in range(0, len(input_string)): + starting = matches.ending(i) + for match in list(starting): + starting.remove(match) + + assert len(matches) == 3 + + def test_named(self): + input_string = "The quick brown fox jumps over the lazy dog" + matches = Rebulk().defaults(name='test').string("quick").string("over").string("fox").matches(input_string) + + named = matches.named('test') + for match in list(named): + named.remove(match) + + assert len(named) == 0 + assert len(matches) == 3 + diff --git a/lib/rebulk/test/test_rules.py b/lib/rebulk/test/test_rules.py new file mode 100644 index 0000000000000000000000000000000000000000..3d53b591e2fee6a2e15653190095d801e47b4389 --- /dev/null +++ b/lib/rebulk/test/test_rules.py @@ -0,0 +1,197 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# pylint: disable=no-self-use, pointless-statement, missing-docstring, invalid-name, no-member +import pytest +from rebulk.test.default_rules_module import RuleRemove0, RuleAppend0, RuleRename0, RuleAppend1, RuleRemove1, \ + RuleRename1, RuleAppend2, RuleRename2, RuleAppend3, RuleRename3, RuleAppendTags0, RuleRemoveTags0, \ + RuleAppendTags1, RuleRemoveTags1 + +from ..rules import Rules +from ..match import Matches, Match + +from .rules_module import Rule1, Rule2, Rule3, Rule0, Rule1Disabled +import rebulk.test.rules_module as rm + + +def test_rule_priority(): + matches = Matches([Match(1, 2)]) + + rules = Rules(Rule1, Rule2()) + + rules.execute_all_rules(matches, {}) + assert len(matches) == 0 + matches = Matches([Match(1, 2)]) + + rules = Rules(Rule1(), Rule0) + + rules.execute_all_rules(matches, {}) + assert len(matches) == 1 + assert matches[0] == Match(3, 4) + + +def test_rules_duplicates(): + matches = Matches([Match(1, 2)]) + + rules = Rules(Rule1, Rule1) + + with pytest.raises(ValueError): + rules.execute_all_rules(matches, {}) + + +def test_rule_disabled(): + matches = Matches([Match(1, 2)]) + + 
rules = Rules(Rule1Disabled(), Rule2()) + + rules.execute_all_rules(matches, {}) + assert len(matches) == 2 + assert matches[0] == Match(1, 2) + assert matches[1] == Match(3, 4) + + +def test_rule_when(): + matches = Matches([Match(1, 2)]) + + rules = Rules(Rule3()) + + rules.execute_all_rules(matches, {'when': False}) + assert len(matches) == 1 + assert matches[0] == Match(1, 2) + + matches = Matches([Match(1, 2)]) + + rules.execute_all_rules(matches, {'when': True}) + assert len(matches) == 2 + assert matches[0] == Match(1, 2) + assert matches[1] == Match(3, 4) + + +class TestDefaultRules(object): + def test_remove(self): + rules = Rules(RuleRemove0) + + matches = Matches([Match(1, 2)]) + rules.execute_all_rules(matches, {}) + + assert len(matches) == 0 + + rules = Rules(RuleRemove1) + + matches = Matches([Match(1, 2)]) + rules.execute_all_rules(matches, {}) + + assert len(matches) == 0 + + def test_append(self): + rules = Rules(RuleAppend0) + + matches = Matches([Match(1, 2)]) + rules.execute_all_rules(matches, {}) + + assert len(matches) == 2 + + rules = Rules(RuleAppend1) + + matches = Matches([Match(1, 2)]) + rules.execute_all_rules(matches, {}) + + assert len(matches) == 2 + + rules = Rules(RuleAppend2) + + matches = Matches([Match(1, 2)]) + rules.execute_all_rules(matches, {}) + + assert len(matches) == 2 + assert len(matches.named('renamed')) == 1 + + rules = Rules(RuleAppend3) + + matches = Matches([Match(1, 2)]) + rules.execute_all_rules(matches, {}) + + assert len(matches) == 2 + assert len(matches.named('renamed')) == 1 + + def test_rename(self): + rules = Rules(RuleRename0) + + matches = Matches([Match(1, 2, name='original')]) + rules.execute_all_rules(matches, {}) + + assert len(matches.named('original')) == 1 + assert len(matches.named('renamed')) == 0 + + rules = Rules(RuleRename1) + + matches = Matches([Match(5, 10, name='original')]) + rules.execute_all_rules(matches, {}) + + assert len(matches.named('original')) == 0 + assert 
len(matches.named('renamed')) == 1 + + rules = Rules(RuleRename2) + + matches = Matches([Match(5, 10, name='original')]) + rules.execute_all_rules(matches, {}) + + assert len(matches.named('original')) == 0 + assert len(matches.named('renamed')) == 1 + + rules = Rules(RuleRename3) + + matches = Matches([Match(5, 10, name='original')]) + rules.execute_all_rules(matches, {}) + + assert len(matches.named('original')) == 0 + assert len(matches.named('renamed')) == 1 + + def test_append_tags(self): + rules = Rules(RuleAppendTags0) + + matches = Matches([Match(1, 2, name='tags', tags=['other'])]) + rules.execute_all_rules(matches, {}) + + assert len(matches.named('tags')) == 1 + assert matches.named('tags', index=0).tags == ['other', 'new-tag'] + + rules = Rules(RuleAppendTags1) + + matches = Matches([Match(1, 2, name='tags', tags=['other'])]) + rules.execute_all_rules(matches, {}) + + assert len(matches.named('tags')) == 1 + assert matches.named('tags', index=0).tags == ['other', 'new-tag'] + + def test_remove_tags(self): + rules = Rules(RuleRemoveTags0) + + matches = Matches([Match(1, 2, name='tags', tags=['other', 'new-tag'])]) + rules.execute_all_rules(matches, {}) + + assert len(matches.named('tags')) == 1 + assert matches.named('tags', index=0).tags == ['other'] + + rules = Rules(RuleRemoveTags1) + + matches = Matches([Match(1, 2, name='tags', tags=['other', 'new-tag'])]) + rules.execute_all_rules(matches, {}) + + assert len(matches.named('tags')) == 1 + assert matches.named('tags', index=0).tags == ['other'] + + +def test_rule_module(): + rules = Rules(rm) + + matches = Matches([Match(1, 2)]) + rules.execute_all_rules(matches, {}) + + assert len(matches) == 1 + + +def test_rule_repr(): + assert str(Rule0()) == "<Rule0>" + assert str(Rule1()) == "<Rule1>" + assert str(Rule2()) == "<Rule2>" + assert str(Rule1Disabled()) == "<Disabled Rule1>" diff --git a/lib/rebulk/test/test_toposort.py b/lib/rebulk/test/test_toposort.py new file mode 100644 index 
0000000000000000000000000000000000000000..76ea60313bc4fcfd9b79dd94c8dd867c079aacb3 --- /dev/null +++ b/lib/rebulk/test/test_toposort.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright 2014 True Blade Systems, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Original: +# - https://bitbucket.org/ericvsmith/toposort (1.4) +# Modifications: +# - port to pytest +# pylint: skip-file + +import pytest +from ..toposort import toposort, toposort_flatten, CyclicDependency + + +class TestCase(object): + def test_simple(self): + results = list(toposort({2: set([11]), 9: set([11, 8]), 10: set([11, 3]), 11: set([7, 5]), 8: set([7, 3])})) + expected = [set([3, 5, 7]), set([8, 11]), set([2, 9, 10])] + assert results == expected + + # make sure self dependencies are ignored + results = list(toposort({2: set([2, 11]), 9: set([11, 8]), 10: set([10, 11, 3]), 11: set([7, 5]), 8: set([7, 3])})) + expected = [set([3, 5, 7]), set([8, 11]), set([2, 9, 10])] + assert results == expected + + assert list(toposort({1: set()})) == [set([1])] + assert list(toposort({1: set([1])})) == [set([1])] + + def test_no_dependencies(self): + assert list(toposort({1: set([2]), 3: set([4]), 5: set([6])})) == [set([2, 4, 6]), set([1, 3, 5])] + assert list(toposort({1: set(), 3: set(), 5: set()})) == [set([1, 3, 5])] + + def test_empty(self): + assert list(toposort({})) == [] + + def test_strings(self): + results = list(toposort({'2': set(['11']), '9': set(['11', '8']), '10': set(['11', '3']), '11': set(['7', '5']), '8': set(['7', '3'])})) + expected = [set(['3', '5', '7']), set(['8', '11']), set(['2', '9', '10'])] + assert results == expected + + def test_objects(self): + o2 = object() + o3 = object() + o5 = object() + o7 = object() + o8 = object() + o9 = object() + o10 = object() + o11 = 
object() + results = list(toposort({o2: set([o11]), o9: set([o11, o8]), o10: set([o11, o3]), o11: set([o7, o5]), o8: set([o7, o3, o8])})) + expected = [set([o3, o5, o7]), set([o8, o11]), set([o2, o9, o10])] + assert results == expected + + def test_cycle(self): + # a simple, 2 element cycle + with pytest.raises(CyclicDependency): + list(toposort({1: set([2]), 2: set([1])})) + + # an indirect cycle + with pytest.raises(CyclicDependency): + list(toposort({1: set([2]), 2: set([3]), 3: set([1])})) + + def test_input_not_modified(self): + data = {2: set([11]), + 9: set([11, 8]), + 10: set([11, 3]), + 11: set([7, 5]), + 8: set([7, 3, 8]), # includes something self-referential + } + orig = data.copy() + results = list(toposort(data)) + assert data == orig + + def test_input_not_modified_when_cycle_error(self): + data = {1: set([2]), + 2: set([1]), + 3: set([4]), + } + orig = data.copy() + with pytest.raises(CyclicDependency): + list(toposort(data)) + assert data == orig + + +class TestCaseAll(object): + def test_sort_flatten(self): + data = {2: set([11]), + 9: set([11, 8]), + 10: set([11, 3]), + 11: set([7, 5]), + 8: set([7, 3, 8]), # includes something self-referential + } + expected = [set([3, 5, 7]), set([8, 11]), set([2, 9, 10])] + assert list(toposort(data)) == expected + + # now check the sorted results + results = [] + for item in expected: + results.extend(sorted(item)) + assert toposort_flatten(data) == results + + # and the unsorted results. 
break the results up into groups to compare them + actual = toposort_flatten(data, False) + results = [set([i for i in actual[0:3]]), set([i for i in actual[3:5]]), set([i for i in actual[5:8]])] + assert results == expected diff --git a/lib/rebulk/test/test_validators.py b/lib/rebulk/test/test_validators.py new file mode 100644 index 0000000000000000000000000000000000000000..ef5c756d8a9b41072f978dd8aeb71839f4f8304f --- /dev/null +++ b/lib/rebulk/test/test_validators.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# pylint: disable=no-self-use, pointless-statement, missing-docstring, invalid-name + +from functools import partial + +from rebulk.pattern import StringPattern + +from ..validators import chars_before, chars_after, chars_surround, validators + +chars = ' _.' +left = partial(chars_before, chars) +right = partial(chars_after, chars) +surrounding = partial(chars_surround, chars) + + +def test_left_chars(): + matches = list(StringPattern("word", validator=left).matches("xxxwordxxx")) + assert len(matches) == 0 + + matches = list(StringPattern("word", validator=left).matches("xxx_wordxxx")) + assert len(matches) == 1 + + matches = list(StringPattern("word", validator=left).matches("wordxxx")) + assert len(matches) == 1 + + +def test_right_chars(): + matches = list(StringPattern("word", validator=right).matches("xxxwordxxx")) + assert len(matches) == 0 + + matches = list(StringPattern("word", validator=right).matches("xxxword.xxx")) + assert len(matches) == 1 + + matches = list(StringPattern("word", validator=right).matches("xxxword")) + assert len(matches) == 1 + + +def test_surrounding_chars(): + matches = list(StringPattern("word", validator=surrounding).matches("xxxword xxx")) + assert len(matches) == 0 + + matches = list(StringPattern("word", validator=surrounding).matches("xxx.wordxxx")) + assert len(matches) == 0 + + matches = list(StringPattern("word", validator=surrounding).matches("xxx word_xxx")) + assert len(matches) == 1 + + 
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright 2014 True Blade Systems, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Original:
# - https://bitbucket.org/ericvsmith/toposort (1.4)
# Modifications:
# - merged Pull request #2 for CyclicDependency error
# - import reduce as original name
# - support python 2.6 dict comprehension
# - copy every dependency set so the caller's input is really left unmodified

# pylint: skip-file
from functools import reduce


class CyclicDependency(ValueError):
    """
    Raised by :func:`toposort` when some items cannot be ordered because
    their dependencies form a cycle.

    The unresolved part of the graph (item -> remaining dependencies) is kept
    in the ``cyclic`` attribute.
    """
    def __init__(self, cyclic):
        s = 'Cyclic dependencies exist among these items: {0}'.format(', '.join(repr(x) for x in cyclic.items()))
        super(CyclicDependency, self).__init__(s)
        self.cyclic = cyclic


def toposort(data):
    """
    Dependencies are expressed as a dictionary whose keys are items
    and whose values are a set of dependent items. Output is a generator of
    sets in topological order. The first set consists of items with no
    dependencies, each subsequent set consists of items that depend upon
    items in the preceding sets.

    Neither the input dictionary nor the sets it contains are modified.

    :param data: mapping of item -> set of items it depends on
    :type data: dict
    :return: generator yielding one set of items per dependency level
    :rtype: generator[set]
    :raises CyclicDependency: once every orderable level has been yielded,
        if the remaining items depend on each other in a cycle
    """

    # Special case empty input.
    if len(data) == 0:
        return

    # Work on a private copy of the graph. A shallow ``data.copy()`` is not
    # enough here: the dependency sets themselves would still be shared with
    # the caller, and dropping self dependencies would mutate them.
    data = dict((item, set(dep) - set([item])) for item, dep in data.items())

    # Find all items that don't depend on anything.
    extra_items_in_deps = reduce(set.union, data.values()) - set(data.keys())
    # Add empty dependencies where needed.
    data.update(dict((item, set()) for item in extra_items_in_deps))
    while True:
        ordered = set(item for item, dep in data.items() if len(dep) == 0)
        if not ordered:
            break
        yield ordered
        data = dict((item, (dep - ordered))
                    for item, dep in data.items()
                    if item not in ordered)
    # Anything left over could never reach an empty dependency set: a cycle.
    if len(data) != 0:
        raise CyclicDependency(data)


def toposort_flatten(data, sort=True):
    """
    Returns a single list of dependencies. For any set returned by
    toposort(), those items are sorted and appended to the result (just to
    make the results deterministic).

    :param data: mapping of item -> set of items it depends on
    :type data: dict
    :param sort: sort the items of each level; when False they are appended
        in set iteration order
    :type sort: bool
    :return: Single list of dependencies.
    :rtype: list
    :raises CyclicDependency: if the dependencies contain a cycle
    """

    result = []
    for d in toposort(data):
        result.extend((sorted if sort else list)(d))
    return result
from types import GeneratorType

try:
    from collections.abc import MutableSet
except ImportError:  # pragma: no cover - Python 2 has no collections.abc
    from collections import MutableSet


def find_all(string, sub, start=None, end=None, ignore_case=False):
    """
    Return all indices in string s where substring sub is
    found, such that sub is contained in the slice s[start:end].

    Matches do not overlap: the search resumes right after the previous match.
    An empty ``sub`` matches once at every position of the slice.

    >>> list(find_all('The quick brown fox jumps over the lazy dog', 'fox'))
    [16]

    >>> list(find_all('The quick brown fox jumps over the lazy dog', 'mountain'))
    []

    >>> list(find_all('The quick brown fox jumps over the lazy dog', 'The'))
    [0]

    >>> list(find_all(
    ... 'Carved symbols in a mountain hollow on the bank of an inlet irritated an eccentric person',
    ... 'an'))
    [44, 51, 70]

    >>> list(find_all(
    ... 'Carved symbols in a mountain hollow on the bank of an inlet irritated an eccentric person',
    ... 'an',
    ... 50,
    ... 60))
    [51]

    >>> list(find_all('abc', ''))
    [0, 1, 2, 3]

    :param string: the input string
    :type string: str
    :param sub: the substring
    :type sub: str
    :param start: index where the search starts (None for the beginning)
    :type start: int
    :param end: index where the search stops (None for the end)
    :type end: int
    :param ignore_case: compare the lower-cased versions of both strings
    :type ignore_case: bool
    :return: all indices in the input string
    :rtype: __generator[int]
    """
    if ignore_case:
        sub = sub.lower()
        string = string.lower()
    # Always move forward by at least one character: with an empty ``sub``
    # ``str.find`` returns ``start`` itself, so a zero step would yield the
    # same index forever.
    step = len(sub) or 1
    while True:
        start = string.find(sub, start, end)
        if start == -1:
            return
        yield start
        start += step


def is_iterable(obj):
    """
    Are we being asked to look up a list of things, instead of a single thing?
    We check for the `__iter__` attribute so that this can cover types that
    don't have to be known by this module, such as NumPy arrays.

    Strings, however, should be considered as atomic values to look up, not
    iterables.

    We don't need to check for the Python 2 `unicode` type, because it doesn't
    have an `__iter__` attribute anyway.

    :param obj: any object
    :return: True if obj is a generator or a non-string object with `__iter__`
    :rtype: bool
    """
    return (hasattr(obj, '__iter__') and not isinstance(obj, str)) or isinstance(obj, GeneratorType)


def extend_safe(target, source):
    """
    Extends target list with the elements of source list that are not already in target list.

    The target list is modified in place; duplicates inside source are added only once.

    :param target: list to extend
    :type target: list
    :param source: elements to add
    :type source: list
    """
    for elt in source:
        if elt not in target:
            target.append(elt)


class _Ref(object):
    """
    Reference for IdentitySet

    Wraps a value so that equality and hashing are based on the identity of the value.
    """
    def __init__(self, value):
        self.value = value

    def __eq__(self, other):
        # Anything that is not a _Ref is simply different, instead of failing on ``other.value``.
        return isinstance(other, _Ref) and self.value is other.value

    def __ne__(self, other):
        # Python 2 does not derive ``!=`` from ``__eq__``.
        return not self.__eq__(other)

    def __hash__(self):
        return id(self.value)


class IdentitySet(MutableSet):  # pragma: no cover
    """
    Set based on identity

    Two elements are the same only if they are the same object (``is``), whatever their ``==`` result.
    """
    def __init__(self, items=None):
        if items is None:
            items = []
        self.refs = set(map(_Ref, items))

    def __contains__(self, elem):
        return _Ref(elem) in self.refs

    def __iter__(self):
        return (ref.value for ref in self.refs)

    def __len__(self):
        return len(self.refs)

    def add(self, elem):
        self.refs.add(_Ref(elem))

    def discard(self, elem):
        self.refs.discard(_Ref(elem))

    def update(self, iterable):
        """
        Update set with iterable

        :param iterable: elements to add
        :type iterable: iterable
        """
        for elem in iterable:
            self.add(elem)

    def __repr__(self):  # pragma: no cover
        return "%s(%s)" % (type(self).__name__, list(self))
def chars_before(chars, match):
    """
    Validate the match if left character is in a given sequence.

    A match starting at the beginning of the input string is always valid.

    :param chars: accepted characters
    :type chars: str
    :param match: match to validate (its start and input_string are read)
    :type match: Match
    :return: True if the match is valid
    :rtype: bool
    """
    if match.start <= 0:
        return True
    return match.input_string[match.start - 1] in chars


def chars_after(chars, match):
    """
    Validate the match if right character is in a given sequence.

    A match ending at the end of the input string is always valid.

    :param chars: accepted characters
    :type chars: str
    :param match: match to validate (its end and input_string are read)
    :type match: Match
    :return: True if the match is valid
    :rtype: bool
    """
    if match.end >= len(match.input_string):
        return True
    return match.input_string[match.end] in chars


def chars_surround(chars, match):
    """
    Validate the match if surrounding characters are in a given sequence.

    :param chars: accepted characters
    :type chars: str
    :param match: match to validate
    :type match: Match
    :return: True if both the left and the right characters are valid
    :rtype: bool
    """
    return chars_before(chars, match) and chars_after(chars, match)


def validators(*chained_validators):
    """
    Creates a validator chain from several validator functions.

    The chain accepts a match only if every validator accepts it; evaluation
    stops at the first validator that rejects the match. An empty chain
    accepts everything.

    :param chained_validators: validator functions taking the match as single argument
    :type chained_validators: callable
    :return: the chained validator function
    :rtype: callable
    """
    def validator_chain(match):  # pylint:disable=missing-docstring
        return all(chained_validator(match) for chained_validator in chained_validators)
    return validator_chain
Most +ordinary characters, like "A", "a", or "0", are the simplest regular +expressions; they simply match themselves. You can concatenate ordinary +characters, so last matches the string 'last'. + +There are a few differences between the old (legacy) behaviour and the new +(enhanced) behaviour, which are indicated by VERSION0 or VERSION1. + +The special characters are: + "." Matches any character except a newline. + "^" Matches the start of the string. + "$" Matches the end of the string or just before the + newline at the end of the string. + "*" Matches 0 or more (greedy) repetitions of the preceding + RE. Greedy means that it will match as many repetitions + as possible. + "+" Matches 1 or more (greedy) repetitions of the preceding + RE. + "?" Matches 0 or 1 (greedy) of the preceding RE. + *?,+?,?? Non-greedy versions of the previous three special + characters. + *+,++,?+ Possessive versions of the previous three special + characters. + {m,n} Matches from m to n repetitions of the preceding RE. + {m,n}? Non-greedy version of the above. + {m,n}+ Possessive version of the above. + {...} Fuzzy matching constraints. + "\\" Either escapes special characters or signals a special + sequence. + [...] Indicates a set of characters. A "^" as the first + character indicates a complementing set. + "|" A|B, creates an RE that will match either A or B. + (...) Matches the RE inside the parentheses. The contents are + captured and can be retrieved or matched later in the + string. + (?flags-flags) VERSION1: Sets/clears the flags for the remainder of + the group or pattern; VERSION0: Sets the flags for the + entire pattern. + (?:...) Non-capturing version of regular parentheses. + (?>...) Atomic non-capturing version of regular parentheses. + (?flags-flags:...) Non-capturing version of regular parentheses with local + flags. + (?P<name>...) The substring matched by the group is accessible by + name. + (?<name>...) The substring matched by the group is accessible by + name. 
+ (?P=name) Matches the text matched earlier by the group named + name. + (?#...) A comment; ignored. + (?=...) Matches if ... matches next, but doesn't consume the + string. + (?!...) Matches if ... doesn't match next. + (?<=...) Matches if preceded by .... + (?<!...) Matches if not preceded by .... + (?(id)yes|no) Matches yes pattern if group id matched, the (optional) + no pattern otherwise. + (?(DEFINE)...) If there's no group called "DEFINE", then ... will be + ignored, but any group definitions will be available. + (?|...|...) (?|A|B), creates an RE that will match either A or B, + but reuses capture group numbers across the + alternatives. + (*FAIL) Forces matching to fail, which means immediate + backtracking. + (*F) Abbreviation for (*FAIL). + (*PRUNE) Discards the current backtracking information. Its + effect doesn't extend outside an atomic group or a + lookaround. + (*SKIP) Similar to (*PRUNE), except that it also sets where in + the text the next attempt at matching the entire + pattern will start. Its effect doesn't extend outside + an atomic group or a lookaround. + +The fuzzy matching constraints are: "i" to permit insertions, "d" to permit +deletions, "s" to permit substitutions, "e" to permit any of these. Limits are +optional with "<=" and "<". If any type of error is provided then any type not +provided is not permitted. + +A cost equation may be provided. + +Examples: + (?:fuzzy){i<=2} + (?:fuzzy){i<=1,s<=2,d<=1,1i+1s+1d<3} + +VERSION1: Set operators are supported, and a set can include nested sets. The +set operators, in order of increasing precedence, are: + || Set union ("x||y" means "x or y"). + ~~ (double tilde) Symmetric set difference ("x~~y" means "x or y, but not + both"). + && Set intersection ("x&&y" means "x and y"). + -- (double dash) Set difference ("x--y" means "x but not y"). + +Implicit union, ie, simple juxtaposition like in [ab], has the highest +precedence. 
+ +VERSION0 and VERSION1: +The special sequences consist of "\\" and a character from the list below. If +the ordinary character is not on the list, then the resulting RE will match the +second character. + \number Matches the contents of the group of the same number if + number is no more than 2 digits, otherwise the character + with the 3-digit octal code. + \a Matches the bell character. + \A Matches only at the start of the string. + \b Matches the empty string, but only at the start or end of a + word. + \B Matches the empty string, but not at the start or end of a + word. + \d Matches any decimal digit; equivalent to the set [0-9] when + matching a bytestring or a Unicode string with the ASCII + flag, or the whole range of Unicode digits when matching a + Unicode string. + \D Matches any non-digit character; equivalent to [^\d]. + \f Matches the formfeed character. + \g<name> Matches the text matched by the group named name. + \G Matches the empty string, but only at the position where + the search started. + \K Keeps only what follows for the entire match. + \L<name> Named list. The list is provided as a keyword argument. + \m Matches the empty string, but only at the start of a word. + \M Matches the empty string, but only at the end of a word. + \n Matches the newline character. + \N{name} Matches the named character. + \p{name=value} Matches the character if its property has the specified + value. + \P{name=value} Matches the character if its property hasn't the specified + value. + \r Matches the carriage-return character. + \s Matches any whitespace character; equivalent to + [ \t\n\r\f\v]. + \S Matches any non-whitespace character; equivalent to [^\s]. + \t Matches the tab character. + \uXXXX Matches the Unicode codepoint with 4-digit hex code XXXX. + \UXXXXXXXX Matches the Unicode codepoint with 8-digit hex code + XXXXXXXX. + \v Matches the vertical tab character. 
+ \w Matches any alphanumeric character; equivalent to + [a-zA-Z0-9_] when matching a bytestring or a Unicode string + with the ASCII flag, or the whole range of Unicode + alphanumeric characters (letters plus digits plus + underscore) when matching a Unicode string. With LOCALE, it + will match the set [0-9_] plus characters defined as + letters for the current locale. + \W Matches the complement of \w; equivalent to [^\w]. + \xXX Matches the character with 2-digit hex code XX. + \X Matches a grapheme. + \Z Matches only at the end of the string. + \\ Matches a literal backslash. + +This module exports the following functions: + match Match a regular expression pattern at the beginning of a string. + fullmatch Match a regular expression pattern against all of a string. + search Search a string for the presence of a pattern. + sub Substitute occurrences of a pattern found in a string using a + template string. + subf Substitute occurrences of a pattern found in a string using a + format string. + subn Same as sub, but also return the number of substitutions made. + subfn Same as subf, but also return the number of substitutions made. + split Split a string by the occurrences of a pattern. VERSION1: will + split at zero-width match; VERSION0: won't split at zero-width + match. + splititer Return an iterator yielding the parts of a split string. + findall Find all occurrences of a pattern in a string. + finditer Return an iterator yielding a match object for each match. + compile Compile a pattern into a Pattern object. + purge Clear the regular expression cache. + escape Backslash all non-alphanumerics or special characters in a + string. + +Most of the functions support a concurrent parameter: if True, the GIL will be +released during matching, allowing other Python threads to run concurrently. If +the string changes during matching, the behaviour is undefined. This parameter +is not needed when working on the builtin (immutable) string classes. 
+ +Some of the functions in this module take flags as optional parameters. Most of +these flags can also be set within an RE: + A a ASCII Make \w, \W, \b, \B, \d, and \D match the + corresponding ASCII character categories. Default + when matching a bytestring. + B b BESTMATCH Find the best fuzzy match (default is first). + D DEBUG Print the parsed pattern. + E e ENHANCEMATCH Attempt to improve the fit after finding the first + fuzzy match. + F f FULLCASE Use full case-folding when performing + case-insensitive matching in Unicode. + I i IGNORECASE Perform case-insensitive matching. + L L LOCALE Make \w, \W, \b, \B, \d, and \D dependent on the + current locale. (One byte per character only.) + M m MULTILINE "^" matches the beginning of lines (after a newline) + as well as the string. "$" matches the end of lines + (before a newline) as well as the end of the string. + P p POSIX Perform POSIX-standard matching (leftmost longest). + R r REVERSE Searches backwards. + S s DOTALL "." matches any character at all, including the + newline. + U u UNICODE Make \w, \W, \b, \B, \d, and \D dependent on the + Unicode locale. Default when matching a Unicode + string. + V0 V0 VERSION0 Turn on the old legacy behaviour. + V1 V1 VERSION1 Turn on the new enhanced behaviour. This flag + includes the FULLCASE flag. + W w WORD Make \b and \B work with default Unicode word breaks + and make ".", "^" and "$" work with Unicode line + breaks. + X x VERBOSE Ignore whitespace and comments for nicer looking REs. + +This module also defines an exception 'error'. + +""" + +# Public symbols. 
+__all__ = ["compile", "escape", "findall", "finditer", "fullmatch", "match", + "purge", "search", "split", "splititer", "sub", "subf", "subfn", "subn", + "template", "Scanner", "A", "ASCII", "B", "BESTMATCH", "D", "DEBUG", "E", + "ENHANCEMATCH", "S", "DOTALL", "F", "FULLCASE", "I", "IGNORECASE", "L", + "LOCALE", "M", "MULTILINE", "P", "POSIX", "R", "REVERSE", "T", "TEMPLATE", + "U", "UNICODE", "V0", "VERSION0", "V1", "VERSION1", "X", "VERBOSE", "W", + "WORD", "error", "Regex"] + +__version__ = "2.4.85" + +# -------------------------------------------------------------------- +# Public interface. + +def match(pattern, string, flags=0, pos=None, endpos=None, partial=False, + concurrent=None, **kwargs): + """Try to apply the pattern at the start of the string, returning a match + object, or None if no match was found.""" + return _compile(pattern, flags, kwargs).match(string, pos, endpos, + concurrent, partial) + +def fullmatch(pattern, string, flags=0, pos=None, endpos=None, partial=False, + concurrent=None, **kwargs): + """Try to apply the pattern against all of the string, returning a match + object, or None if no match was found.""" + return _compile(pattern, flags, kwargs).fullmatch(string, pos, endpos, + concurrent, partial) + +def search(pattern, string, flags=0, pos=None, endpos=None, partial=False, + concurrent=None, **kwargs): + """Search through string looking for a match to the pattern, returning a + match object, or None if no match was found.""" + return _compile(pattern, flags, kwargs).search(string, pos, endpos, + concurrent, partial) + +def sub(pattern, repl, string, count=0, flags=0, pos=None, endpos=None, + concurrent=None, **kwargs): + """Return the string obtained by replacing the leftmost (or rightmost with a + reverse pattern) non-overlapping occurrences of the pattern in string by the + replacement repl. 
repl can be either a string or a callable; if a string, + backslash escapes in it are processed; if a callable, it's passed the match + object and must return a replacement string to be used.""" + return _compile(pattern, flags, kwargs).sub(repl, string, count, pos, + endpos, concurrent) + +def subf(pattern, format, string, count=0, flags=0, pos=None, endpos=None, + concurrent=None, **kwargs): + """Return the string obtained by replacing the leftmost (or rightmost with a + reverse pattern) non-overlapping occurrences of the pattern in string by the + replacement format. format can be either a string or a callable; if a string, + it's treated as a format string; if a callable, it's passed the match object + and must return a replacement string to be used.""" + return _compile(pattern, flags, kwargs).subf(format, string, count, pos, + endpos, concurrent) + +def subn(pattern, repl, string, count=0, flags=0, pos=None, endpos=None, + concurrent=None, **kwargs): + """Return a 2-tuple containing (new_string, number). new_string is the string + obtained by replacing the leftmost (or rightmost with a reverse pattern) + non-overlapping occurrences of the pattern in the source string by the + replacement repl. number is the number of substitutions that were made. repl + can be either a string or a callable; if a string, backslash escapes in it + are processed; if a callable, it's passed the match object and must return a + replacement string to be used.""" + return _compile(pattern, flags, kwargs).subn(repl, string, count, pos, + endpos, concurrent) + +def subfn(pattern, format, string, count=0, flags=0, pos=None, endpos=None, + concurrent=None, **kwargs): + """Return a 2-tuple containing (new_string, number). new_string is the string + obtained by replacing the leftmost (or rightmost with a reverse pattern) + non-overlapping occurrences of the pattern in the source string by the + replacement format. number is the number of substitutions that were made. 
format + can be either a string or a callable; if a string, it's treated as a format + string; if a callable, it's passed the match object and must return a + replacement string to be used.""" + return _compile(pattern, flags, kwargs).subfn(format, string, count, pos, + endpos, concurrent) + +def split(pattern, string, maxsplit=0, flags=0, concurrent=None, **kwargs): + """Split the source string by the occurrences of the pattern, returning a + list containing the resulting substrings. If capturing parentheses are used + in pattern, then the text of all groups in the pattern are also returned as + part of the resulting list. If maxsplit is nonzero, at most maxsplit splits + occur, and the remainder of the string is returned as the final element of + the list.""" + return _compile(pattern, flags, kwargs).split(string, maxsplit, concurrent) + +def splititer(pattern, string, maxsplit=0, flags=0, concurrent=None, **kwargs): + "Return an iterator yielding the parts of a split string." + return _compile(pattern, flags, kwargs).splititer(string, maxsplit, + concurrent) + +def findall(pattern, string, flags=0, pos=None, endpos=None, overlapped=False, + concurrent=None, **kwargs): + """Return a list of all matches in the string. The matches may be overlapped + if overlapped is True. If one or more groups are present in the pattern, + return a list of groups; this will be a list of tuples if the pattern has + more than one group. Empty matches are included in the result.""" + return _compile(pattern, flags, kwargs).findall(string, pos, endpos, + overlapped, concurrent) + +def finditer(pattern, string, flags=0, pos=None, endpos=None, overlapped=False, + partial=False, concurrent=None, **kwargs): + """Return an iterator over all matches in the string. The matches may be + overlapped if overlapped is True. For each match, the iterator returns a + match object. 
Empty matches are included in the result.""" + return _compile(pattern, flags, kwargs).finditer(string, pos, endpos, + overlapped, concurrent, partial) + +def compile(pattern, flags=0, **kwargs): + "Compile a regular expression pattern, returning a pattern object." + return _compile(pattern, flags, kwargs) + +def purge(): + "Clear the regular expression cache" + _cache.clear() + _locale_sensitive.clear() + +def template(pattern, flags=0): + "Compile a template pattern, returning a pattern object." + return _compile(pattern, flags | TEMPLATE) + +def escape(pattern, special_only=False): + "Escape all non-alphanumeric characters or special characters in pattern." + s = [] + if special_only: + for c in pattern: + if c in _METACHARS: + s.append("\\") + s.append(c) + elif c == "\x00": + s.append("\\000") + else: + s.append(c) + else: + for c in pattern: + if c in _ALNUM: + s.append(c) + elif c == "\x00": + s.append("\\000") + else: + s.append("\\") + s.append(c) + + return pattern[ : 0].join(s) + +# -------------------------------------------------------------------- +# Internals. + +import _regex_core +import _regex +from threading import RLock as _RLock +from locale import getlocale as _getlocale +from _regex_core import * +from _regex_core import (_ALL_VERSIONS, _ALL_ENCODINGS, _FirstSetError, + _UnscopedFlagSet, _check_group_features, _compile_firstset, + _compile_replacement, _flatten_code, _fold_case, _get_required_string, + _parse_pattern, _shrink_cache) +from _regex_core import (ALNUM as _ALNUM, Info as _Info, OP as _OP, Source as + _Source, Fuzzy as _Fuzzy) + +# Version 0 is the old behaviour, compatible with the original 're' module. +# Version 1 is the new behaviour, which differs slightly. + +DEFAULT_VERSION = VERSION0 + +_METACHARS = frozenset("()[]{}?*+|^$\\.") + +_regex_core.DEFAULT_VERSION = DEFAULT_VERSION + +# Caches for the patterns and replacements. 
+_cache = {} +_cache_lock = _RLock() +_named_args = {} +_replacement_cache = {} +_locale_sensitive = {} + +# Maximum size of the cache. +_MAXCACHE = 500 +_MAXREPCACHE = 500 + +def _compile(pattern, flags=0, kwargs={}): + "Compiles a regular expression to a PatternObject." + + # We won't bother to cache the pattern if we're debugging. + debugging = (flags & DEBUG) != 0 + + # What locale is this pattern using? + locale_key = (type(pattern), pattern) + if _locale_sensitive.get(locale_key, True) or (flags & LOCALE) != 0: + # This pattern is, or might be, locale-sensitive. + pattern_locale = _getlocale()[1] + else: + # This pattern is definitely not locale-sensitive. + pattern_locale = None + + if not debugging: + try: + # Do we know what keyword arguments are needed? + args_key = pattern, type(pattern), flags + args_needed = _named_args[args_key] + + # Are we being provided with its required keyword arguments? + args_supplied = set() + if args_needed: + for k, v in args_needed: + try: + args_supplied.add((k, frozenset(kwargs[k]))) + except KeyError: + raise error("missing named list: {!r}".format(k)) + + args_supplied = frozenset(args_supplied) + + # Have we already seen this regular expression and named list? + pattern_key = (pattern, type(pattern), flags, args_supplied, + DEFAULT_VERSION, pattern_locale) + return _cache[pattern_key] + except KeyError: + # It's a new pattern, or new named list for a known pattern. + pass + + # Guess the encoding from the class of the pattern string. + if isinstance(pattern, unicode): + guess_encoding = UNICODE + elif isinstance(pattern, str): + guess_encoding = ASCII + elif isinstance(pattern, _pattern_type): + if flags: + raise ValueError("cannot process flags argument with a compiled pattern") + + return pattern + else: + raise TypeError("first argument must be a string or compiled pattern") + + # Set the default version in the core code in case it has been changed. 
+ _regex_core.DEFAULT_VERSION = DEFAULT_VERSION + + global_flags = flags + + while True: + caught_exception = None + try: + source = _Source(pattern) + info = _Info(global_flags, source.char_type, kwargs) + info.guess_encoding = guess_encoding + source.ignore_space = bool(info.flags & VERBOSE) + parsed = _parse_pattern(source, info) + break + except _UnscopedFlagSet: + # Remember the global flags for the next attempt. + global_flags = info.global_flags + except error, e: + caught_exception = e + + if caught_exception: + raise error(caught_exception.msg, caught_exception.pattern, + caught_exception.pos) + + if not source.at_end(): + raise error("unbalanced parenthesis", pattern, source.pos) + + # Check the global flags for conflicts. + version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION + if version not in (0, VERSION0, VERSION1): + raise ValueError("VERSION0 and VERSION1 flags are mutually incompatible") + + if (info.flags & _ALL_ENCODINGS) not in (0, ASCII, LOCALE, UNICODE): + raise ValueError("ASCII, LOCALE and UNICODE flags are mutually incompatible") + + if not (info.flags & _ALL_ENCODINGS): + if isinstance(pattern, unicode): + info.flags |= UNICODE + else: + info.flags |= ASCII + + reverse = bool(info.flags & REVERSE) + fuzzy = isinstance(parsed, _Fuzzy) + + # Remember whether this pattern has an inline locale flag. + _locale_sensitive[locale_key] = info.inline_locale + + # Fix the group references. + caught_exception = None + try: + parsed.fix_groups(pattern, reverse, False) + except error, e: + caught_exception = e + + if caught_exception: + raise error(caught_exception.msg, caught_exception.pattern, + caught_exception.pos) + + # Should we print the parsed pattern? + if flags & DEBUG: + parsed.dump(indent=0, reverse=reverse) + + # Optimise the parsed pattern. + parsed = parsed.optimise(info) + parsed = parsed.pack_characters(info) + + # Get the required string.
+ req_offset, req_chars, req_flags = _get_required_string(parsed, info.flags) + + # Build the named lists. + named_lists = {} + named_list_indexes = [None] * len(info.named_lists_used) + args_needed = set() + for key, index in info.named_lists_used.items(): + name, case_flags = key + values = frozenset(kwargs[name]) + if case_flags: + items = frozenset(_fold_case(info, v) for v in values) + else: + items = values + named_lists[name] = values + named_list_indexes[index] = items + args_needed.add((name, values)) + + # Check the features of the groups. + _check_group_features(info, parsed) + + # Compile the parsed pattern. The result is a list of tuples. + code = parsed.compile(reverse) + + # Is there a group call to the pattern as a whole? + key = (0, reverse, fuzzy) + ref = info.call_refs.get(key) + if ref is not None: + code = [(_OP.CALL_REF, ref)] + code + [(_OP.END, )] + + # Add the final 'success' opcode. + code += [(_OP.SUCCESS, )] + + # Compile the additional copies of the groups that we need. + for group, rev, fuz in info.additional_groups: + code += group.compile(rev, fuz) + + # Flatten the code into a list of ints. + code = _flatten_code(code) + + if not parsed.has_simple_start(): + # Get the first set, if possible. + try: + fs_code = _compile_firstset(info, parsed.get_firstset(reverse)) + fs_code = _flatten_code(fs_code) + code = fs_code + code + except _FirstSetError: + pass + + # The named capture groups. + index_group = dict((v, n) for n, v in info.group_index.items()) + + # Create the PatternObject. + # + # Local flags like IGNORECASE affect the code generation, but aren't needed + # by the PatternObject itself. Conversely, global flags like LOCALE _don't_ + # affect the code generation but _are_ needed by the PatternObject. 
+ compiled_pattern = _regex.compile(pattern, info.flags | version, code, + info.group_index, index_group, named_lists, named_list_indexes, + req_offset, req_chars, req_flags, info.group_count) + + # Do we need to reduce the size of the cache? + if len(_cache) >= _MAXCACHE: + _cache_lock.acquire() + try: + _shrink_cache(_cache, _named_args, _locale_sensitive, _MAXCACHE) + finally: + _cache_lock.release() + + if not debugging: + if (info.flags & LOCALE) == 0: + pattern_locale = None + + args_needed = frozenset(args_needed) + + # Store this regular expression and named list. + pattern_key = (pattern, type(pattern), flags, args_needed, + DEFAULT_VERSION, pattern_locale) + _cache[pattern_key] = compiled_pattern + + # Store what keyword arguments are needed. + _named_args[args_key] = args_needed + + return compiled_pattern + +def _compile_replacement_helper(pattern, template): + "Compiles a replacement template." + # This function is called by the _regex module. + + # Have we seen this before? + key = pattern.pattern, pattern.flags, template + compiled = _replacement_cache.get(key) + if compiled is not None: + return compiled + + if len(_replacement_cache) >= _MAXREPCACHE: + _replacement_cache.clear() + + is_unicode = isinstance(template, unicode) + source = _Source(template) + if is_unicode: + def make_string(char_codes): + return u"".join(unichr(c) for c in char_codes) + else: + def make_string(char_codes): + return "".join(chr(c) for c in char_codes) + + compiled = [] + literal = [] + while True: + ch = source.get() + if not ch: + break + if ch == "\\": + # '_compile_replacement' will return either an int group reference + # or a string literal. It returns items (plural) in order to handle + # a 2-character literal (an invalid escape sequence). + is_group, items = _compile_replacement(source, pattern, is_unicode) + if is_group: + # It's a group, so first flush the literal. 
+ if literal: + compiled.append(make_string(literal)) + literal = [] + compiled.extend(items) + else: + literal.extend(items) + else: + literal.append(ord(ch)) + + # Flush the literal. + if literal: + compiled.append(make_string(literal)) + + _replacement_cache[key] = compiled + + return compiled + +# We define _pattern_type here after all the support objects have been defined. +_pattern_type = type(_compile("", 0, {})) + +# We'll define an alias for the 'compile' function so that the repr of a +# pattern object is eval-able. +Regex = compile + +# Register myself for pickling. +import copy_reg as _copy_reg + +def _pickle(p): + return _compile, (p.pattern, p.flags) + +_copy_reg.pickle(_pattern_type, _pickle, _compile) + +if not hasattr(str, "format"): + # Strings don't have the .format method (below Python 2.6). + while True: + _start = __doc__.find(" subf") + if _start < 0: + break + + _end = __doc__.find("\n", _start) + 1 + while __doc__.startswith(" ", _end): + _end = __doc__.find("\n", _end) + 1 + + __doc__ = __doc__[ : _start] + __doc__[_end : ] + + __all__ = [_name for _name in __all__ if not _name.startswith("subf")] + + del _start, _end + + del subf, subfn diff --git a/lib/regex/_regex.c b/lib/regex/_regex.c new file mode 100644 index 0000000000000000000000000000000000000000..e9602102adfdd6afa4bd6859ec0927a50e2d7ff6 --- /dev/null +++ b/lib/regex/_regex.c @@ -0,0 +1,24497 @@ +/* Secret Labs' Regular Expression Engine + * + * regular expression matching engine + * + * partial history: + * 1999-10-24 fl created (based on existing template matcher code) + * 2000-03-06 fl first alpha, sort of + * 2000-08-01 fl fixes for 1.6b1 + * 2000-08-07 fl use PyOS_CheckStack() if available + * 2000-09-20 fl added expand method + * 2001-03-20 fl lots of fixes for 2.1b2 + * 2001-04-15 fl export copyright as Python attribute, not global + * 2001-04-28 fl added __copy__ methods (work in progress) + * 2001-05-14 fl fixes for 1.5.2 compatibility + * 2001-07-01 fl added BIGCHARSET 
support (from Martin von Loewis) + * 2001-10-18 fl fixed group reset issue (from Matthew Mueller) + * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1 + * 2001-10-21 fl added sub/subn primitive + * 2001-10-24 fl added finditer primitive (for 2.2 only) + * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum) + * 2002-11-09 fl fixed empty sub/subn return type + * 2003-04-18 mvl fully support 4-byte codes + * 2003-10-17 gn implemented non recursive scheme + * 2009-07-26 mrab completely re-designed matcher code + * 2011-11-18 mrab added support for PEP 393 strings + * + * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved. + * + * This version of the SRE library can be redistributed under CNRI's + * Python 1.6 license. For any other use, please contact Secret Labs + * AB (info@pythonware.com). + * + * Portions of this engine have been developed in cooperation with + * CNRI. Hewlett-Packard provided funding for 1.6 integration and + * other compatibility work. + */ + +/* #define VERBOSE */ + +#if defined(VERBOSE) +#define TRACE(X) printf X; +#else +#define TRACE(X) +#endif + +#include "Python.h" +#include "structmember.h" /* offsetof */ +#include <ctype.h> +#include "_regex.h" +#include "pyport.h" +#include "pythread.h" + +#if PY_VERSION_HEX < 0x02060000 +#if SIZEOF_SIZE_T == SIZEOF_LONG_LONG +#define T_PYSSIZET T_LONGLONG +#elif SIZEOF_SIZE_T == SIZEOF_LONG +#define T_PYSSIZET T_LONG +#else +#error size_t is the same size as neither LONG nor LONGLONG +#endif + +#endif +typedef unsigned char Py_UCS1; +typedef unsigned short Py_UCS2; + +typedef RE_UINT32 RE_CODE; + +/* Properties in the General Category. */ +#define RE_PROP_GC_CN ((RE_PROP_GC << 16) | RE_PROP_CN) +#define RE_PROP_GC_LU ((RE_PROP_GC << 16) | RE_PROP_LU) +#define RE_PROP_GC_LL ((RE_PROP_GC << 16) | RE_PROP_LL) +#define RE_PROP_GC_LT ((RE_PROP_GC << 16) | RE_PROP_LT) +#define RE_PROP_GC_P ((RE_PROP_GC << 16) | RE_PROP_P) + +/* Unlimited repeat count. 
*/ +#define RE_UNLIMITED (~(RE_CODE)0) + +/* The status of a . */ +typedef RE_UINT32 RE_STATUS_T; + +/* Whether to match concurrently, i.e. release the GIL while matching. */ +#define RE_CONC_NO 0 +#define RE_CONC_YES 1 +#define RE_CONC_DEFAULT 2 + +/* The side that could truncate in a partial match. + * + * The values RE_PARTIAL_LEFT and RE_PARTIAL_RIGHT are also used as array + * indexes, so they need to be 0 and 1. + */ +#define RE_PARTIAL_NONE -1 +#define RE_PARTIAL_LEFT 0 +#define RE_PARTIAL_RIGHT 1 + +/* Flags for the kind of 'sub' call: 'sub', 'subn', 'subf', 'subfn'. */ +#define RE_SUB 0x0 +#define RE_SUBN 0x1 +#if PY_VERSION_HEX >= 0x02060000 +#define RE_SUBF 0x2 +#endif + +/* The name of this module, minus the leading underscore. */ +#define RE_MODULE "regex" + +/* Error codes. */ +#define RE_ERROR_SUCCESS 1 /* Successful match. */ +#define RE_ERROR_FAILURE 0 /* Unsuccessful match. */ +#define RE_ERROR_ILLEGAL -1 /* Illegal code. */ +#define RE_ERROR_INTERNAL -2 /* Internal error. */ +#define RE_ERROR_CONCURRENT -3 /* "concurrent" invalid. */ +#define RE_ERROR_MEMORY -4 /* Out of memory. */ +#define RE_ERROR_INTERRUPTED -5 /* Signal handler raised exception. */ +#define RE_ERROR_REPLACEMENT -6 /* Invalid replacement string. */ +#define RE_ERROR_INVALID_GROUP_REF -7 /* Invalid group reference. */ +#define RE_ERROR_GROUP_INDEX_TYPE -8 /* Group index type error. */ +#define RE_ERROR_NO_SUCH_GROUP -9 /* No such group. */ +#define RE_ERROR_INDEX -10 /* String index. */ +#define RE_ERROR_BACKTRACKING -11 /* Too much backtracking. */ +#define RE_ERROR_NOT_STRING -12 /* Not a string. */ +#define RE_ERROR_NOT_UNICODE -13 /* Not a Unicode string. */ +#define RE_ERROR_PARTIAL -15 /* Partial match. */ + +/* The number of backtrack entries per allocated block. */ +#define RE_BACKTRACK_BLOCK_SIZE 64 + +/* The maximum number of backtrack entries to allocate. */ +#define RE_MAX_BACKTRACK_ALLOC (1024 * 1024) + +/* The number of atomic entries per allocated block. 
*/ +#define RE_ATOMIC_BLOCK_SIZE 64 + +/* The initial maximum capacity of the guard block. */ +#define RE_INIT_GUARDS_BLOCK_SIZE 16 + +/* The initial maximum capacity of the node list. */ +#define RE_INIT_NODE_LIST_SIZE 16 + +/* The size increment for various allocation lists. */ +#define RE_LIST_SIZE_INC 16 + +/* The initial maximum capacity of the capture groups. */ +#define RE_INIT_CAPTURE_SIZE 16 + +/* Node bitflags. */ +#define RE_POSITIVE_OP 0x1 +#define RE_ZEROWIDTH_OP 0x2 +#define RE_FUZZY_OP 0x4 +#define RE_REVERSE_OP 0x8 +#define RE_REQUIRED_OP 0x10 + +/* Guards against further matching can occur at the start of the body and the + * tail of a repeat containing a repeat. + */ +#define RE_STATUS_BODY 0x1 +#define RE_STATUS_TAIL 0x2 + +/* Whether a guard is added depends on whether there's a repeat in the body of + * the repeat or a group reference in the body or tail of the repeat. + */ +#define RE_STATUS_NEITHER 0x0 +#define RE_STATUS_REPEAT 0x4 +#define RE_STATUS_LIMITED 0x8 +#define RE_STATUS_REF 0x10 +#define RE_STATUS_VISITED_AG 0x20 +#define RE_STATUS_VISITED_REP 0x40 + +/* Whether a string node has been initialised for fast searching. */ +#define RE_STATUS_FAST_INIT 0x80 + +/* Whether a node is being used. (Additional nodes may be created while the + * pattern is being built.) + */ +#define RE_STATUS_USED 0x100 + +/* Whether a node is a string node. */ +#define RE_STATUS_STRING 0x200 + +/* Whether a repeat node is within another repeat. */ +#define RE_STATUS_INNER 0x400 + +/* Various flags stored in a node status member. */ +#define RE_STATUS_SHIFT 11 + +#define RE_STATUS_FUZZY (RE_FUZZY_OP << RE_STATUS_SHIFT) +#define RE_STATUS_REVERSE (RE_REVERSE_OP << RE_STATUS_SHIFT) +#define RE_STATUS_REQUIRED (RE_REQUIRED_OP << RE_STATUS_SHIFT) +#define RE_STATUS_HAS_GROUPS 0x10000 +#define RE_STATUS_HAS_REPEATS 0x20000 + +/* The different error types for fuzzy matching.
*/ +#define RE_FUZZY_SUB 0 +#define RE_FUZZY_INS 1 +#define RE_FUZZY_DEL 2 +#define RE_FUZZY_ERR 3 +#define RE_FUZZY_COUNT 3 + +/* The various values in a FUZZY node. */ +#define RE_FUZZY_VAL_MAX_BASE 1 +#define RE_FUZZY_VAL_MAX_SUB (RE_FUZZY_VAL_MAX_BASE + RE_FUZZY_SUB) +#define RE_FUZZY_VAL_MAX_INS (RE_FUZZY_VAL_MAX_BASE + RE_FUZZY_INS) +#define RE_FUZZY_VAL_MAX_DEL (RE_FUZZY_VAL_MAX_BASE + RE_FUZZY_DEL) +#define RE_FUZZY_VAL_MAX_ERR (RE_FUZZY_VAL_MAX_BASE + RE_FUZZY_ERR) + +#define RE_FUZZY_VAL_COST_BASE 5 +#define RE_FUZZY_VAL_SUB_COST (RE_FUZZY_VAL_COST_BASE + RE_FUZZY_SUB) +#define RE_FUZZY_VAL_INS_COST (RE_FUZZY_VAL_COST_BASE + RE_FUZZY_INS) +#define RE_FUZZY_VAL_DEL_COST (RE_FUZZY_VAL_COST_BASE + RE_FUZZY_DEL) +#define RE_FUZZY_VAL_MAX_COST (RE_FUZZY_VAL_COST_BASE + RE_FUZZY_ERR) + +/* The various values in an END_FUZZY node. */ +#define RE_FUZZY_VAL_MIN_BASE 1 +#define RE_FUZZY_VAL_MIN_SUB (RE_FUZZY_VAL_MIN_BASE + RE_FUZZY_SUB) +#define RE_FUZZY_VAL_MIN_INS (RE_FUZZY_VAL_MIN_BASE + RE_FUZZY_INS) +#define RE_FUZZY_VAL_MIN_DEL (RE_FUZZY_VAL_MIN_BASE + RE_FUZZY_DEL) +#define RE_FUZZY_VAL_MIN_ERR (RE_FUZZY_VAL_MIN_BASE + RE_FUZZY_ERR) + +/* The maximum number of errors when trying to improve a fuzzy match. */ +#define RE_MAX_ERRORS 10 + +/* The flags which will be set for full Unicode case folding. */ +#define RE_FULL_CASE_FOLDING (RE_FLAG_UNICODE | RE_FLAG_FULLCASE | RE_FLAG_IGNORECASE) + +/* The shortest string prefix for which we'll use a fast string search. */ +#define RE_MIN_FAST_LENGTH 5 + +static char copyright[] = + " RE 2.3.0 Copyright (c) 1997-2002 by Secret Labs AB "; + +/* The exception to raise on error. */ +static PyObject* error_exception; + +/* The dictionary of Unicode properties. */ +static PyObject* property_dict; + +typedef struct RE_State* RE_StatePtr; + +/* Bit-flags for the common character properties supported by locale-sensitive + * matching. 
+ */
+#define RE_LOCALE_ALNUM 0x001
+#define RE_LOCALE_ALPHA 0x002
+#define RE_LOCALE_CNTRL 0x004
+#define RE_LOCALE_DIGIT 0x008
+#define RE_LOCALE_GRAPH 0x010
+#define RE_LOCALE_LOWER 0x020
+#define RE_LOCALE_PRINT 0x040
+#define RE_LOCALE_PUNCT 0x080
+#define RE_LOCALE_SPACE 0x100
+#define RE_LOCALE_UPPER 0x200
+
+/* Info about the current locale.
+ *
+ * Used by patterns that are locale-sensitive.
+ *
+ * Each table has 0x100 entries and is indexed by character code.
+ */
+typedef struct RE_LocaleInfo {
+    unsigned short properties[0x100]; /* RE_LOCALE_* bit-flags per character. */
+    unsigned char uppercase[0x100]; /* Uppercase form per character. */
+    unsigned char lowercase[0x100]; /* Lowercase form per character. */
+} RE_LocaleInfo;
+
+/* Handlers for ASCII, locale and Unicode.
+ *
+ * The tables of this type are filled in positionally, so the order of the
+ * members here must match the order of the entries in each initialiser.
+ */
+typedef struct RE_EncodingTable {
+    BOOL (*has_property)(RE_LocaleInfo* locale_info, RE_CODE property, Py_UCS4
+      ch);
+    BOOL (*at_boundary)(RE_StatePtr state, Py_ssize_t text_pos);
+    BOOL (*at_word_start)(RE_StatePtr state, Py_ssize_t text_pos);
+    BOOL (*at_word_end)(RE_StatePtr state, Py_ssize_t text_pos);
+    BOOL (*at_default_boundary)(RE_StatePtr state, Py_ssize_t text_pos);
+    BOOL (*at_default_word_start)(RE_StatePtr state, Py_ssize_t text_pos);
+    BOOL (*at_default_word_end)(RE_StatePtr state, Py_ssize_t text_pos);
+    BOOL (*at_grapheme_boundary)(RE_StatePtr state, Py_ssize_t text_pos);
+    BOOL (*is_line_sep)(Py_UCS4 ch);
+    BOOL (*at_line_start)(RE_StatePtr state, Py_ssize_t text_pos);
+    BOOL (*at_line_end)(RE_StatePtr state, Py_ssize_t text_pos);
+    BOOL (*possible_turkic)(RE_LocaleInfo* locale_info, Py_UCS4 ch);
+    int (*all_cases)(RE_LocaleInfo* locale_info, Py_UCS4 ch, Py_UCS4*
+      codepoints);
+    Py_UCS4 (*simple_case_fold)(RE_LocaleInfo* locale_info, Py_UCS4 ch);
+    int (*full_case_fold)(RE_LocaleInfo* locale_info, Py_UCS4 ch, Py_UCS4*
+      folded);
+    int (*all_turkic_i)(RE_LocaleInfo* locale_info, Py_UCS4 ch, Py_UCS4*
+      cases);
+} RE_EncodingTable;
+
+/* Position within the regex and text. */
+typedef struct RE_Position {
+    struct RE_Node* node; /* The position within the regex. */
+    Py_ssize_t text_pos; /* The position within the text. */
+} RE_Position;
+
+/* Info about fuzzy matching. 
*/
+typedef struct RE_FuzzyInfo {
+    struct RE_Node* node;
+    size_t counts[RE_FUZZY_COUNT + 1]; /* Add 1 for total errors. */
+    size_t total_cost;
+} RE_FuzzyInfo;
+
+/* Storage for backtrack data.
+ *
+ * The anonymous union holds one set of fields per kind of backtrack entry;
+ * 'op' presumably records which kind an entry is (confirm against the code
+ * which pushes the entries).
+ */
+typedef struct RE_BacktrackData {
+    union {
+        struct {
+            size_t capture_change;
+            BOOL too_few_errors;
+        } atomic;
+        struct {
+            RE_Position position;
+        } branch;
+        struct {
+            RE_FuzzyInfo fuzzy_info;
+            Py_ssize_t text_pos;
+            RE_CODE index;
+        } fuzzy;
+        struct {
+            RE_Position position;
+            size_t count;
+            struct RE_Node* fuzzy_node;
+            BOOL too_few_errors;
+        } fuzzy_insert;
+        struct {
+            RE_Position position;
+            RE_INT8 fuzzy_type;
+            RE_INT8 step;
+        } fuzzy_item;
+        struct {
+            RE_Position position;
+            Py_ssize_t string_pos;
+            RE_INT8 fuzzy_type;
+            RE_INT8 folded_pos;
+            RE_INT8 folded_len;
+            RE_INT8 gfolded_pos;
+            RE_INT8 gfolded_len;
+            RE_INT8 step;
+        } fuzzy_string;
+        struct {
+            Py_ssize_t text_pos;
+            Py_ssize_t current_capture;
+            RE_CODE private_index;
+            RE_CODE public_index;
+            BOOL capture;
+        } group;
+        struct {
+            struct RE_Node* node;
+            size_t capture_change;
+        } group_call;
+        struct {
+            Py_ssize_t match_pos;
+        } keep;
+        struct {
+            struct RE_Node* node;
+            size_t capture_change;
+            BOOL too_few_errors;
+            BOOL inside;
+        } lookaround;
+        struct {
+            RE_Position position;
+            Py_ssize_t text_pos;
+            size_t count;
+            Py_ssize_t start;
+            size_t capture_change;
+            RE_CODE index;
+        } repeat;
+    };
+    RE_UINT8 op;
+} RE_BacktrackData;
+
+/* Storage for backtrack data is allocated in blocks for speed. The blocks are
+ * linked in both directions through 'previous' and 'next'.
+ */
+typedef struct RE_BacktrackBlock {
+    RE_BacktrackData items[RE_BACKTRACK_BLOCK_SIZE];
+    struct RE_BacktrackBlock* previous;
+    struct RE_BacktrackBlock* next;
+    size_t capacity;
+    size_t count;
+} RE_BacktrackBlock;
+
+/* Storage for atomic data. 
*/
+typedef struct RE_AtomicData {
+    RE_BacktrackBlock* current_backtrack_block;
+    size_t backtrack_count;
+    struct RE_Node* node;
+    RE_BacktrackData* backtrack;
+    struct RE_SavedGroups* saved_groups;
+    struct RE_SavedRepeats* saved_repeats;
+    Py_ssize_t slice_start;
+    Py_ssize_t slice_end;
+    Py_ssize_t text_pos;
+    BOOL is_lookaround;
+    BOOL has_groups;
+    BOOL has_repeats;
+} RE_AtomicData;
+
+/* Storage for atomic data is allocated in blocks for speed. The blocks are
+ * linked in both directions through 'previous' and 'next'.
+ */
+typedef struct RE_AtomicBlock {
+    RE_AtomicData items[RE_ATOMIC_BLOCK_SIZE];
+    struct RE_AtomicBlock* previous;
+    struct RE_AtomicBlock* next;
+    size_t capacity;
+    size_t count;
+} RE_AtomicBlock;
+
+/* Storage for saved groups. */
+typedef struct RE_SavedGroups {
+    struct RE_SavedGroups* previous;
+    struct RE_SavedGroups* next;
+    struct RE_GroupSpan* spans;
+    size_t* counts;
+} RE_SavedGroups;
+
+/* Storage for info saved around a recursive call (presumably of
+ * 'basic_match'; the original wording here was garbled - confirm).
+ */
+typedef struct RE_Info {
+    RE_BacktrackBlock* current_backtrack_block;
+    size_t backtrack_count;
+    RE_SavedGroups* current_saved_groups;
+    struct RE_GroupCallFrame* current_group_call_frame;
+    BOOL must_advance;
+} RE_Info;
+
+/* Storage for the next node. */
+typedef struct RE_NextNode {
+    struct RE_Node* node;
+    struct RE_Node* test;
+    struct RE_Node* match_next;
+    Py_ssize_t match_step;
+} RE_NextNode;
+
+/* A pattern node.
+ *
+ * The anonymous union is read as 'string' only when the node's status has
+ * RE_STATUS_STRING set; otherwise it is read as 'nonstring'.
+ */
+typedef struct RE_Node {
+    RE_NextNode next_1;
+    union {
+        struct {
+            RE_NextNode next_2;
+        } nonstring;
+        struct {
+            /* Used only if (node->status & RE_STATUS_STRING) is true. */
+            Py_ssize_t* bad_character_offset;
+            Py_ssize_t* good_suffix_offset;
+        } string;
+    };
+    Py_ssize_t step;
+    size_t value_count; /* Presumably the number of entries in 'values'. */
+    RE_CODE* values;
+    RE_STATUS_T status; /* RE_STATUS_* flags. */
+    RE_UINT8 op;
+    BOOL match;
+} RE_Node;
+
+/* Info about a group's span. */
+typedef struct RE_GroupSpan {
+    Py_ssize_t start;
+    Py_ssize_t end;
+} RE_GroupSpan;
+
+/* Span of a guard (inclusive range). 
*/
+typedef struct RE_GuardSpan {
+    Py_ssize_t low; /* The first position of the span. */
+    Py_ssize_t high; /* The last position of the span (inclusive). */
+    BOOL protect;
+} RE_GuardSpan;
+
+/* Spans guarded against further matching. */
+typedef struct RE_GuardList {
+    size_t capacity;
+    size_t count;
+    RE_GuardSpan* spans;
+    Py_ssize_t last_text_pos;
+    size_t last_low;
+} RE_GuardList;
+
+/* Info about a group. */
+typedef struct RE_GroupData {
+    RE_GroupSpan span;
+    size_t capture_count;
+    size_t capture_capacity;
+    Py_ssize_t current_capture;
+    RE_GroupSpan* captures;
+} RE_GroupData;
+
+/* Info about a repeat (the data used while matching). */
+typedef struct RE_RepeatData {
+    RE_GuardList body_guard_list;
+    RE_GuardList tail_guard_list;
+    size_t count;
+    Py_ssize_t start;
+    size_t capture_change;
+} RE_RepeatData;
+
+/* Storage for saved repeats. */
+typedef struct RE_SavedRepeats {
+    struct RE_SavedRepeats* previous;
+    struct RE_SavedRepeats* next;
+    RE_RepeatData* repeats;
+} RE_SavedRepeats;
+
+/* Guards for fuzzy sections. */
+typedef struct RE_FuzzyGuards {
+    RE_GuardList body_guard_list;
+    RE_GuardList tail_guard_list;
+} RE_FuzzyGuards;
+
+/* Info about a capture group. */
+typedef struct RE_GroupInfo {
+    Py_ssize_t end_index;
+    RE_Node* node;
+    BOOL referenced;
+    BOOL has_name;
+} RE_GroupInfo;
+
+/* Info about a call_ref. */
+typedef struct RE_CallRefInfo {
+    RE_Node* node;
+    BOOL defined;
+    BOOL used;
+} RE_CallRefInfo;
+
+/* Info about a repeat (the status flags stored in the pattern). */
+typedef struct RE_RepeatInfo {
+    RE_STATUS_T status;
+} RE_RepeatInfo;
+
+/* Stack frame for a group call. */
+typedef struct RE_GroupCallFrame {
+    struct RE_GroupCallFrame* previous;
+    struct RE_GroupCallFrame* next;
+    RE_Node* node;
+    RE_GroupData* groups;
+    RE_RepeatData* repeats;
+} RE_GroupCallFrame;
+
+/* Info about a string argument. */
+typedef struct RE_StringInfo {
+#if PY_VERSION_HEX >= 0x02060000
+    Py_buffer view; /* View of the string if it's a buffer object. */
+#endif
+    void* characters; /* Pointer to the characters of the string. */
+    Py_ssize_t length; /* Length of the string. */
+    Py_ssize_t charsize; /* Size of the characters in the string. */
+    BOOL is_unicode; /* Whether the string is Unicode. */
+    BOOL should_release; /* Whether the buffer should be released. */
+} RE_StringInfo;
+
+/* Info about where the next match was found, starting from a certain search
+ * position. This is used when a pattern starts with a BRANCH.
+ */
+#define MAX_SEARCH_POSITIONS 7
+
+/* Info about a search position. */
+typedef struct {
+    Py_ssize_t start_pos;
+    Py_ssize_t match_pos;
+} RE_SearchPosition;
+
+/* The state object used during matching. */
+typedef struct RE_State {
+    struct PatternObject* pattern; /* Parent PatternObject. */
+    /* Info about the string being matched. */
+    PyObject* string;
+#if PY_VERSION_HEX >= 0x02060000
+    Py_buffer view; /* View of the string if it's a buffer object. */
+#endif
+    Py_ssize_t charsize;
+    void* text;
+    Py_ssize_t text_length;
+    /* The slice of the string being searched. */
+    Py_ssize_t slice_start;
+    Py_ssize_t slice_end;
+    /* Info about the capture groups. */
+    RE_GroupData* groups;
+    Py_ssize_t lastindex;
+    Py_ssize_t lastgroup;
+    /* Info about the repeats. */
+    RE_RepeatData* repeats;
+    Py_ssize_t search_anchor; /* Where the last match finished. */
+    Py_ssize_t match_pos; /* The start position of the match. */
+    Py_ssize_t text_pos; /* The current position of the match. */
+    Py_ssize_t final_newline; /* The index of newline at end of string, or -1. */
+    Py_ssize_t final_line_sep; /* The index of line separator at end of string, or -1. */
+    /* Storage for backtrack info. */
+    RE_BacktrackBlock backtrack_block;
+    RE_BacktrackBlock* current_backtrack_block;
+    Py_ssize_t backtrack_allocated;
+    RE_BacktrackData* backtrack;
+    RE_AtomicBlock* current_atomic_block;
+    /* Storage for saved capture groups. */
+    RE_SavedGroups* first_saved_groups;
+    RE_SavedGroups* current_saved_groups;
+    RE_SavedRepeats* first_saved_repeats;
+    RE_SavedRepeats* current_saved_repeats;
+    /* Info about the best POSIX match (leftmost longest). */
+    Py_ssize_t best_match_pos;
+    Py_ssize_t best_text_pos;
+    RE_GroupData* best_match_groups;
+    /* Miscellaneous. */
+    Py_ssize_t min_width; /* The minimum width of the string to match (assuming it's not a fuzzy pattern). */
+    RE_EncodingTable* encoding; /* The 'encoding' of the string being searched. */
+    RE_LocaleInfo* locale_info; /* Info about the locale, if needed. */
+    Py_UCS4 (*char_at)(void* text, Py_ssize_t pos); /* Gets the character at a position of 'text'. */
+    void (*set_char_at)(void* text, Py_ssize_t pos, Py_UCS4 ch); /* Sets the character at a position of 'text'. */
+    void* (*point_to)(void* text, Py_ssize_t pos); /* Gets a pointer to a position of 'text'. */
+    PyThread_type_lock lock; /* A lock for accessing the state across threads. */
+    RE_FuzzyInfo fuzzy_info; /* Info about fuzzy matching. */
+    size_t total_fuzzy_counts[RE_FUZZY_COUNT]; /* Totals for fuzzy matching. */
+    size_t best_fuzzy_counts[RE_FUZZY_COUNT]; /* Best totals for fuzzy matching. */
+    RE_FuzzyGuards* fuzzy_guards; /* The guards for a fuzzy match. */
+    size_t total_errors; /* The total number of errors of a fuzzy match. */
+    size_t max_errors; /* The maximum permitted number of errors. */
+    size_t fewest_errors; /* The fewest errors so far of an enhanced fuzzy match. */
+    /* The group call stack. */
+    RE_GroupCallFrame* first_group_call_frame;
+    RE_GroupCallFrame* current_group_call_frame;
+    RE_GuardList* group_call_guard_list;
+    RE_SearchPosition search_positions[MAX_SEARCH_POSITIONS]; /* Where the search matches next. */
+    size_t capture_change; /* Incremented every time a capture group changes. */
+    Py_ssize_t req_pos; /* The position where the required string matched. */
+    Py_ssize_t req_end; /* The end position where the required string matched. */
+    int partial_side; /* The side that could truncate in a partial match. */
+    RE_UINT16 iterations; /* The number of iterations the matching engine has performed since checking for KeyboardInterrupt. */
+    BOOL is_unicode; /* Whether the string to be matched is Unicode. */
+    BOOL should_release; /* Whether the buffer should be released. */
+    BOOL overlapped; /* Whether the matches can be overlapped. */
+    BOOL reverse; /* Whether it's a reverse pattern. */
+    BOOL visible_captures; /* Whether the 'captures' method will be visible. */
+    BOOL version_0; /* Whether to perform version_0 behaviour (same as re module). */
+    BOOL must_advance; /* Whether the end of the match must advance past its start. */
+    BOOL is_multithreaded; /* Whether to release the GIL while matching. */
+    BOOL too_few_errors; /* Whether there were too few fuzzy errors. */
+    BOOL match_all; /* Whether to match all of the string ('fullmatch'). */
+    BOOL found_match; /* Whether a POSIX match has been found. */
+} RE_State;
+
+/* Storage for the regex state and thread state.
+ *
+ * Scanner objects can sometimes be shared across threads, which means that
+ * their RE_State structs are also shared. This isn't safe when the GIL is
+ * released, so in such instances we have a lock (mutex) in the RE_State struct
+ * to protect it during matching. We also need a thread-safe place to store the
+ * thread state when releasing the GIL.
+ */
+typedef struct RE_SafeState {
+    RE_State* re_state;
+    PyThreadState* thread_state;
+} RE_SafeState;
+
+/* The PatternObject created from a regular expression. */
+typedef struct PatternObject {
+    PyObject_HEAD
+    PyObject* pattern; /* Pattern source (or None). */
+    Py_ssize_t flags; /* Flags used when compiling pattern source. */
+    PyObject* weakreflist; /* List of weak references. */
+    /* Nodes into which the regular expression is compiled. */
+    RE_Node* start_node;
+    RE_Node* start_test;
+    size_t true_group_count; /* The true number of capture groups. */
+    size_t public_group_count; /* The number of public capture groups. */
+    size_t repeat_count; /* The number of repeats. */
+    Py_ssize_t group_end_index; /* The number of group closures. */
+    PyObject* groupindex;
+    PyObject* indexgroup;
+    PyObject* named_lists;
+    size_t named_lists_count;
+    PyObject** partial_named_lists[2];
+    PyObject* named_list_indexes;
+    /* Storage for the pattern nodes. */
+    size_t node_capacity;
+    size_t node_count;
+    RE_Node** node_list;
+    /* Info about the capture groups. */
+    size_t group_info_capacity;
+    RE_GroupInfo* group_info;
+    /* Info about the call_refs. */
+    size_t call_ref_info_capacity;
+    size_t call_ref_info_count;
+    RE_CallRefInfo* call_ref_info;
+    Py_ssize_t pattern_call_ref;
+    /* Info about the repeats. */
+    size_t repeat_info_capacity;
+    RE_RepeatInfo* repeat_info;
+    Py_ssize_t min_width; /* The minimum width of the string to match (assuming it isn't a fuzzy pattern). */
+    RE_EncodingTable* encoding; /* Encoding handlers. */
+    RE_LocaleInfo* locale_info; /* Info about the locale, if needed. */
+    RE_GroupData* groups_storage;
+    RE_RepeatData* repeats_storage;
+    size_t fuzzy_count; /* The number of fuzzy sections. */
+    Py_ssize_t req_offset; /* The offset to the required string. */
+    RE_Node* req_string; /* The required string. */
+    BOOL is_fuzzy; /* Whether it's a fuzzy pattern. */
+    BOOL do_search_start; /* Whether to do an initial search. */
+    BOOL recursive; /* Whether the entire pattern is recursive. */
+} PatternObject;
+
+/* The MatchObject created when a match is found. */
+typedef struct MatchObject {
+    PyObject_HEAD
+    PyObject* string; /* Link to the target string or NULL if detached. */
+    PyObject* substring; /* Link to (a substring of) the target string. */
+    Py_ssize_t substring_offset; /* Offset into the target string. */
+    PatternObject* pattern; /* Link to the regex (pattern) object. */
+    Py_ssize_t pos; /* Start of current slice. */
+    Py_ssize_t endpos; /* End of current slice. */
+    Py_ssize_t match_start; /* Start of matched slice. */
+    Py_ssize_t match_end; /* End of matched slice. */
+    Py_ssize_t lastindex; /* Last group seen by the engine (-1 if none). */
+    Py_ssize_t lastgroup; /* Last named group seen by the engine (-1 if none). */
+    size_t group_count; /* The number of groups. */
+    RE_GroupData* groups; /* The capture groups. */
+    PyObject* regs;
+    size_t fuzzy_counts[RE_FUZZY_COUNT];
+    BOOL partial; /* Whether it's a partial match. */
+} MatchObject;
+
+/* The ScannerObject. It embeds its RE_State by value. */
+typedef struct ScannerObject {
+    PyObject_HEAD
+    PatternObject* pattern;
+    RE_State state;
+    int status;
+} ScannerObject;
+
+/* The SplitterObject. It embeds its RE_State by value. */
+typedef struct SplitterObject {
+    PyObject_HEAD
+    PatternObject* pattern;
+    RE_State state;
+    Py_ssize_t maxsplit;
+    Py_ssize_t last_pos;
+    Py_ssize_t split_count;
+    Py_ssize_t index;
+    int status;
+} SplitterObject;
+#if PY_VERSION_HEX >= 0x02060000
+
+/* The CaptureObject. */
+typedef struct CaptureObject {
+    PyObject_HEAD
+    Py_ssize_t group_index;
+    MatchObject** match_indirect;
+} CaptureObject;
+#endif
+
+/* Info used when compiling a pattern to nodes. */
+typedef struct RE_CompileArgs {
+    RE_CODE* code; /* The start of the compiled pattern. */
+    RE_CODE* end_code; /* The end of the compiled pattern. */
+    PatternObject* pattern; /* The pattern object. */
+    Py_ssize_t min_width; /* The minimum width of the string to match (assuming it isn't a fuzzy pattern). */
+    RE_Node* start; /* The start node. */
+    RE_Node* end; /* The end node. */
+    size_t repeat_depth; /* The nesting depth of the repeat. */
+    BOOL forward; /* Whether it's a forward (not reverse) pattern. */
+    BOOL visible_captures; /* Whether all of the captures will be visible. */
+    BOOL has_captures; /* Whether the pattern has capture groups. */
+    BOOL is_fuzzy; /* Whether the pattern (or some part of it) is fuzzy. */
+    BOOL within_fuzzy; /* Whether the subpattern is within a fuzzy section. */
+    BOOL has_groups; /* Whether the subpattern contains captures. */
+    BOOL has_repeats; /* Whether the subpattern contains repeats. */
+} RE_CompileArgs;
+
+/* The string slices which will be concatenated to make the result string of
+ * the 'sub' method.
+ *
+ * This allows us to avoid creating a list of slices if there are fewer than 2
+ * of them. Empty strings aren't recorded, so if 'list' and 'item' are both
+ * NULL then the result is an empty string.
+ */
+typedef struct JoinInfo {
+    PyObject* list; /* The list of slices if there are more than 2 of them. */
+    PyObject* item; /* The slice if there is only 1 of them. */
+    BOOL reversed; /* Whether the slices have been found in reverse order. */
+    BOOL is_unicode; /* Whether the string is Unicode. */
+} JoinInfo;
+
+/* Info about fuzzy matching. */
+typedef struct {
+    RE_Node* new_node;
+    Py_ssize_t new_text_pos;
+    Py_ssize_t limit;
+    Py_ssize_t new_string_pos;
+    int step;
+    int new_folded_pos;
+    int folded_len;
+    int new_gfolded_pos;
+    int new_group_pos;
+    int fuzzy_type;
+    BOOL permit_insertion;
+} RE_FuzzyData;
+
+typedef struct RE_BestEntry { /* A pair of a match position and a text position. */
+    Py_ssize_t match_pos;
+    Py_ssize_t text_pos;
+} RE_BestEntry;
+
+typedef struct RE_BestList { /* A list of RE_BestEntry; presumably 'count' of 'capacity' entries are in use. */
+    size_t capacity;
+    size_t count;
+    RE_BestEntry* entries;
+} RE_BestList;
+
+/* Function types for getting info from a MatchObject. */
+typedef PyObject* (*RE_GetByIndexFunc)(MatchObject* self, Py_ssize_t index);
+
+/* Returns the magnitude of a 'Py_ssize_t' value.
+ *
+ * Note that the most negative value has no positive counterpart, so negating
+ * it overflows.
+ */
+Py_LOCAL_INLINE(Py_ssize_t) abs_ssize_t(Py_ssize_t x) {
+    return x >= 0 ? x : -x;
+}
+
+/* Returns the minimum of 2 'Py_ssize_t' values. */
+Py_LOCAL_INLINE(Py_ssize_t) min_ssize_t(Py_ssize_t x, Py_ssize_t y) {
+    return x <= y ? x : y;
+}
+
+/* Returns the maximum of 2 'Py_ssize_t' values. */
+Py_LOCAL_INLINE(Py_ssize_t) max_ssize_t(Py_ssize_t x, Py_ssize_t y) {
+    return x >= y ? x : y;
+}
+
+/* Returns the minimum of 2 'size_t' values. */
+Py_LOCAL_INLINE(size_t) min_size_t(size_t x, size_t y) {
+    return x <= y ? x : y;
+}
+
+/* Returns the maximum of 2 'size_t' values. 
*/ +Py_LOCAL_INLINE(size_t) max_size_t(size_t x, size_t y) { + return x >= y ? x : y; +} + +/* Returns the 'maximum' of 2 RE_STATUS_T values. */ +Py_LOCAL_INLINE(RE_STATUS_T) max_status_2(RE_STATUS_T x, RE_STATUS_T y) { + return x >= y ? x : y; +} + +/* Returns the 'maximum' of 3 RE_STATUS_T values. */ +Py_LOCAL_INLINE(RE_STATUS_T) max_status_3(RE_STATUS_T x, RE_STATUS_T y, + RE_STATUS_T z) { + return max_status_2(x, max_status_2(y, z)); +} + +/* Returns the 'maximum' of 4 RE_STATUS_T values. */ +Py_LOCAL_INLINE(RE_STATUS_T) max_status_4(RE_STATUS_T w, RE_STATUS_T x, + RE_STATUS_T y, RE_STATUS_T z) { + return max_status_2(max_status_2(w, x), max_status_2(y, z)); +} + +/* Gets a character at a position assuming 1 byte per character. */ +static Py_UCS4 bytes1_char_at(void* text, Py_ssize_t pos) { + return *((Py_UCS1*)text + pos); +} + +/* Sets a character at a position assuming 1 byte per character. */ +static void bytes1_set_char_at(void* text, Py_ssize_t pos, Py_UCS4 ch) { + *((Py_UCS1*)text + pos) = (Py_UCS1)ch; +} + +/* Gets a pointer to a position assuming 1 byte per character. */ +static void* bytes1_point_to(void* text, Py_ssize_t pos) { + return (Py_UCS1*)text + pos; +} + +/* Gets a character at a position assuming 2 bytes per character. */ +static Py_UCS4 bytes2_char_at(void* text, Py_ssize_t pos) { + return *((Py_UCS2*)text + pos); +} + +/* Sets a character at a position assuming 2 bytes per character. */ +static void bytes2_set_char_at(void* text, Py_ssize_t pos, Py_UCS4 ch) { + *((Py_UCS2*)text + pos) = (Py_UCS2)ch; +} + +/* Gets a pointer to a position assuming 2 bytes per character. */ +static void* bytes2_point_to(void* text, Py_ssize_t pos) { + return (Py_UCS2*)text + pos; +} + +/* Gets a character at a position assuming 4 bytes per character. */ +static Py_UCS4 bytes4_char_at(void* text, Py_ssize_t pos) { + return *((Py_UCS4*)text + pos); +} + +/* Sets a character at a position assuming 4 bytes per character. 
*/ +static void bytes4_set_char_at(void* text, Py_ssize_t pos, Py_UCS4 ch) { + *((Py_UCS4*)text + pos) = (Py_UCS4)ch; +} + +/* Gets a pointer to a position assuming 4 bytes per character. */ +static void* bytes4_point_to(void* text, Py_ssize_t pos) { + return (Py_UCS4*)text + pos; +} + +/* Default for whether a position is on a word boundary. */ +static BOOL at_boundary_always(RE_State* state, Py_ssize_t text_pos) { + return TRUE; +} + +/* Converts a BOOL to success/failure. */ +Py_LOCAL_INLINE(int) bool_as_status(BOOL value) { + return value ? RE_ERROR_SUCCESS : RE_ERROR_FAILURE; +} + +/* ASCII-specific. */ + +Py_LOCAL_INLINE(BOOL) unicode_has_property(RE_CODE property, Py_UCS4 ch); + +/* Checks whether a character has a property. */ +Py_LOCAL_INLINE(BOOL) ascii_has_property(RE_CODE property, Py_UCS4 ch) { + if (ch > RE_ASCII_MAX) { + /* Outside the ASCII range. */ + RE_UINT32 value; + + value = property & 0xFFFF; + + return value == 0; + } + + return unicode_has_property(property, ch); +} + +/* Wrapper for calling 'ascii_has_property' via a pointer. */ +static BOOL ascii_has_property_wrapper(RE_LocaleInfo* locale_info, RE_CODE + property, Py_UCS4 ch) { + return ascii_has_property(property, ch); +} + +/* Checks whether there's a word character to the left. */ +Py_LOCAL_INLINE(BOOL) ascii_word_left(RE_State* state, Py_ssize_t text_pos) { + return text_pos > 0 && ascii_has_property(RE_PROP_WORD, + state->char_at(state->text, text_pos - 1)); +} + +/* Checks whether there's a word character to the right. */ +Py_LOCAL_INLINE(BOOL) ascii_word_right(RE_State* state, Py_ssize_t text_pos) { + return text_pos < state->text_length && ascii_has_property(RE_PROP_WORD, + state->char_at(state->text, text_pos)); +} + +/* Checks whether a position is on a word boundary. 
*/ +static BOOL ascii_at_boundary(RE_State* state, Py_ssize_t text_pos) { + BOOL left; + BOOL right; + + left = ascii_word_left(state, text_pos); + right = ascii_word_right(state, text_pos); + + return left != right; +} + +/* Checks whether a position is at the start of a word. */ +static BOOL ascii_at_word_start(RE_State* state, Py_ssize_t text_pos) { + BOOL left; + BOOL right; + + left = ascii_word_left(state, text_pos); + right = ascii_word_right(state, text_pos); + + return !left && right; +} + +/* Checks whether a position is at the end of a word. */ +static BOOL ascii_at_word_end(RE_State* state, Py_ssize_t text_pos) { + BOOL left; + BOOL right; + + left = ascii_word_left(state, text_pos); + right = ascii_word_right(state, text_pos); + + return left && !right; +} + +/* Checks whether a character is a line separator. */ +static BOOL ascii_is_line_sep(Py_UCS4 ch) { + return 0x0A <= ch && ch <= 0x0D; +} + +/* Checks whether a position is at the start of a line. */ +static BOOL ascii_at_line_start(RE_State* state, Py_ssize_t text_pos) { + Py_UCS4 ch; + + if (text_pos <= 0) + return TRUE; + + ch = state->char_at(state->text, text_pos - 1); + + if (ch == 0x0D) { + if (text_pos >= state->text_length) + return TRUE; + + /* No line break inside CRLF. */ + return state->char_at(state->text, text_pos) != 0x0A; + } + + return 0x0A <= ch && ch <= 0x0D; +} + +/* Checks whether a position is at the end of a line. */ +static BOOL ascii_at_line_end(RE_State* state, Py_ssize_t text_pos) { + Py_UCS4 ch; + + if (text_pos >= state->text_length) + return TRUE; + + ch = state->char_at(state->text, text_pos); + + if (ch == 0x0A) { + if (text_pos <= 0) + return TRUE; + + /* No line break inside CRLF. */ + return state->char_at(state->text, text_pos - 1) != 0x0D; + } + + return 0x0A <= ch && ch <= 0x0D; +} + +/* Checks whether a character could be Turkic (variants of I/i). For ASCII, it + * won't be. 
+ */ +static BOOL ascii_possible_turkic(RE_LocaleInfo* locale_info, Py_UCS4 ch) { + return FALSE; +} + +/* Gets all the cases of a character. */ +static int ascii_all_cases(RE_LocaleInfo* locale_info, Py_UCS4 ch, Py_UCS4* + codepoints) { + int count; + + count = 0; + + codepoints[count++] = ch; + + if (('A' <= ch && ch <= 'Z') || ('a' <= ch && ch <= 'z')) + /* It's a letter, so add the other case. */ + codepoints[count++] = ch ^ 0x20; + + return count; +} + +/* Returns a character with its case folded. */ +static Py_UCS4 ascii_simple_case_fold(RE_LocaleInfo* locale_info, Py_UCS4 ch) { + if ('A' <= ch && ch <= 'Z') + /* Uppercase folds to lowercase. */ + return ch ^ 0x20; + + return ch; +} + +/* Returns a character with its case folded. */ +static int ascii_full_case_fold(RE_LocaleInfo* locale_info, Py_UCS4 ch, + Py_UCS4* folded) { + if ('A' <= ch && ch <= 'Z') + /* Uppercase folds to lowercase. */ + folded[0] = ch ^ 0x20; + else + folded[0] = ch; + + return 1; +} + +/* Gets all the case variants of Turkic 'I'. The given character will be listed + * first. + */ +static int ascii_all_turkic_i(RE_LocaleInfo* locale_info, Py_UCS4 ch, Py_UCS4* + cases) { + int count; + + count = 0; + + cases[count++] = ch; + + if (ch != 'I') + cases[count++] = 'I'; + + if (ch != 'i') + cases[count++] = 'i'; + + return count; +} + +/* The handlers for ASCII characters. */ +static RE_EncodingTable ascii_encoding = { + ascii_has_property_wrapper, + ascii_at_boundary, + ascii_at_word_start, + ascii_at_word_end, + ascii_at_boundary, /* No special "default word boundary" for ASCII. */ + ascii_at_word_start, /* No special "default start of word" for ASCII. */ + ascii_at_word_end, /* No special "default end of a word" for ASCII. */ + at_boundary_always, /* No special "grapheme boundary" for ASCII. 
*/ + ascii_is_line_sep, + ascii_at_line_start, + ascii_at_line_end, + ascii_possible_turkic, + ascii_all_cases, + ascii_simple_case_fold, + ascii_full_case_fold, + ascii_all_turkic_i, +}; + +/* Locale-specific. */ + +/* Checks whether a character has the 'alnum' property in the given locale. */ +Py_LOCAL_INLINE(BOOL) locale_isalnum(RE_LocaleInfo* locale_info, Py_UCS4 ch) { + return ch <= RE_LOCALE_MAX && (locale_info->properties[ch] & + RE_LOCALE_ALNUM) != 0; +} + +/* Checks whether a character has the 'alpha' property in the given locale. */ +Py_LOCAL_INLINE(BOOL) locale_isalpha(RE_LocaleInfo* locale_info, Py_UCS4 ch) { + return ch <= RE_LOCALE_MAX && (locale_info->properties[ch] & + RE_LOCALE_ALPHA) != 0; +} + +/* Checks whether a character has the 'cntrl' property in the given locale. */ +Py_LOCAL_INLINE(BOOL) locale_iscntrl(RE_LocaleInfo* locale_info, Py_UCS4 ch) { + return ch <= RE_LOCALE_MAX && (locale_info->properties[ch] & + RE_LOCALE_CNTRL) != 0; +} + +/* Checks whether a character has the 'digit' property in the given locale. */ +Py_LOCAL_INLINE(BOOL) locale_isdigit(RE_LocaleInfo* locale_info, Py_UCS4 ch) { + return ch <= RE_LOCALE_MAX && (locale_info->properties[ch] & + RE_LOCALE_DIGIT) != 0; +} + +/* Checks whether a character has the 'graph' property in the given locale. */ +Py_LOCAL_INLINE(BOOL) locale_isgraph(RE_LocaleInfo* locale_info, Py_UCS4 ch) { + return ch <= RE_LOCALE_MAX && (locale_info->properties[ch] & + RE_LOCALE_GRAPH) != 0; +} + +/* Checks whether a character has the 'lower' property in the given locale. */ +Py_LOCAL_INLINE(BOOL) locale_islower(RE_LocaleInfo* locale_info, Py_UCS4 ch) { + return ch <= RE_LOCALE_MAX && (locale_info->properties[ch] & + RE_LOCALE_LOWER) != 0; +} + +/* Checks whether a character has the 'print' property in the given locale. 
*/ +Py_LOCAL_INLINE(BOOL) locale_isprint(RE_LocaleInfo* locale_info, Py_UCS4 ch) { + return ch <= RE_LOCALE_MAX && (locale_info->properties[ch] & + RE_LOCALE_PRINT) != 0; +} + +/* Checks whether a character has the 'punct' property in the given locale. */ +Py_LOCAL_INLINE(BOOL) locale_ispunct(RE_LocaleInfo* locale_info, Py_UCS4 ch) { + return ch <= RE_LOCALE_MAX && (locale_info->properties[ch] & + RE_LOCALE_PUNCT) != 0; +} + +/* Checks whether a character has the 'space' property in the given locale. */ +Py_LOCAL_INLINE(BOOL) locale_isspace(RE_LocaleInfo* locale_info, Py_UCS4 ch) { + return ch <= RE_LOCALE_MAX && (locale_info->properties[ch] & + RE_LOCALE_SPACE) != 0; +} + +/* Checks whether a character has the 'upper' property in the given locale. */ +Py_LOCAL_INLINE(BOOL) locale_isupper(RE_LocaleInfo* locale_info, Py_UCS4 ch) { + return ch <= RE_LOCALE_MAX && (locale_info->properties[ch] & + RE_LOCALE_UPPER) != 0; +} + +/* Converts a character to lowercase in the given locale. */ +Py_LOCAL_INLINE(Py_UCS4) locale_tolower(RE_LocaleInfo* locale_info, Py_UCS4 ch) + { + return ch <= RE_LOCALE_MAX ? locale_info->lowercase[ch] : ch; +} + +/* Converts a character to uppercase in the given locale. */ +Py_LOCAL_INLINE(Py_UCS4) locale_toupper(RE_LocaleInfo* locale_info, Py_UCS4 ch) + { + return ch <= RE_LOCALE_MAX ? locale_info->uppercase[ch] : ch; +} + +/* Checks whether a character has a property. */ +Py_LOCAL_INLINE(BOOL) locale_has_property(RE_LocaleInfo* locale_info, RE_CODE + property, Py_UCS4 ch) { + RE_UINT32 value; + RE_UINT32 v; + + value = property & 0xFFFF; + + if (ch > RE_LOCALE_MAX) + /* Outside the locale range. 
*/ + return value == 0; + + switch (property >> 16) { + case RE_PROP_ALNUM >> 16: + v = locale_isalnum(locale_info, ch) != 0; + break; + case RE_PROP_ALPHA >> 16: + v = locale_isalpha(locale_info, ch) != 0; + break; + case RE_PROP_ANY >> 16: + v = 1; + break; + case RE_PROP_ASCII >> 16: + v = ch <= RE_ASCII_MAX; + break; + case RE_PROP_BLANK >> 16: + v = ch == '\t' || ch == ' '; + break; + case RE_PROP_GC: + switch (property) { + case RE_PROP_ASSIGNED: + v = ch <= RE_LOCALE_MAX; + break; + case RE_PROP_CASEDLETTER: + v = locale_isalpha(locale_info, ch) ? value : 0xFFFF; + break; + case RE_PROP_CNTRL: + v = locale_iscntrl(locale_info, ch) ? value : 0xFFFF; + break; + case RE_PROP_DIGIT: + v = locale_isdigit(locale_info, ch) ? value : 0xFFFF; + break; + case RE_PROP_GC_CN: + v = ch > RE_LOCALE_MAX; + break; + case RE_PROP_GC_LL: + v = locale_islower(locale_info, ch) ? value : 0xFFFF; + break; + case RE_PROP_GC_LU: + v = locale_isupper(locale_info, ch) ? value : 0xFFFF; + break; + case RE_PROP_GC_P: + v = locale_ispunct(locale_info, ch) ? 
value : 0xFFFF; + break; + default: + v = 0xFFFF; + break; + } + break; + case RE_PROP_GRAPH >> 16: + v = locale_isgraph(locale_info, ch) != 0; + break; + case RE_PROP_LOWER >> 16: + v = locale_islower(locale_info, ch) != 0; + break; + case RE_PROP_POSIX_ALNUM >> 16: + v = re_get_posix_alnum(ch) != 0; + break; + case RE_PROP_POSIX_DIGIT >> 16: + v = re_get_posix_digit(ch) != 0; + break; + case RE_PROP_POSIX_PUNCT >> 16: + v = re_get_posix_punct(ch) != 0; + break; + case RE_PROP_POSIX_XDIGIT >> 16: + v = re_get_posix_xdigit(ch) != 0; + break; + case RE_PROP_PRINT >> 16: + v = locale_isprint(locale_info, ch) != 0; + break; + case RE_PROP_SPACE >> 16: + v = locale_isspace(locale_info, ch) != 0; + break; + case RE_PROP_UPPER >> 16: + v = locale_isupper(locale_info, ch) != 0; + break; + case RE_PROP_WORD >> 16: + v = ch == '_' || locale_isalnum(locale_info, ch) != 0; + break; + case RE_PROP_XDIGIT >> 16: + v = re_get_hex_digit(ch) != 0; + break; + default: + v = 0; + break; + } + + return v == value; +} + +/* Wrapper for calling 'locale_has_property' via a pointer. */ +static BOOL locale_has_property_wrapper(RE_LocaleInfo* locale_info, RE_CODE + property, Py_UCS4 ch) { + return locale_has_property(locale_info, property, ch); +} + +/* Checks whether there's a word character to the left. */ +Py_LOCAL_INLINE(BOOL) locale_word_left(RE_State* state, Py_ssize_t text_pos) { + return text_pos > 0 && locale_has_property(state->locale_info, + RE_PROP_WORD, state->char_at(state->text, text_pos - 1)); +} + +/* Checks whether there's a word character to the right. */ +Py_LOCAL_INLINE(BOOL) locale_word_right(RE_State* state, Py_ssize_t text_pos) { + return text_pos < state->text_length && + locale_has_property(state->locale_info, RE_PROP_WORD, + state->char_at(state->text, text_pos)); +} + +/* Checks whether a position is on a word boundary. 
*/ +static BOOL locale_at_boundary(RE_State* state, Py_ssize_t text_pos) { + BOOL left; + BOOL right; + + left = locale_word_left(state, text_pos); + right = locale_word_right(state, text_pos); + + return left != right; +} + +/* Checks whether a position is at the start of a word. */ +static BOOL locale_at_word_start(RE_State* state, Py_ssize_t text_pos) { + BOOL left; + BOOL right; + + left = locale_word_left(state, text_pos); + right = locale_word_right(state, text_pos); + + return !left && right; +} + +/* Checks whether a position is at the end of a word. */ +static BOOL locale_at_word_end(RE_State* state, Py_ssize_t text_pos) { + BOOL left; + BOOL right; + + left = locale_word_left(state, text_pos); + right = locale_word_right(state, text_pos); + + return left && !right; +} + +/* Checks whether a character could be Turkic (variants of I/i). */ +static BOOL locale_possible_turkic(RE_LocaleInfo* locale_info, Py_UCS4 ch) { + return locale_toupper(locale_info, ch) == 'I' || + locale_tolower(locale_info, ch) == 'i'; +} + +/* Gets all the cases of a character. */ +static int locale_all_cases(RE_LocaleInfo* locale_info, Py_UCS4 ch, Py_UCS4* + codepoints) { + int count; + Py_UCS4 other; + + count = 0; + + codepoints[count++] = ch; + + other = locale_toupper(locale_info, ch); + if (other != ch) + codepoints[count++] = other; + + other = locale_tolower(locale_info, ch); + if (other != ch) + codepoints[count++] = other; + + return count; +} + +/* Returns a character with its case folded. */ +static Py_UCS4 locale_simple_case_fold(RE_LocaleInfo* locale_info, Py_UCS4 ch) + { + return locale_tolower(locale_info, ch); +} + +/* Returns a character with its case folded. */ +static int locale_full_case_fold(RE_LocaleInfo* locale_info, Py_UCS4 ch, + Py_UCS4* folded) { + folded[0] = locale_tolower(locale_info, ch); + + return 1; +} + +/* Gets all the case variants of Turkic 'I'. The given character will be listed + * first. 
+ */ +static int locale_all_turkic_i(RE_LocaleInfo* locale_info, Py_UCS4 ch, Py_UCS4* + cases) { + int count; + Py_UCS4 other; + + count = 0; + + cases[count++] = ch; + + if (ch != 'I') + cases[count++] = 'I'; + + if (ch != 'i') + cases[count++] = 'i'; + + /* Uppercase 'i' will be either dotted (Turkic) or dotless (non-Turkic). */ + other = locale_toupper(locale_info, 'i'); + if (other != ch && other != 'I') + cases[count++] = other; + + /* Lowercase 'I' will be either dotless (Turkic) or dotted (non-Turkic). */ + other = locale_tolower(locale_info, 'I'); + if (other != ch && other != 'i') + cases[count++] = other; + + return count; +} + +/* The handlers for locale characters. */ +static RE_EncodingTable locale_encoding = { + locale_has_property_wrapper, + locale_at_boundary, + locale_at_word_start, + locale_at_word_end, + locale_at_boundary, /* No special "default word boundary" for locale. */ + locale_at_word_start, /* No special "default start of a word" for locale. */ + locale_at_word_end, /* No special "default end of a word" for locale. */ + at_boundary_always, /* No special "grapheme boundary" for locale. */ + ascii_is_line_sep, /* Assume locale line separators are same as ASCII. */ + ascii_at_line_start, /* Assume locale line separators are same as ASCII. */ + ascii_at_line_end, /* Assume locale line separators are same as ASCII. */ + locale_possible_turkic, + locale_all_cases, + locale_simple_case_fold, + locale_full_case_fold, + locale_all_turkic_i, +}; + +/* Unicode-specific. */ + +/* Checks whether a Unicode character has a property. 
*/ +Py_LOCAL_INLINE(BOOL) unicode_has_property(RE_CODE property, Py_UCS4 ch) { + RE_UINT32 prop; + RE_UINT32 value; + RE_UINT32 v; + + prop = property >> 16; + if (prop >= sizeof(re_get_property) / sizeof(re_get_property[0])) + return FALSE; + + value = property & 0xFFFF; + v = re_get_property[prop](ch); + + if (v == value) + return TRUE; + + if (prop == RE_PROP_GC) { + switch (value) { + case RE_PROP_ASSIGNED: + return v != RE_PROP_CN; + case RE_PROP_C: + return (RE_PROP_C_MASK & (1 << v)) != 0; + case RE_PROP_CASEDLETTER: + return v == RE_PROP_LU || v == RE_PROP_LL || v == RE_PROP_LT; + case RE_PROP_L: + return (RE_PROP_L_MASK & (1 << v)) != 0; + case RE_PROP_M: + return (RE_PROP_M_MASK & (1 << v)) != 0; + case RE_PROP_N: + return (RE_PROP_N_MASK & (1 << v)) != 0; + case RE_PROP_P: + return (RE_PROP_P_MASK & (1 << v)) != 0; + case RE_PROP_S: + return (RE_PROP_S_MASK & (1 << v)) != 0; + case RE_PROP_Z: + return (RE_PROP_Z_MASK & (1 << v)) != 0; + } + } + + return FALSE; +} + +/* Wrapper for calling 'unicode_has_property' via a pointer. */ +static BOOL unicode_has_property_wrapper(RE_LocaleInfo* locale_info, RE_CODE + property, Py_UCS4 ch) { + return unicode_has_property(property, ch); +} + +/* Checks whether there's a word character to the left. */ +Py_LOCAL_INLINE(BOOL) unicode_word_left(RE_State* state, Py_ssize_t text_pos) { + return text_pos > 0 && unicode_has_property(RE_PROP_WORD, + state->char_at(state->text, text_pos - 1)); +} + +/* Checks whether there's a word character to the right. */ +Py_LOCAL_INLINE(BOOL) unicode_word_right(RE_State* state, Py_ssize_t text_pos) + { + return text_pos < state->text_length && unicode_has_property(RE_PROP_WORD, + state->char_at(state->text, text_pos)); +} + +/* Checks whether a position is on a word boundary. 
*/ +static BOOL unicode_at_boundary(RE_State* state, Py_ssize_t text_pos) { + BOOL left; + BOOL right; + + left = unicode_word_left(state, text_pos); + right = unicode_word_right(state, text_pos); + + return left != right; +} + +/* Checks whether a position is at the start of a word. */ +static BOOL unicode_at_word_start(RE_State* state, Py_ssize_t text_pos) { + BOOL left; + BOOL right; + + left = unicode_word_left(state, text_pos); + right = unicode_word_right(state, text_pos); + + return !left && right; +} + +/* Checks whether a position is at the end of a word. */ +static BOOL unicode_at_word_end(RE_State* state, Py_ssize_t text_pos) { + BOOL left; + BOOL right; + + left = unicode_word_left(state, text_pos); + right = unicode_word_right(state, text_pos); + + return left && !right; +} + +/* Checks whether a character is a Unicode vowel. + * + * Only a limited number are treated as vowels. + */ +Py_LOCAL_INLINE(BOOL) is_unicode_vowel(Py_UCS4 ch) { + switch (Py_UNICODE_TOLOWER((Py_UNICODE)ch)) { + case 'a': case 0xE0: case 0xE1: case 0xE2: + case 'e': case 0xE8: case 0xE9: case 0xEA: + case 'i': case 0xEC: case 0xED: case 0xEE: + case 'o': case 0xF2: case 0xF3: case 0xF4: + case 'u': case 0xF9: case 0xFA: case 0xFB: + return TRUE; + default: + return FALSE; + } +} + +/* Checks whether a position is on a default word boundary. + * + * The rules are defined here: + * http://www.unicode.org/reports/tr29/#Default_Word_Boundaries + */ +static BOOL unicode_at_default_boundary(RE_State* state, Py_ssize_t text_pos) { + Py_UCS4 (*char_at)(void* text, Py_ssize_t pos); + int prop; + int prop_m1; + Py_ssize_t pos_m1; + Py_ssize_t pos_m2; + int prop_m2; + Py_ssize_t pos_p0; + int prop_p0; + Py_ssize_t pos_p1; + int prop_p1; + + /* Break at the start and end of the text. 
*/ + /* WB1 */ + if (text_pos <= 0) + return TRUE; + + /* WB2 */ + if (text_pos >= state->text_length) + return TRUE; + + char_at = state->char_at; + + prop = (int)re_get_word_break(char_at(state->text, text_pos)); + prop_m1 = (int)re_get_word_break(char_at(state->text, text_pos - 1)); + + /* Don't break within CRLF. */ + /* WB3 */ + if (prop_m1 == RE_BREAK_CR && prop == RE_BREAK_LF) + return FALSE; + + /* Otherwise break before and after Newlines (including CR and LF). */ + /* WB3a and WB3b */ + if (prop_m1 == RE_BREAK_NEWLINE || prop_m1 == RE_BREAK_CR || prop_m1 == + RE_BREAK_LF || prop == RE_BREAK_NEWLINE || prop == RE_BREAK_CR || prop == + RE_BREAK_LF) + return TRUE; + + /* WB4 */ + /* Get the property of the previous character, ignoring Format and Extend + * characters. + */ + pos_m1 = text_pos - 1; + prop_m1 = RE_BREAK_OTHER; + while (pos_m1 >= 0) { + prop_m1 = (int)re_get_word_break(char_at(state->text, pos_m1)); + if (prop_m1 != RE_BREAK_EXTEND && prop_m1 != RE_BREAK_FORMAT) + break; + + --pos_m1; + } + + /* Get the property of the preceding character, ignoring Format and Extend + * characters. + */ + pos_m2 = pos_m1 - 1; + prop_m2 = RE_BREAK_OTHER; + while (pos_m2 >= 0) { + prop_m2 = (int)re_get_word_break(char_at(state->text, pos_m2)); + if (prop_m2 != RE_BREAK_EXTEND && prop_m2 != RE_BREAK_FORMAT) + break; + + --pos_m2; + } + + /* Get the property of the next character, ignoring Format and Extend + * characters. + */ + pos_p0 = text_pos; + prop_p0 = prop; + while (pos_p0 < state->text_length) { + prop_p0 = (int)re_get_word_break(char_at(state->text, pos_p0)); + if (prop_p0 != RE_BREAK_EXTEND && prop_p0 != RE_BREAK_FORMAT) + break; + + ++pos_p0; + } + + /* Get the property of the following character, ignoring Format and Extend + * characters. 
+ */ + pos_p1 = pos_p0 + 1; + prop_p1 = RE_BREAK_OTHER; + while (pos_p1 < state->text_length) { + prop_p1 = (int)re_get_word_break(char_at(state->text, pos_p1)); + if (prop_p1 != RE_BREAK_EXTEND && prop_p1 != RE_BREAK_FORMAT) + break; + + ++pos_p1; + } + + /* Don't break between most letters. */ + /* WB5 */ + if ((prop_m1 == RE_BREAK_ALETTER || prop_m1 == RE_BREAK_HEBREWLETTER) && + (prop_p0 == RE_BREAK_ALETTER || prop_p0 == RE_BREAK_HEBREWLETTER)) + return FALSE; + + /* Break between apostrophe and vowels (French, Italian). */ + /* WB5a */ + if (pos_m1 >= 0 && char_at(state->text, pos_m1) == '\'' && + is_unicode_vowel(char_at(state->text, text_pos))) + return TRUE; + + /* Don't break letters across certain punctuation. */ + /* WB6 */ + if ((prop_m1 == RE_BREAK_ALETTER || prop_m1 == RE_BREAK_HEBREWLETTER) && + (prop_p0 == RE_BREAK_MIDLETTER || prop_p0 == RE_BREAK_MIDNUMLET || + prop_p0 == RE_BREAK_SINGLEQUOTE) && (prop_p1 == RE_BREAK_ALETTER || + prop_p1 == RE_BREAK_HEBREWLETTER)) + return FALSE; + /* WB7 */ + if ((prop_m2 == RE_BREAK_ALETTER || prop_m2 == RE_BREAK_HEBREWLETTER) && + (prop_m1 == RE_BREAK_MIDLETTER || prop_m1 == RE_BREAK_MIDNUMLET || + prop_m1 == RE_BREAK_SINGLEQUOTE) && (prop_p0 == RE_BREAK_ALETTER || + prop_p0 == RE_BREAK_HEBREWLETTER)) + return FALSE; + /* WB7a */ + if (prop_m1 == RE_BREAK_HEBREWLETTER && prop_p0 == RE_BREAK_SINGLEQUOTE) + return FALSE; + /* WB7b */ + if (prop_m1 == RE_BREAK_HEBREWLETTER && prop_p0 == RE_BREAK_DOUBLEQUOTE && + prop_p1 == RE_BREAK_HEBREWLETTER) + return FALSE; + /* WB7c */ + if (prop_m2 == RE_BREAK_HEBREWLETTER && prop_m1 == RE_BREAK_DOUBLEQUOTE && + prop_p0 == RE_BREAK_HEBREWLETTER) + return FALSE; + + /* Don't break within sequences of digits, or digits adjacent to letters + * ("3a", or "A3"). 
+ */ + /* WB8 */ + if (prop_m1 == RE_BREAK_NUMERIC && prop_p0 == RE_BREAK_NUMERIC) + return FALSE; + /* WB9 */ + if ((prop_m1 == RE_BREAK_ALETTER || prop_m1 == RE_BREAK_HEBREWLETTER) && + prop_p0 == RE_BREAK_NUMERIC) + return FALSE; + /* WB10 */ + if (prop_m1 == RE_BREAK_NUMERIC && (prop_p0 == RE_BREAK_ALETTER || prop_p0 + == RE_BREAK_HEBREWLETTER)) + return FALSE; + + /* Don't break within sequences, such as "3.2" or "3,456.789". */ + /* WB11 */ + if (prop_m2 == RE_BREAK_NUMERIC && (prop_m1 == RE_BREAK_MIDNUM || prop_m1 + == RE_BREAK_MIDNUMLET || prop_m1 == RE_BREAK_SINGLEQUOTE) && prop_p0 == + RE_BREAK_NUMERIC) + return FALSE; + /* WB12 */ + if (prop_m1 == RE_BREAK_NUMERIC && (prop_p0 == RE_BREAK_MIDNUM || prop_p0 + == RE_BREAK_MIDNUMLET || prop_p0 == RE_BREAK_SINGLEQUOTE) && prop_p1 == + RE_BREAK_NUMERIC) + return FALSE; + + /* Don't break between Katakana. */ + /* WB13 */ + if (prop_m1 == RE_BREAK_KATAKANA && prop_p0 == RE_BREAK_KATAKANA) + return FALSE; + + /* Don't break from extenders. */ + /* WB13a */ + if ((prop_m1 == RE_BREAK_ALETTER || prop_m1 == RE_BREAK_HEBREWLETTER || + prop_m1 == RE_BREAK_NUMERIC || prop_m1 == RE_BREAK_KATAKANA || prop_m1 == + RE_BREAK_EXTENDNUMLET) && prop_p0 == RE_BREAK_EXTENDNUMLET) + return FALSE; + /* WB13b */ + if (prop_m1 == RE_BREAK_EXTENDNUMLET && (prop_p0 == RE_BREAK_ALETTER || + prop_p0 == RE_BREAK_HEBREWLETTER || prop_p0 == RE_BREAK_NUMERIC || + prop_p0 == RE_BREAK_KATAKANA)) + return FALSE; + + /* Don't break between regional indicator symbols. */ + /* WB13c */ + if (prop_m1 == RE_BREAK_REGIONALINDICATOR && prop_p0 == + RE_BREAK_REGIONALINDICATOR) + return FALSE; + + /* Otherwise, break everywhere (including around ideographs). */ + /* WB14 */ + return TRUE; +} + +/* Checks whether a position is at the start/end of a word. 
*/ +Py_LOCAL_INLINE(BOOL) unicode_at_default_word_start_or_end(RE_State* state, + Py_ssize_t text_pos, BOOL at_start) { + Py_UCS4 (*char_at)(void* text, Py_ssize_t pos); + BOOL before; + BOOL after; + Py_UCS4 char_0; + Py_UCS4 char_m1; + int prop; + int prop_m1; + Py_ssize_t pos_m1; + Py_ssize_t pos_p1; + int prop_p1; + Py_UCS4 char_p1; + Py_ssize_t pos_m2; + int prop_m2; + Py_UCS4 char_m2; + + char_at = state->char_at; + + /* At the start or end of the text. */ + if (text_pos <= 0 || text_pos >= state->text_length) { + before = unicode_word_left(state, text_pos); + after = unicode_word_right(state, text_pos); + + return before != at_start && after == at_start; + } + + char_0 = char_at(state->text, text_pos); + char_m1 = char_at(state->text, text_pos - 1); + prop = (int)re_get_word_break(char_0); + prop_m1 = (int)re_get_word_break(char_m1); + + /* No break within CRLF. */ + if (prop_m1 == RE_BREAK_CR && prop == RE_BREAK_LF) + return FALSE; + + /* Break before and after Newlines (including CR and LF). */ + if (prop_m1 == RE_BREAK_NEWLINE || prop_m1 == RE_BREAK_CR || prop_m1 == + RE_BREAK_LF || prop == RE_BREAK_NEWLINE || prop == RE_BREAK_CR || prop == + RE_BREAK_LF) { + before = unicode_has_property(RE_PROP_WORD, char_m1); + after = unicode_has_property(RE_PROP_WORD, char_0); + + return before != at_start && after == at_start; + } + + /* No break just before Format or Extend characters. */ + if (prop == RE_BREAK_EXTEND || prop == RE_BREAK_FORMAT) + return FALSE; + + /* Get the property of the previous character. */ + pos_m1 = text_pos - 1; + prop_m1 = RE_BREAK_OTHER; + while (pos_m1 >= 0) { + char_m1 = char_at(state->text, pos_m1); + prop_m1 = (int)re_get_word_break(char_m1); + if (prop_m1 != RE_BREAK_EXTEND && prop_m1 != RE_BREAK_FORMAT) + break; + + --pos_m1; + } + + /* No break between most letters. 
*/ + if (prop_m1 == RE_BREAK_ALETTER && prop == RE_BREAK_ALETTER) + return FALSE; + + if (pos_m1 >= 0 && char_m1 == '\'' && is_unicode_vowel(char_0)) + return TRUE; + + pos_p1 = text_pos + 1; + prop_p1 = RE_BREAK_OTHER; + while (pos_p1 < state->text_length) { + char_p1 = char_at(state->text, pos_p1); + prop_p1 = (int)re_get_word_break(char_p1); + if (prop_p1 != RE_BREAK_EXTEND && prop_p1 != RE_BREAK_FORMAT) + break; + + ++pos_p1; + } + + /* No break letters across certain punctuation. */ + if (prop_m1 == RE_BREAK_ALETTER && (prop == RE_BREAK_MIDLETTER || prop == + RE_BREAK_MIDNUMLET) && prop_p1 == RE_BREAK_ALETTER) + return FALSE; + + pos_m2 = pos_m1 - 1; + prop_m2 = RE_BREAK_OTHER; + while (pos_m2 >= 0) { + char_m2 = char_at(state->text, pos_m2); + prop_m2 = (int)re_get_word_break(char_m2); + if (prop_m2 != RE_BREAK_EXTEND && prop_m1 != RE_BREAK_FORMAT) + break; + + --pos_m2; + } + + if (prop_m2 == RE_BREAK_ALETTER && (prop_m1 == RE_BREAK_MIDLETTER || + prop_m1 == RE_BREAK_MIDNUMLET) && prop == RE_BREAK_ALETTER) + return FALSE; + + /* No break within sequences of digits, or digits adjacent to letters + * ("3a", or "A3"). + */ + if ((prop_m1 == RE_BREAK_NUMERIC || prop_m1 == RE_BREAK_ALETTER) && prop == + RE_BREAK_NUMERIC) + return FALSE; + + if (prop_m1 == RE_BREAK_NUMERIC && prop == RE_BREAK_ALETTER) + return FALSE; + + /* No break within sequences, such as "3.2" or "3,456.789". */ + if (prop_m2 == RE_BREAK_NUMERIC && (prop_m1 == RE_BREAK_MIDNUM || prop_m1 + == RE_BREAK_MIDNUMLET) && prop == RE_BREAK_NUMERIC) + return FALSE; + + if (prop_m1 == RE_BREAK_NUMERIC && (prop == RE_BREAK_MIDNUM || prop == + RE_BREAK_MIDNUMLET) && prop_p1 == RE_BREAK_NUMERIC) + return FALSE; + + /* No break between Katakana. */ + if (prop_m1 == RE_BREAK_KATAKANA && prop == RE_BREAK_KATAKANA) + return FALSE; + + /* No break from extenders. 
*/ + if ((prop_m1 == RE_BREAK_ALETTER || prop_m1 == RE_BREAK_NUMERIC || prop_m1 + == RE_BREAK_KATAKANA || prop_m1 == RE_BREAK_EXTENDNUMLET) && prop == + RE_BREAK_EXTENDNUMLET) + return FALSE; + + if (prop_m1 == RE_BREAK_EXTENDNUMLET && (prop == RE_BREAK_ALETTER || prop + == RE_BREAK_NUMERIC || prop == RE_BREAK_KATAKANA)) + return FALSE; + + /* Otherwise, break everywhere (including around ideographs). */ + before = unicode_has_property(RE_PROP_WORD, char_m1); + after = unicode_has_property(RE_PROP_WORD, char_0); + + return before != at_start && after == at_start; +} + +/* Checks whether a position is at the start of a word. */ +static BOOL unicode_at_default_word_start(RE_State* state, Py_ssize_t text_pos) + { + return unicode_at_default_word_start_or_end(state, text_pos, TRUE); +} + +/* Checks whether a position is at the end of a word. */ +static BOOL unicode_at_default_word_end(RE_State* state, Py_ssize_t text_pos) { + return unicode_at_default_word_start_or_end(state, text_pos, FALSE); +} + +/* Checks whether a position is on a grapheme boundary. + * + * The rules are defined here: + * http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries + */ +static BOOL unicode_at_grapheme_boundary(RE_State* state, Py_ssize_t text_pos) + { + Py_UCS4 (*char_at)(void* text, Py_ssize_t pos); + int prop; + int prop_m1; + + /* Break at the start and end of the text. */ + /* GB1 */ + if (text_pos <= 0) + return TRUE; + + /* GB2 */ + if (text_pos >= state->text_length) + return TRUE; + + char_at = state->char_at; + + prop = (int)re_get_grapheme_cluster_break(char_at(state->text, text_pos)); + prop_m1 = (int)re_get_grapheme_cluster_break(char_at(state->text, text_pos + - 1)); + + /* Don't break within CRLF. */ + /* GB3 */ + if (prop_m1 == RE_GBREAK_CR && prop == RE_GBREAK_LF) + return FALSE; + + /* Otherwise break before and after controls (including CR and LF). 
*/ + /* GB4 and GB5 */ + if (prop_m1 == RE_GBREAK_CONTROL || prop_m1 == RE_GBREAK_CR || prop_m1 == + RE_GBREAK_LF || prop == RE_GBREAK_CONTROL || prop == RE_GBREAK_CR || prop + == RE_GBREAK_LF) + return TRUE; + + /* Don't break Hangul syllable sequences. */ + /* GB6 */ + if (prop_m1 == RE_GBREAK_L && (prop == RE_GBREAK_L || prop == RE_GBREAK_V + || prop == RE_GBREAK_LV || prop == RE_GBREAK_LVT)) + return FALSE; + /* GB7 */ + if ((prop_m1 == RE_GBREAK_LV || prop_m1 == RE_GBREAK_V) && (prop == + RE_GBREAK_V || prop == RE_GBREAK_T)) + return FALSE; + /* GB8 */ + if ((prop_m1 == RE_GBREAK_LVT || prop_m1 == RE_GBREAK_T) && (prop == + RE_GBREAK_T)) + return FALSE; + + /* Don't break between regional indicator symbols. */ + /* GB8a */ + if (prop_m1 == RE_GBREAK_REGIONALINDICATOR && prop == + RE_GBREAK_REGIONALINDICATOR) + return FALSE; + + /* Don't break just before Extend characters. */ + /* GB9 */ + if (prop == RE_GBREAK_EXTEND) + return FALSE; + + /* Don't break before SpacingMarks, or after Prepend characters. */ + /* GB9a */ + if (prop == RE_GBREAK_SPACINGMARK) + return FALSE; + + /* GB9b */ + if (prop_m1 == RE_GBREAK_PREPEND) + return FALSE; + + /* Otherwise, break everywhere. */ + /* GB10 */ + return TRUE; +} + +/* Checks whether a character is a line separator. */ +static BOOL unicode_is_line_sep(Py_UCS4 ch) { + return (0x0A <= ch && ch <= 0x0D) || ch == 0x85 || ch == 0x2028 || ch == + 0x2029; +} + +/* Checks whether a position is at the start of a line. */ +static BOOL unicode_at_line_start(RE_State* state, Py_ssize_t text_pos) { + Py_UCS4 ch; + + if (text_pos <= 0) + return TRUE; + + ch = state->char_at(state->text, text_pos - 1); + + if (ch == 0x0D) { + if (text_pos >= state->text_length) + return TRUE; + + /* No line break inside CRLF. */ + return state->char_at(state->text, text_pos) != 0x0A; + } + + return (0x0A <= ch && ch <= 0x0D) || ch == 0x85 || ch == 0x2028 || ch == + 0x2029; +} + +/* Checks whether a position is at the end of a line. 
*/ +static BOOL unicode_at_line_end(RE_State* state, Py_ssize_t text_pos) { + Py_UCS4 ch; + + if (text_pos >= state->text_length) + return TRUE; + + ch = state->char_at(state->text, text_pos); + + if (ch == 0x0A) { + if (text_pos <= 0) + return TRUE; + + /* No line break inside CRLF. */ + return state->char_at(state->text, text_pos - 1) != 0x0D; + } + + return (0x0A <= ch && ch <= 0x0D) || ch == 0x85 || ch == 0x2028 || ch == + 0x2029; +} + +/* Checks whether a character could be Turkic (variants of I/i). */ +static BOOL unicode_possible_turkic(RE_LocaleInfo* locale_info, Py_UCS4 ch) { + return ch == 'I' || ch == 'i' || ch == 0x0130 || ch == 0x0131; +} + +/* Gets all the cases of a character. */ +static int unicode_all_cases(RE_LocaleInfo* locale_info, Py_UCS4 ch, Py_UCS4* + codepoints) { + return re_get_all_cases(ch, codepoints); +} + +/* Returns a character with its case folded, unless it could be Turkic + * (variants of I/i). + */ +static Py_UCS4 unicode_simple_case_fold(RE_LocaleInfo* locale_info, Py_UCS4 ch) + { + /* Is it a possible Turkic character? If so, pass it through unchanged. */ + if (ch == 'I' || ch == 'i' || ch == 0x0130 || ch == 0x0131) + return ch; + + return (Py_UCS4)re_get_simple_case_folding(ch); +} + +/* Returns a character with its case folded, unless it could be Turkic + * (variants of I/i). + */ +static int unicode_full_case_fold(RE_LocaleInfo* locale_info, Py_UCS4 ch, + Py_UCS4* folded) { + /* Is it a possible Turkic character? If so, pass it through unchanged. */ + if (ch == 'I' || ch == 'i' || ch == 0x0130 || ch == 0x0131) { + folded[0] = ch; + return 1; + } + + return re_get_full_case_folding(ch, folded); +} + +/* Gets all the case variants of Turkic 'I'. 
*/ +static int unicode_all_turkic_i(RE_LocaleInfo* locale_info, Py_UCS4 ch, + Py_UCS4* cases) { + int count; + + count = 0; + + cases[count++] = ch; + + if (ch != 'I') + cases[count++] = 'I'; + + if (ch != 'i') + cases[count++] = 'i'; + + if (ch != 0x130) + cases[count++] = 0x130; + + if (ch != 0x131) + cases[count++] = 0x131; + + return count; + +} + +/* The handlers for Unicode characters. */ +static RE_EncodingTable unicode_encoding = { + unicode_has_property_wrapper, + unicode_at_boundary, + unicode_at_word_start, + unicode_at_word_end, + unicode_at_default_boundary, + unicode_at_default_word_start, + unicode_at_default_word_end, + unicode_at_grapheme_boundary, + unicode_is_line_sep, + unicode_at_line_start, + unicode_at_line_end, + unicode_possible_turkic, + unicode_all_cases, + unicode_simple_case_fold, + unicode_full_case_fold, + unicode_all_turkic_i, +}; + +Py_LOCAL_INLINE(PyObject*) get_object(char* module_name, char* object_name); + +/* Sets the error message. */ +Py_LOCAL_INLINE(void) set_error(int status, PyObject* object) { + TRACE(("<<set_error>>\n")) + + if (!error_exception) + error_exception = get_object("_" RE_MODULE "_core", "error"); + + switch (status) { + case RE_ERROR_BACKTRACKING: + PyErr_SetString(error_exception, "too much backtracking"); + break; + case RE_ERROR_CONCURRENT: + PyErr_SetString(PyExc_ValueError, "concurrent not int or None"); + break; + case RE_ERROR_GROUP_INDEX_TYPE: + if (object) + PyErr_Format(PyExc_TypeError, + "group indices must be integers or strings, not %.200s", + object->ob_type->tp_name); + else + PyErr_Format(PyExc_TypeError, + "group indices must be integers or strings"); + break; + case RE_ERROR_ILLEGAL: + PyErr_SetString(PyExc_RuntimeError, "invalid RE code"); + break; + case RE_ERROR_INDEX: + PyErr_SetString(PyExc_TypeError, "string indices must be integers"); + break; + case RE_ERROR_INTERRUPTED: + /* An exception has already been raised, so let it fly. 
*/ + break; + case RE_ERROR_INVALID_GROUP_REF: + PyErr_SetString(error_exception, "invalid group reference"); + break; + case RE_ERROR_MEMORY: + PyErr_NoMemory(); + break; + case RE_ERROR_NOT_STRING: + PyErr_Format(PyExc_TypeError, "expected string instance, %.200s found", + object->ob_type->tp_name); + break; + case RE_ERROR_NOT_UNICODE: + PyErr_Format(PyExc_TypeError, "expected unicode instance, not %.200s", + object->ob_type->tp_name); + break; + case RE_ERROR_NO_SUCH_GROUP: + PyErr_SetString(PyExc_IndexError, "no such group"); + break; + case RE_ERROR_REPLACEMENT: + PyErr_SetString(error_exception, "invalid replacement"); + break; + default: + /* Other error codes indicate compiler/engine bugs. */ + PyErr_SetString(PyExc_RuntimeError, + "internal error in regular expression engine"); + break; + } +} + +/* Allocates memory. + * + * Sets the Python error handler and returns NULL if the allocation fails. + */ +Py_LOCAL_INLINE(void*) re_alloc(size_t size) { + void* new_ptr; + + new_ptr = PyMem_Malloc(size); + if (!new_ptr) + set_error(RE_ERROR_MEMORY, NULL); + + return new_ptr; +} + +/* Reallocates memory. + * + * Sets the Python error handler and returns NULL if the reallocation fails. + */ +Py_LOCAL_INLINE(void*) re_realloc(void* ptr, size_t size) { + void* new_ptr; + + new_ptr = PyMem_Realloc(ptr, size); + if (!new_ptr) + set_error(RE_ERROR_MEMORY, NULL); + + return new_ptr; +} + +/* Deallocates memory. */ +Py_LOCAL_INLINE(void) re_dealloc(void* ptr) { + PyMem_Free(ptr); +} + +/* Releases the GIL if multithreading is enabled. */ +Py_LOCAL_INLINE(void) release_GIL(RE_SafeState* safe_state) { + if (safe_state->re_state->is_multithreaded) + safe_state->thread_state = PyEval_SaveThread(); +} + +/* Acquires the GIL if multithreading is enabled. 
*/ +Py_LOCAL_INLINE(void) acquire_GIL(RE_SafeState* safe_state) { + if (safe_state->re_state->is_multithreaded) + PyEval_RestoreThread(safe_state->thread_state); +} + +/* Allocates memory, holding the GIL during the allocation. + * + * Sets the Python error handler and returns NULL if the allocation fails. + */ +Py_LOCAL_INLINE(void*) safe_alloc(RE_SafeState* safe_state, size_t size) { + void* new_ptr; + + acquire_GIL(safe_state); + + new_ptr = re_alloc(size); + + release_GIL(safe_state); + + return new_ptr; +} + +/* Reallocates memory, holding the GIL during the reallocation. + * + * Sets the Python error handler and returns NULL if the reallocation fails. + */ +Py_LOCAL_INLINE(void*) safe_realloc(RE_SafeState* safe_state, void* ptr, size_t + size) { + void* new_ptr; + + acquire_GIL(safe_state); + + new_ptr = re_realloc(ptr, size); + + release_GIL(safe_state); + + return new_ptr; +} + +/* Deallocates memory, holding the GIL during the deallocation. */ +Py_LOCAL_INLINE(void) safe_dealloc(RE_SafeState* safe_state, void* ptr) { + acquire_GIL(safe_state); + + re_dealloc(ptr); + + release_GIL(safe_state); +} + +/* Checks for KeyboardInterrupt, holding the GIL during the check. */ +Py_LOCAL_INLINE(BOOL) safe_check_signals(RE_SafeState* safe_state) { + BOOL result; + + acquire_GIL(safe_state); + + result = (BOOL)PyErr_CheckSignals(); + + release_GIL(safe_state); + + return result; +} + +/* Checks whether a character is in a range. */ +Py_LOCAL_INLINE(BOOL) in_range(Py_UCS4 lower, Py_UCS4 upper, Py_UCS4 ch) { + return lower <= ch && ch <= upper; +} + +/* Checks whether a character is in a range, ignoring case. 
*/ +Py_LOCAL_INLINE(BOOL) in_range_ign(RE_EncodingTable* encoding, RE_LocaleInfo* + locale_info, Py_UCS4 lower, Py_UCS4 upper, Py_UCS4 ch) { + int count; + Py_UCS4 cases[RE_MAX_CASES]; + int i; + + count = encoding->all_cases(locale_info, ch, cases); + + for (i = 0; i < count; i++) { + if (in_range(lower, upper, cases[i])) + return TRUE; + } + + return FALSE; +} + +/* Checks whether 2 characters are the same. */ +Py_LOCAL_INLINE(BOOL) same_char(Py_UCS4 ch1, Py_UCS4 ch2) { + return ch1 == ch2; +} + +/* Wrapper for calling 'same_char' via a pointer. */ +static BOOL same_char_wrapper(RE_EncodingTable* encoding, RE_LocaleInfo* + locale_info, Py_UCS4 ch1, Py_UCS4 ch2) { + return same_char(ch1, ch2); +} + +/* Checks whether 2 characters are the same, ignoring case. */ +Py_LOCAL_INLINE(BOOL) same_char_ign(RE_EncodingTable* encoding, RE_LocaleInfo* + locale_info, Py_UCS4 ch1, Py_UCS4 ch2) { + int count; + Py_UCS4 cases[RE_MAX_CASES]; + int i; + + if (ch1 == ch2) + return TRUE; + + count = encoding->all_cases(locale_info, ch1, cases); + + for (i = 1; i < count; i++) { + if (cases[i] == ch2) + return TRUE; + } + + return FALSE; +} + +/* Wrapper for calling 'same_char' via a pointer. */ +static BOOL same_char_ign_wrapper(RE_EncodingTable* encoding, RE_LocaleInfo* + locale_info, Py_UCS4 ch1, Py_UCS4 ch2) { + return same_char_ign(encoding, locale_info, ch1, ch2); +} + +/* Checks whether a character is anything except a newline. */ +Py_LOCAL_INLINE(BOOL) matches_ANY(RE_EncodingTable* encoding, RE_Node* node, + Py_UCS4 ch) { + return ch != '\n'; +} + +/* Checks whether a character is anything except a line separator. */ +Py_LOCAL_INLINE(BOOL) matches_ANY_U(RE_EncodingTable* encoding, RE_Node* node, + Py_UCS4 ch) { + return !encoding->is_line_sep(ch); +} + +/* Checks whether 2 characters are the same. 
*/ +Py_LOCAL_INLINE(BOOL) matches_CHARACTER(RE_EncodingTable* encoding, + RE_LocaleInfo* locale_info, RE_Node* node, Py_UCS4 ch) { + return same_char(node->values[0], ch); +} + +/* Checks whether 2 characters are the same, ignoring case. */ +Py_LOCAL_INLINE(BOOL) matches_CHARACTER_IGN(RE_EncodingTable* encoding, + RE_LocaleInfo* locale_info, RE_Node* node, Py_UCS4 ch) { + return same_char_ign(encoding, locale_info, node->values[0], ch); +} + +/* Checks whether a character has a property. */ +Py_LOCAL_INLINE(BOOL) matches_PROPERTY(RE_EncodingTable* encoding, + RE_LocaleInfo* locale_info, RE_Node* node, Py_UCS4 ch) { + return encoding->has_property(locale_info, node->values[0], ch); +} + +/* Checks whether a character has a property, ignoring case. */ +Py_LOCAL_INLINE(BOOL) matches_PROPERTY_IGN(RE_EncodingTable* encoding, + RE_LocaleInfo* locale_info, RE_Node* node, Py_UCS4 ch) { + RE_UINT32 property; + RE_UINT32 prop; + + property = node->values[0]; + prop = property >> 16; + + /* We need to do special handling of case-sensitive properties according to + * the 'encoding'. + */ + if (encoding == &unicode_encoding) { + /* We are working with Unicode. */ + if (property == RE_PROP_GC_LU || property == RE_PROP_GC_LL || property + == RE_PROP_GC_LT) { + RE_UINT32 value; + + value = re_get_general_category(ch); + + return value == RE_PROP_LU || value == RE_PROP_LL || value == + RE_PROP_LT; + } else if (prop == RE_PROP_UPPERCASE || prop == RE_PROP_LOWERCASE) + return (BOOL)re_get_cased(ch); + + /* The property is case-insensitive. */ + return unicode_has_property(property, ch); + } else if (encoding == &ascii_encoding) { + /* We are working with ASCII. 
*/ + if (property == RE_PROP_GC_LU || property == RE_PROP_GC_LL || property + == RE_PROP_GC_LT) { + RE_UINT32 value; + + value = re_get_general_category(ch); + + return value == RE_PROP_LU || value == RE_PROP_LL || value == + RE_PROP_LT; + } else if (prop == RE_PROP_UPPERCASE || prop == RE_PROP_LOWERCASE) + return (BOOL)re_get_cased(ch); + + /* The property is case-insensitive. */ + return ascii_has_property(property, ch); + } else { + /* We are working with Locale. */ + if (property == RE_PROP_GC_LU || property == RE_PROP_GC_LL || property + == RE_PROP_GC_LT) + return locale_isupper(locale_info, ch) || + locale_islower(locale_info, ch); + else if (prop == RE_PROP_UPPERCASE || prop == RE_PROP_LOWERCASE) + return locale_isupper(locale_info, ch) || + locale_islower(locale_info, ch); + + /* The property is case-insensitive. */ + return locale_has_property(locale_info, property, ch); + } +} + +/* Checks whether a character is in a range. */ +Py_LOCAL_INLINE(BOOL) matches_RANGE(RE_EncodingTable* encoding, RE_LocaleInfo* + locale_info, RE_Node* node, Py_UCS4 ch) { + return in_range(node->values[0], node->values[1], ch); +} + +/* Checks whether a character is in a range, ignoring case. */ +Py_LOCAL_INLINE(BOOL) matches_RANGE_IGN(RE_EncodingTable* encoding, + RE_LocaleInfo* locale_info, RE_Node* node, Py_UCS4 ch) { + return in_range_ign(encoding, locale_info, node->values[0], + node->values[1], ch); +} + +Py_LOCAL_INLINE(BOOL) in_set_diff(RE_EncodingTable* encoding, RE_LocaleInfo* + locale_info, RE_Node* node, Py_UCS4 ch); +Py_LOCAL_INLINE(BOOL) in_set_inter(RE_EncodingTable* encoding, RE_LocaleInfo* + locale_info, RE_Node* node, Py_UCS4 ch); +Py_LOCAL_INLINE(BOOL) in_set_sym_diff(RE_EncodingTable* encoding, + RE_LocaleInfo* locale_info, RE_Node* node, Py_UCS4 ch); +Py_LOCAL_INLINE(BOOL) in_set_union(RE_EncodingTable* encoding, RE_LocaleInfo* + locale_info, RE_Node* node, Py_UCS4 ch); + +/* Checks whether a character matches a set member. 
*/
Py_LOCAL_INLINE(BOOL) matches_member(RE_EncodingTable* encoding, RE_LocaleInfo*
  locale_info, RE_Node* member, Py_UCS4 ch) {
    /* The raw test result; the callers compare it with member->match. */
    BOOL matched = FALSE;

    switch (member->op) {
    case RE_OP_CHARACTER:
        /* values are: char_code */
        TRACE(("%s %d %d\n", re_op_text[member->op], member->match,
          member->values[0]))
        matched = ch == member->values[0];
        break;
    case RE_OP_PROPERTY:
        /* values are: property */
        TRACE(("%s %d %d\n", re_op_text[member->op], member->match,
          member->values[0]))
        matched = encoding->has_property(locale_info, member->values[0], ch);
        break;
    case RE_OP_RANGE:
        /* values are: lower, upper */
        TRACE(("%s %d %d %d\n", re_op_text[member->op], member->match,
          member->values[0], member->values[1]))
        matched = in_range(member->values[0], member->values[1], ch);
        break;
    case RE_OP_SET_DIFF:
        /* A nested set: defer to the matching set test. */
        TRACE(("%s\n", re_op_text[member->op]))
        matched = in_set_diff(encoding, locale_info, member, ch);
        break;
    case RE_OP_SET_INTER:
        TRACE(("%s\n", re_op_text[member->op]))
        matched = in_set_inter(encoding, locale_info, member, ch);
        break;
    case RE_OP_SET_SYM_DIFF:
        TRACE(("%s\n", re_op_text[member->op]))
        matched = in_set_sym_diff(encoding, locale_info, member, ch);
        break;
    case RE_OP_SET_UNION:
        TRACE(("%s\n", re_op_text[member->op]))
        matched = in_set_union(encoding, locale_info, member, ch);
        break;
    case RE_OP_STRING:
    {
        /* values are: char_code, char_code, ... */
        size_t index = 0;
        TRACE(("%s %d %d\n", re_op_text[member->op], member->match,
          member->value_count))

        /* Stop at the first value equal to the character, if any. */
        while (index < member->value_count && ch != member->values[index])
            ++index;

        matched = index < member->value_count;
        break;
    }
    default:
        /* Any other kind of member never matches. */
        break;
    }

    return matched;
}

/* Checks whether a character matches a set member, ignoring case.
*/
/* 'cases' holds 'case_count' case variants of the character being tested; the
 * member matches as soon as any one of them does.
 *
 * NOTE(review): an unrecognised member op returns TRUE here (provided there is
 * at least one case variant), whereas matches_member() returns FALSE for an
 * unrecognised op. The behaviour is kept as it was - confirm whether the
 * difference is intended.
 */
Py_LOCAL_INLINE(BOOL) matches_member_ign(RE_EncodingTable* encoding,
  RE_LocaleInfo* locale_info, RE_Node* member, int case_count, Py_UCS4* cases)
  {
    int index;

    for (index = 0; index < case_count; index++) {
        Py_UCS4 candidate = cases[index];
        BOOL hit = FALSE;

        switch (member->op) {
        case RE_OP_CHARACTER:
            /* values are: char_code */
            TRACE(("%s %d %d\n", re_op_text[member->op], member->match,
              member->values[0]))
            hit = candidate == member->values[0];
            break;
        case RE_OP_PROPERTY:
            /* values are: property */
            TRACE(("%s %d %d\n", re_op_text[member->op], member->match,
              member->values[0]))
            hit = encoding->has_property(locale_info, member->values[0],
              candidate);
            break;
        case RE_OP_RANGE:
            /* values are: lower, upper */
            TRACE(("%s %d %d %d\n", re_op_text[member->op], member->match,
              member->values[0], member->values[1]))
            hit = in_range(member->values[0], member->values[1], candidate);
            break;
        case RE_OP_SET_DIFF:
            /* A nested set: defer to the matching set test. */
            TRACE(("%s\n", re_op_text[member->op]))
            hit = in_set_diff(encoding, locale_info, member, candidate);
            break;
        case RE_OP_SET_INTER:
            TRACE(("%s\n", re_op_text[member->op]))
            hit = in_set_inter(encoding, locale_info, member, candidate);
            break;
        case RE_OP_SET_SYM_DIFF:
            TRACE(("%s\n", re_op_text[member->op]))
            hit = in_set_sym_diff(encoding, locale_info, member, candidate);
            break;
        case RE_OP_SET_UNION:
            TRACE(("%s\n", re_op_text[member->op]))
            hit = in_set_union(encoding, locale_info, member, candidate);
            break;
        case RE_OP_STRING:
        {
            /* values are: char_code, char_code, ... */
            size_t k;
            TRACE(("%s %d %d\n", re_op_text[member->op], member->match,
              member->value_count))

            for (k = 0; k < member->value_count && !hit; k++)
                hit = candidate == member->values[k];
            break;
        }
        default:
            return TRUE;
        }

        if (hit)
            return TRUE;
    }

    /* None of the case variants matched the member. */
    return FALSE;
}

/* Checks whether a character is in a set difference.
*/ +Py_LOCAL_INLINE(BOOL) in_set_diff(RE_EncodingTable* encoding, RE_LocaleInfo* + locale_info, RE_Node* node, Py_UCS4 ch) { + RE_Node* member; + + member = node->nonstring.next_2.node; + + if (matches_member(encoding, locale_info, member, ch) != member->match) + return FALSE; + + member = member->next_1.node; + + while (member) { + if (matches_member(encoding, locale_info, member, ch) == member->match) + return FALSE; + + member = member->next_1.node; + } + + return TRUE; +} + +/* Checks whether a character is in a set difference, ignoring case. */ +Py_LOCAL_INLINE(BOOL) in_set_diff_ign(RE_EncodingTable* encoding, + RE_LocaleInfo* locale_info, RE_Node* node, int case_count, Py_UCS4* cases) { + RE_Node* member; + + member = node->nonstring.next_2.node; + + if (matches_member_ign(encoding, locale_info, member, case_count, cases) != + member->match) + return FALSE; + + member = member->next_1.node; + + while (member) { + if (matches_member_ign(encoding, locale_info, member, case_count, + cases) == member->match) + return FALSE; + + member = member->next_1.node; + } + + return TRUE; +} + +/* Checks whether a character is in a set intersection. */ +Py_LOCAL_INLINE(BOOL) in_set_inter(RE_EncodingTable* encoding, RE_LocaleInfo* + locale_info, RE_Node* node, Py_UCS4 ch) { + RE_Node* member; + + member = node->nonstring.next_2.node; + + while (member) { + if (matches_member(encoding, locale_info, member, ch) != member->match) + return FALSE; + + member = member->next_1.node; + } + + return TRUE; +} + +/* Checks whether a character is in a set intersection, ignoring case. 
*/ +Py_LOCAL_INLINE(BOOL) in_set_inter_ign(RE_EncodingTable* encoding, + RE_LocaleInfo* locale_info, RE_Node* node, int case_count, Py_UCS4* cases) { + RE_Node* member; + + member = node->nonstring.next_2.node; + + while (member) { + if (matches_member_ign(encoding, locale_info, member, case_count, + cases) != member->match) + return FALSE; + + member = member->next_1.node; + } + + return TRUE; +} + +/* Checks whether a character is in a set symmetric difference. */ +Py_LOCAL_INLINE(BOOL) in_set_sym_diff(RE_EncodingTable* encoding, + RE_LocaleInfo* locale_info, RE_Node* node, Py_UCS4 ch) { + RE_Node* member; + BOOL result; + + member = node->nonstring.next_2.node; + + result = FALSE; + + while (member) { + if (matches_member(encoding, locale_info, member, ch) == member->match) + result = !result; + + member = member->next_1.node; + } + + return result; +} + +/* Checks whether a character is in a set symmetric difference, ignoring case. + */ +Py_LOCAL_INLINE(BOOL) in_set_sym_diff_ign(RE_EncodingTable* encoding, + RE_LocaleInfo* locale_info, RE_Node* node, int case_count, Py_UCS4* cases) { + RE_Node* member; + BOOL result; + + member = node->nonstring.next_2.node; + + result = FALSE; + + while (member) { + if (matches_member_ign(encoding, locale_info, member, case_count, + cases) == member->match) + result = !result; + + member = member->next_1.node; + } + + return result; +} + +/* Checks whether a character is in a set union. */ +Py_LOCAL_INLINE(BOOL) in_set_union(RE_EncodingTable* encoding, RE_LocaleInfo* + locale_info, RE_Node* node, Py_UCS4 ch) { + RE_Node* member; + + member = node->nonstring.next_2.node; + + while (member) { + if (matches_member(encoding, locale_info, member, ch) == member->match) + return TRUE; + + member = member->next_1.node; + } + + return FALSE; +} + +/* Checks whether a character is in a set union, ignoring case. 
*/ +Py_LOCAL_INLINE(BOOL) in_set_union_ign(RE_EncodingTable* encoding, + RE_LocaleInfo* locale_info, RE_Node* node, int case_count, Py_UCS4* cases) { + RE_Node* member; + + member = node->nonstring.next_2.node; + + while (member) { + if (matches_member_ign(encoding, locale_info, member, case_count, + cases) == member->match) + return TRUE; + + member = member->next_1.node; + } + + return FALSE; +} + +/* Checks whether a character is in a set. */ +Py_LOCAL_INLINE(BOOL) matches_SET(RE_EncodingTable* encoding, +RE_LocaleInfo* locale_info, RE_Node* node, Py_UCS4 ch) { + switch (node->op) { + case RE_OP_SET_DIFF: + case RE_OP_SET_DIFF_REV: + return in_set_diff(encoding, locale_info, node, ch); + case RE_OP_SET_INTER: + case RE_OP_SET_INTER_REV: + return in_set_inter(encoding, locale_info, node, ch); + case RE_OP_SET_SYM_DIFF: + case RE_OP_SET_SYM_DIFF_REV: + return in_set_sym_diff(encoding, locale_info, node, ch); + case RE_OP_SET_UNION: + case RE_OP_SET_UNION_REV: + return in_set_union(encoding, locale_info, node, ch); + } + + return FALSE; +} + +/* Checks whether a character is in a set, ignoring case. */ +Py_LOCAL_INLINE(BOOL) matches_SET_IGN(RE_EncodingTable* encoding, +RE_LocaleInfo* locale_info, RE_Node* node, Py_UCS4 ch) { + Py_UCS4 cases[RE_MAX_CASES]; + int case_count; + + case_count = encoding->all_cases(locale_info, ch, cases); + + switch (node->op) { + case RE_OP_SET_DIFF_IGN: + case RE_OP_SET_DIFF_IGN_REV: + return in_set_diff_ign(encoding, locale_info, node, case_count, cases); + case RE_OP_SET_INTER_IGN: + case RE_OP_SET_INTER_IGN_REV: + return in_set_inter_ign(encoding, locale_info, node, case_count, + cases); + case RE_OP_SET_SYM_DIFF_IGN: + case RE_OP_SET_SYM_DIFF_IGN_REV: + return in_set_sym_diff_ign(encoding, locale_info, node, case_count, + cases); + case RE_OP_SET_UNION_IGN: + case RE_OP_SET_UNION_IGN_REV: + return in_set_union_ign(encoding, locale_info, node, case_count, + cases); + } + + return FALSE; +} + +/* Resets a guard list. 
*/ +Py_LOCAL_INLINE(void) reset_guard_list(RE_GuardList* guard_list) { + guard_list->count = 0; + guard_list->last_text_pos = -1; +} + +/* Clears the groups. */ +Py_LOCAL_INLINE(void) clear_groups(RE_State* state) { + size_t i; + + for (i = 0; i < state->pattern->true_group_count; i++) { + RE_GroupData* group; + + group = &state->groups[i]; + group->span.start = -1; + group->span.end = -1; + group->capture_count = 0; + group->current_capture = -1; + } +} + +/* Initialises the state for a match. */ +Py_LOCAL_INLINE(void) init_match(RE_State* state) { + RE_AtomicBlock* current; + size_t i; + + /* Reset the backtrack. */ + state->current_backtrack_block = &state->backtrack_block; + state->current_backtrack_block->count = 0; + state->current_saved_groups = state->first_saved_groups; + state->backtrack = NULL; + state->search_anchor = state->text_pos; + state->match_pos = state->text_pos; + + /* Reset the atomic stack. */ + current = state->current_atomic_block; + if (current) { + while (current->previous) + current = current->previous; + + state->current_atomic_block = current; + state->current_atomic_block->count = 0; + } + + /* Reset the guards for the repeats. */ + for (i = 0; i < state->pattern->repeat_count; i++) { + reset_guard_list(&state->repeats[i].body_guard_list); + reset_guard_list(&state->repeats[i].tail_guard_list); + } + + /* Reset the guards for the fuzzy sections. */ + for (i = 0; i < state->pattern->fuzzy_count; i++) { + reset_guard_list(&state->fuzzy_guards[i].body_guard_list); + reset_guard_list(&state->fuzzy_guards[i].tail_guard_list); + } + + /* Clear the groups. */ + clear_groups(state); + + /* Reset the guards for the group calls. */ + for (i = 0; i < state->pattern->call_ref_info_count; i++) + reset_guard_list(&state->group_call_guard_list[i]); + + /* Clear the counts and cost for matching. 
*/ + if (state->pattern->is_fuzzy) { + memset(state->fuzzy_info.counts, 0, sizeof(state->fuzzy_info.counts)); + memset(state->total_fuzzy_counts, 0, + sizeof(state->total_fuzzy_counts)); + } + + state->fuzzy_info.total_cost = 0; + state->total_errors = 0; + state->too_few_errors = FALSE; + state->found_match = FALSE; + state->capture_change = 0; + state->iterations = 0; +} + +/* Adds a new backtrack entry. */ +Py_LOCAL_INLINE(BOOL) add_backtrack(RE_SafeState* safe_state, RE_UINT8 op) { + RE_State* state; + RE_BacktrackBlock* current; + + state = safe_state->re_state; + + current = state->current_backtrack_block; + if (current->count >= current->capacity) { + if (!current->next) { + RE_BacktrackBlock* next; + + /* Is there too much backtracking? */ + if (state->backtrack_allocated >= RE_MAX_BACKTRACK_ALLOC) + return FALSE; + + next = (RE_BacktrackBlock*)safe_alloc(safe_state, + sizeof(RE_BacktrackBlock)); + if (!next) + return FALSE; + + next->previous = current; + next->next = NULL; + next->capacity = RE_BACKTRACK_BLOCK_SIZE; + current->next = next; + + state->backtrack_allocated += RE_BACKTRACK_BLOCK_SIZE; + } + + current = current->next; + current->count = 0; + state->current_backtrack_block = current; + } + + state->backtrack = ¤t->items[current->count++]; + state->backtrack->op = op; + + return TRUE; +} + +/* Gets the last backtrack entry. + * + * It'll never be called when there are _no_ entries. + */ +Py_LOCAL_INLINE(RE_BacktrackData*) last_backtrack(RE_State* state) { + RE_BacktrackBlock* current; + + current = state->current_backtrack_block; + state->backtrack = ¤t->items[current->count - 1]; + + return state->backtrack; +} + +/* Discards the last backtrack entry. + * + * It'll never be called to discard the _only_ entry. 
+ */ +Py_LOCAL_INLINE(void) discard_backtrack(RE_State* state) { + RE_BacktrackBlock* current; + + current = state->current_backtrack_block; + --current->count; + if (current->count == 0 && current->previous) + state->current_backtrack_block = current->previous; +} + +/* Pushes a new empty entry onto the atomic stack. */ +Py_LOCAL_INLINE(RE_AtomicData*) push_atomic(RE_SafeState* safe_state) { + RE_State* state; + RE_AtomicBlock* current; + + state = safe_state->re_state; + + current = state->current_atomic_block; + if (!current || current->count >= current->capacity) { + /* The current block is full. */ + if (current && current->next) + /* Advance to the next block. */ + current = current->next; + else { + /* Add a new block. */ + RE_AtomicBlock* next; + + next = (RE_AtomicBlock*)safe_alloc(safe_state, + sizeof(RE_AtomicBlock)); + if (!next) + return NULL; + + next->previous = current; + next->next = NULL; + next->capacity = RE_ATOMIC_BLOCK_SIZE; + if (current) + /* The current block is the last one. */ + current->next = next; + else + /* The new block is the first one. */ + state->current_atomic_block = next; + current = next; + } + + current->count = 0; + } + + return ¤t->items[current->count++]; +} + +/* Pops the top entry from the atomic stack. */ +Py_LOCAL_INLINE(RE_AtomicData*) pop_atomic(RE_SafeState* safe_state) { + RE_State* state; + RE_AtomicBlock* current; + RE_AtomicData* atomic; + + state = safe_state->re_state; + + current = state->current_atomic_block; + atomic = ¤t->items[--current->count]; + if (current->count == 0 && current->previous) + state->current_atomic_block = current->previous; + + return atomic; +} + +/* Gets the top entry from the atomic stack. */ +Py_LOCAL_INLINE(RE_AtomicData*) top_atomic(RE_SafeState* safe_state) { + RE_State* state; + RE_AtomicBlock* current; + + state = safe_state->re_state; + + current = state->current_atomic_block; + return ¤t->items[current->count - 1]; +} + +/* Copies a repeat guard list. 
*/ +Py_LOCAL_INLINE(BOOL) copy_guard_data(RE_SafeState* safe_state, RE_GuardList* + dst, RE_GuardList* src) { + if (dst->capacity < src->count) { + RE_GuardSpan* new_spans; + + if (!safe_state) + return FALSE; + + dst->capacity = src->count; + new_spans = (RE_GuardSpan*)safe_realloc(safe_state, dst->spans, + dst->capacity * sizeof(RE_GuardSpan)); + if (!new_spans) + return FALSE; + + dst->spans = new_spans; + } + + dst->count = src->count; + memmove(dst->spans, src->spans, dst->count * sizeof(RE_GuardSpan)); + + dst->last_text_pos = -1; + + return TRUE; +} + +/* Copies a repeat. */ +Py_LOCAL_INLINE(BOOL) copy_repeat_data(RE_SafeState* safe_state, RE_RepeatData* + dst, RE_RepeatData* src) { + if (!copy_guard_data(safe_state, &dst->body_guard_list, + &src->body_guard_list) || !copy_guard_data(safe_state, + &dst->tail_guard_list, &src->tail_guard_list)) { + safe_dealloc(safe_state, dst->body_guard_list.spans); + safe_dealloc(safe_state, dst->tail_guard_list.spans); + + return FALSE; + } + + dst->count = src->count; + dst->start = src->start; + dst->capture_change = src->capture_change; + + return TRUE; +} + +/* Pushes a return node onto the group call stack. */ +Py_LOCAL_INLINE(BOOL) push_group_return(RE_SafeState* safe_state, RE_Node* + return_node) { + RE_State* state; + PatternObject* pattern; + RE_GroupCallFrame* frame; + + state = safe_state->re_state; + pattern = state->pattern; + + if (state->current_group_call_frame && + state->current_group_call_frame->next) + /* Advance to the next allocated frame. */ + frame = state->current_group_call_frame->next; + else if (!state->current_group_call_frame && state->first_group_call_frame) + /* Advance to the first allocated frame. */ + frame = state->first_group_call_frame; + else { + /* Create a new frame. 
*/ + frame = (RE_GroupCallFrame*)safe_alloc(safe_state, + sizeof(RE_GroupCallFrame)); + if (!frame) + return FALSE; + + frame->groups = (RE_GroupData*)safe_alloc(safe_state, + pattern->true_group_count * sizeof(RE_GroupData)); + frame->repeats = (RE_RepeatData*)safe_alloc(safe_state, + pattern->repeat_count * sizeof(RE_RepeatData)); + if (!frame->groups || !frame->repeats) { + safe_dealloc(safe_state, frame->groups); + safe_dealloc(safe_state, frame->repeats); + safe_dealloc(safe_state, frame); + + return FALSE; + } + + memset(frame->groups, 0, pattern->true_group_count * + sizeof(RE_GroupData)); + memset(frame->repeats, 0, pattern->repeat_count * + sizeof(RE_RepeatData)); + + frame->previous = state->current_group_call_frame; + frame->next = NULL; + + if (frame->previous) + frame->previous->next = frame; + else + state->first_group_call_frame = frame; + } + + frame->node = return_node; + + /* Push the groups and guards. */ + if (return_node) { + size_t g; + size_t r; + + for (g = 0; g < pattern->true_group_count; g++) { + frame->groups[g].span = state->groups[g].span; + frame->groups[g].current_capture = + state->groups[g].current_capture; + } + + for (r = 0; r < pattern->repeat_count; r++) { + if (!copy_repeat_data(safe_state, &frame->repeats[r], + &state->repeats[r])) + return FALSE; + } + } + + state->current_group_call_frame = frame; + + return TRUE; +} + +/* Pops a return node from the group call stack. */ +Py_LOCAL_INLINE(RE_Node*) pop_group_return(RE_State* state) { + RE_GroupCallFrame* frame; + + frame = state->current_group_call_frame; + + /* Pop the groups and repeats. 
*/ + if (frame->node) { + PatternObject* pattern; + size_t g; + size_t r; + + pattern = state->pattern; + + for (g = 0; g < pattern->true_group_count; g++) { + state->groups[g].span = frame->groups[g].span; + state->groups[g].current_capture = + frame->groups[g].current_capture; + } + + for (r = 0; r < pattern->repeat_count; r++) + copy_repeat_data(NULL, &state->repeats[r], &frame->repeats[r]); + } + + /* Withdraw to previous frame. */ + state->current_group_call_frame = frame->previous; + + return frame->node; +} + +/* Returns the return node from the top of the group call stack. */ +Py_LOCAL_INLINE(RE_Node*) top_group_return(RE_State* state) { + RE_GroupCallFrame* frame; + + frame = state->current_group_call_frame; + + return frame->node; +} + +/* Checks whether a node matches only 1 character. */ +Py_LOCAL_INLINE(BOOL) node_matches_one_character(RE_Node* node) { + switch (node->op) { + case RE_OP_ANY: + case RE_OP_ANY_ALL: + case RE_OP_ANY_ALL_REV: + case RE_OP_ANY_REV: + case RE_OP_ANY_U: + case RE_OP_ANY_U_REV: + case RE_OP_CHARACTER: + case RE_OP_CHARACTER_IGN: + case RE_OP_CHARACTER_IGN_REV: + case RE_OP_CHARACTER_REV: + case RE_OP_PROPERTY: + case RE_OP_PROPERTY_IGN: + case RE_OP_PROPERTY_IGN_REV: + case RE_OP_PROPERTY_REV: + case RE_OP_RANGE: + case RE_OP_RANGE_IGN: + case RE_OP_RANGE_IGN_REV: + case RE_OP_RANGE_REV: + case RE_OP_SET_DIFF: + case RE_OP_SET_DIFF_IGN: + case RE_OP_SET_DIFF_IGN_REV: + case RE_OP_SET_DIFF_REV: + case RE_OP_SET_INTER: + case RE_OP_SET_INTER_IGN: + case RE_OP_SET_INTER_IGN_REV: + case RE_OP_SET_INTER_REV: + case RE_OP_SET_SYM_DIFF: + case RE_OP_SET_SYM_DIFF_IGN: + case RE_OP_SET_SYM_DIFF_IGN_REV: + case RE_OP_SET_SYM_DIFF_REV: + case RE_OP_SET_UNION: + case RE_OP_SET_UNION_IGN: + case RE_OP_SET_UNION_IGN_REV: + case RE_OP_SET_UNION_REV: + return TRUE; + default: + return FALSE; + } +} + +/* Checks whether the node is a firstset. 
*/ +Py_LOCAL_INLINE(BOOL) is_firstset(RE_Node* node) { + if (node->step != 0) + return FALSE; + + return node_matches_one_character(node); +} + +/* Locates the start node for testing ahead. */ +Py_LOCAL_INLINE(RE_Node*) locate_test_start(RE_Node* node) { + for (;;) { + switch (node->op) { + case RE_OP_BOUNDARY: + switch (node->next_1.node->op) { + case RE_OP_STRING: + case RE_OP_STRING_FLD: + case RE_OP_STRING_FLD_REV: + case RE_OP_STRING_IGN: + case RE_OP_STRING_IGN_REV: + case RE_OP_STRING_REV: + return node->next_1.node; + default: + return node; + } + case RE_OP_CALL_REF: + case RE_OP_END_GROUP: + case RE_OP_START_GROUP: + node = node->next_1.node; + break; + case RE_OP_GREEDY_REPEAT: + case RE_OP_LAZY_REPEAT: + if (node->values[1] == 0) + return node; + node = node->next_1.node; + break; + case RE_OP_GREEDY_REPEAT_ONE: + case RE_OP_LAZY_REPEAT_ONE: + if (node->values[1] == 0) + return node; + return node->nonstring.next_2.node; + case RE_OP_LOOKAROUND: + node = node->nonstring.next_2.node; + break; + default: + if (is_firstset(node)) { + switch (node->next_1.node->op) { + case RE_OP_END_OF_STRING: + case RE_OP_START_OF_STRING: + return node->next_1.node; + } + } + + return node; + } + } +} + +/* Checks whether a character matches any of a set of case characters. */ +Py_LOCAL_INLINE(BOOL) any_case(Py_UCS4 ch, int case_count, Py_UCS4* cases) { + int i; + + for (i = 0; i < case_count; i++) { + if (ch == cases[i]) + return TRUE; + } + + return FALSE; +} + +/* Matches many ANYs, up to a limit. 
*/
/* Scans forwards from 'text_pos' for as long as the result of matches_ANY()
 * for the character equals 'match', without going past 'limit'.
 *
 * Returns the position at which the scan stopped.
 */
Py_LOCAL_INLINE(Py_ssize_t) match_many_ANY(RE_State* state, RE_Node* node,
  Py_ssize_t text_pos, Py_ssize_t limit, BOOL match) {
    RE_EncodingTable* encoding = state->encoding;

    /* The text is addressed according to its character width. */
    switch (state->charsize) {
    case 1:
    {
        Py_UCS1* const base = (Py_UCS1*)state->text;
        Py_UCS1* const stop = base + limit;
        Py_UCS1* cursor = base + text_pos;

        while (cursor < stop && matches_ANY(encoding, node, *cursor) == match)
            ++cursor;

        return cursor - base;
    }
    case 2:
    {
        Py_UCS2* const base = (Py_UCS2*)state->text;
        Py_UCS2* const stop = base + limit;
        Py_UCS2* cursor = base + text_pos;

        while (cursor < stop && matches_ANY(encoding, node, *cursor) == match)
            ++cursor;

        return cursor - base;
    }
    case 4:
    {
        Py_UCS4* const base = (Py_UCS4*)state->text;
        Py_UCS4* const stop = base + limit;
        Py_UCS4* cursor = base + text_pos;

        while (cursor < stop && matches_ANY(encoding, node, *cursor) == match)
            ++cursor;

        return cursor - base;
    }
    }

    /* Any other character width leaves the position where it was. */
    return text_pos;
}

/* Matches many ANYs, up to a limit, backwards.
*/
/* Scans backwards from 'text_pos' for as long as the result of matches_ANY()
 * for the preceding character equals 'match', without going below 'limit'.
 *
 * Returns the position at which the scan stopped.
 */
Py_LOCAL_INLINE(Py_ssize_t) match_many_ANY_REV(RE_State* state, RE_Node* node,
  Py_ssize_t text_pos, Py_ssize_t limit, BOOL match) {
    RE_EncodingTable* encoding = state->encoding;

    /* The text is addressed according to its character width. */
    switch (state->charsize) {
    case 1:
    {
        Py_UCS1* const base = (Py_UCS1*)state->text;
        Py_UCS1* const stop = base + limit;
        Py_UCS1* cursor = base + text_pos;

        while (cursor > stop && matches_ANY(encoding, node, cursor[-1]) ==
          match)
            --cursor;

        return cursor - base;
    }
    case 2:
    {
        Py_UCS2* const base = (Py_UCS2*)state->text;
        Py_UCS2* const stop = base + limit;
        Py_UCS2* cursor = base + text_pos;

        while (cursor > stop && matches_ANY(encoding, node, cursor[-1]) ==
          match)
            --cursor;

        return cursor - base;
    }
    case 4:
    {
        Py_UCS4* const base = (Py_UCS4*)state->text;
        Py_UCS4* const stop = base + limit;
        Py_UCS4* cursor = base + text_pos;

        while (cursor > stop && matches_ANY(encoding, node, cursor[-1]) ==
          match)
            --cursor;

        return cursor - base;
    }
    }

    /* Any other character width leaves the position where it was. */
    return text_pos;
}

/* Matches many ANY_Us, up to a limit.
*/
/* Scans forwards from 'text_pos' for as long as the result of matches_ANY_U()
 * for the character equals 'match', without going past 'limit'.
 *
 * Returns the position at which the scan stopped.
 */
Py_LOCAL_INLINE(Py_ssize_t) match_many_ANY_U(RE_State* state, RE_Node* node,
  Py_ssize_t text_pos, Py_ssize_t limit, BOOL match) {
    RE_EncodingTable* encoding = state->encoding;

    /* The text is addressed according to its character width. */
    switch (state->charsize) {
    case 1:
    {
        Py_UCS1* const base = (Py_UCS1*)state->text;
        Py_UCS1* const stop = base + limit;
        Py_UCS1* cursor = base + text_pos;

        while (cursor < stop && matches_ANY_U(encoding, node, *cursor) ==
          match)
            ++cursor;

        return cursor - base;
    }
    case 2:
    {
        Py_UCS2* const base = (Py_UCS2*)state->text;
        Py_UCS2* const stop = base + limit;
        Py_UCS2* cursor = base + text_pos;

        while (cursor < stop && matches_ANY_U(encoding, node, *cursor) ==
          match)
            ++cursor;

        return cursor - base;
    }
    case 4:
    {
        Py_UCS4* const base = (Py_UCS4*)state->text;
        Py_UCS4* const stop = base + limit;
        Py_UCS4* cursor = base + text_pos;

        while (cursor < stop && matches_ANY_U(encoding, node, *cursor) ==
          match)
            ++cursor;

        return cursor - base;
    }
    }

    /* Any other character width leaves the position where it was. */
    return text_pos;
}

/* Matches many ANY_Us, up to a limit, backwards.
*/
/* Scans backwards from 'text_pos' for as long as the result of
 * matches_ANY_U() for the preceding character equals 'match', without going
 * below 'limit'.
 *
 * Returns the position at which the scan stopped.
 */
Py_LOCAL_INLINE(Py_ssize_t) match_many_ANY_U_REV(RE_State* state, RE_Node*
  node, Py_ssize_t text_pos, Py_ssize_t limit, BOOL match) {
    RE_EncodingTable* encoding = state->encoding;

    /* The text is addressed according to its character width. */
    switch (state->charsize) {
    case 1:
    {
        Py_UCS1* const base = (Py_UCS1*)state->text;
        Py_UCS1* const stop = base + limit;
        Py_UCS1* cursor = base + text_pos;

        while (cursor > stop && matches_ANY_U(encoding, node, cursor[-1]) ==
          match)
            --cursor;

        return cursor - base;
    }
    case 2:
    {
        Py_UCS2* const base = (Py_UCS2*)state->text;
        Py_UCS2* const stop = base + limit;
        Py_UCS2* cursor = base + text_pos;

        while (cursor > stop && matches_ANY_U(encoding, node, cursor[-1]) ==
          match)
            --cursor;

        return cursor - base;
    }
    case 4:
    {
        Py_UCS4* const base = (Py_UCS4*)state->text;
        Py_UCS4* const stop = base + limit;
        Py_UCS4* cursor = base + text_pos;

        while (cursor > stop && matches_ANY_U(encoding, node, cursor[-1]) ==
          match)
            --cursor;

        return cursor - base;
    }
    }

    /* Any other character width leaves the position where it was. */
    return text_pos;
}

/* Matches many CHARACTERs, up to a limit.
*/
/* Scans forwards from 'text_pos' over characters compared with the node's
 * first value, without going past 'limit'. A character is accepted when
 * "it equals the value" agrees with "node->match equals 'match'".
 *
 * Returns the position at which the scan stopped.
 */
Py_LOCAL_INLINE(Py_ssize_t) match_many_CHARACTER(RE_State* state, RE_Node*
  node, Py_ssize_t text_pos, Py_ssize_t limit, BOOL match) {
    Py_UCS4 ch = node->values[0];
    BOOL wanted = node->match == match;

    /* The text is addressed according to its character width. */
    switch (state->charsize) {
    case 1:
    {
        Py_UCS1* const base = (Py_UCS1*)state->text;
        Py_UCS1* const stop = base + limit;
        Py_UCS1* cursor = base + text_pos;

        while (cursor < stop && (*cursor == ch) == wanted)
            ++cursor;

        return cursor - base;
    }
    case 2:
    {
        Py_UCS2* const base = (Py_UCS2*)state->text;
        Py_UCS2* const stop = base + limit;
        Py_UCS2* cursor = base + text_pos;

        while (cursor < stop && (*cursor == ch) == wanted)
            ++cursor;

        return cursor - base;
    }
    case 4:
    {
        Py_UCS4* const base = (Py_UCS4*)state->text;
        Py_UCS4* const stop = base + limit;
        Py_UCS4* cursor = base + text_pos;

        while (cursor < stop && (*cursor == ch) == wanted)
            ++cursor;

        return cursor - base;
    }
    }

    /* Any other character width leaves the position where it was. */
    return text_pos;
}

/* Matches many CHARACTERs, up to a limit, ignoring case.
*/
/* Scans forwards from 'text_pos', without going past 'limit'. The case
 * variants of the node's first value are collected once with the encoding's
 * all_cases() callback; a character is accepted when "it is one of the
 * variants" agrees with "node->match equals 'match'".
 *
 * Returns the position at which the scan stopped.
 */
Py_LOCAL_INLINE(Py_ssize_t) match_many_CHARACTER_IGN(RE_State* state, RE_Node*
  node, Py_ssize_t text_pos, Py_ssize_t limit, BOOL match) {
    Py_UCS4 cases[RE_MAX_CASES];
    int case_count;
    BOOL wanted = node->match == match;

    case_count = state->encoding->all_cases(state->locale_info,
      node->values[0], cases);

    /* The text is addressed according to its character width. */
    switch (state->charsize) {
    case 1:
    {
        Py_UCS1* const base = (Py_UCS1*)state->text;
        Py_UCS1* const stop = base + limit;
        Py_UCS1* cursor = base + text_pos;

        while (cursor < stop && any_case(*cursor, case_count, cases) ==
          wanted)
            ++cursor;

        return cursor - base;
    }
    case 2:
    {
        Py_UCS2* const base = (Py_UCS2*)state->text;
        Py_UCS2* const stop = base + limit;
        Py_UCS2* cursor = base + text_pos;

        while (cursor < stop && any_case(*cursor, case_count, cases) ==
          wanted)
            ++cursor;

        return cursor - base;
    }
    case 4:
    {
        Py_UCS4* const base = (Py_UCS4*)state->text;
        Py_UCS4* const stop = base + limit;
        Py_UCS4* cursor = base + text_pos;

        while (cursor < stop && any_case(*cursor, case_count, cases) ==
          wanted)
            ++cursor;

        return cursor - base;
    }
    }

    /* Any other character width leaves the position where it was. */
    return text_pos;
}

/* Matches many CHARACTERs, up to a limit, backwards, ignoring case.
*/
/* Scans backwards from 'text_pos', without going below 'limit'. The case
 * variants of the node's first value are collected once with the encoding's
 * all_cases() callback; the preceding character is accepted when "it is one
 * of the variants" agrees with "node->match equals 'match'".
 *
 * Returns the position at which the scan stopped.
 */
Py_LOCAL_INLINE(Py_ssize_t) match_many_CHARACTER_IGN_REV(RE_State* state,
  RE_Node* node, Py_ssize_t text_pos, Py_ssize_t limit, BOOL match) {
    Py_UCS4 cases[RE_MAX_CASES];
    int case_count;
    BOOL wanted = node->match == match;

    case_count = state->encoding->all_cases(state->locale_info,
      node->values[0], cases);

    /* The text is addressed according to its character width. */
    switch (state->charsize) {
    case 1:
    {
        Py_UCS1* const base = (Py_UCS1*)state->text;
        Py_UCS1* const stop = base + limit;
        Py_UCS1* cursor = base + text_pos;

        while (cursor > stop && any_case(cursor[-1], case_count, cases) ==
          wanted)
            --cursor;

        return cursor - base;
    }
    case 2:
    {
        Py_UCS2* const base = (Py_UCS2*)state->text;
        Py_UCS2* const stop = base + limit;
        Py_UCS2* cursor = base + text_pos;

        while (cursor > stop && any_case(cursor[-1], case_count, cases) ==
          wanted)
            --cursor;

        return cursor - base;
    }
    case 4:
    {
        Py_UCS4* const base = (Py_UCS4*)state->text;
        Py_UCS4* const stop = base + limit;
        Py_UCS4* cursor = base + text_pos;

        while (cursor > stop && any_case(cursor[-1], case_count, cases) ==
          wanted)
            --cursor;

        return cursor - base;
    }
    }

    /* Any other character width leaves the position where it was. */
    return text_pos;
}

/* Matches many CHARACTERs, up to a limit, backwards.
*/
/* Scans backwards from 'text_pos' over characters compared with the node's
 * first value, without going below 'limit'. The preceding character is
 * accepted when "it equals the value" agrees with "node->match equals
 * 'match'".
 *
 * Returns the position at which the scan stopped.
 */
Py_LOCAL_INLINE(Py_ssize_t) match_many_CHARACTER_REV(RE_State* state, RE_Node*
  node, Py_ssize_t text_pos, Py_ssize_t limit, BOOL match) {
    Py_UCS4 ch = node->values[0];
    BOOL wanted = node->match == match;

    /* The text is addressed according to its character width. */
    switch (state->charsize) {
    case 1:
    {
        Py_UCS1* const base = (Py_UCS1*)state->text;
        Py_UCS1* const stop = base + limit;
        Py_UCS1* cursor = base + text_pos;

        while (cursor > stop && (cursor[-1] == ch) == wanted)
            --cursor;

        return cursor - base;
    }
    case 2:
    {
        Py_UCS2* const base = (Py_UCS2*)state->text;
        Py_UCS2* const stop = base + limit;
        Py_UCS2* cursor = base + text_pos;

        while (cursor > stop && (cursor[-1] == ch) == wanted)
            --cursor;

        return cursor - base;
    }
    case 4:
    {
        Py_UCS4* const base = (Py_UCS4*)state->text;
        Py_UCS4* const stop = base + limit;
        Py_UCS4* cursor = base + text_pos;

        while (cursor > stop && (cursor[-1] == ch) == wanted)
            --cursor;

        return cursor - base;
    }
    }

    /* Any other character width leaves the position where it was. */
    return text_pos;
}

/* Matches many PROPERTYs, up to a limit.
*/
/* Scans forwards from 'text_pos', without going past 'limit'. A character is
 * accepted when the result of matches_PROPERTY() agrees with "node->match
 * equals 'match'".
 *
 * Returns the position at which the scan stopped.
 */
Py_LOCAL_INLINE(Py_ssize_t) match_many_PROPERTY(RE_State* state, RE_Node* node,
  Py_ssize_t text_pos, Py_ssize_t limit, BOOL match) {
    RE_EncodingTable* encoding = state->encoding;
    RE_LocaleInfo* locale_info = state->locale_info;
    BOOL wanted = node->match == match;

    /* The text is addressed according to its character width. */
    switch (state->charsize) {
    case 1:
    {
        Py_UCS1* const base = (Py_UCS1*)state->text;
        Py_UCS1* const stop = base + limit;
        Py_UCS1* cursor = base + text_pos;

        while (cursor < stop && matches_PROPERTY(encoding, locale_info, node,
          *cursor) == wanted)
            ++cursor;

        return cursor - base;
    }
    case 2:
    {
        Py_UCS2* const base = (Py_UCS2*)state->text;
        Py_UCS2* const stop = base + limit;
        Py_UCS2* cursor = base + text_pos;

        while (cursor < stop && matches_PROPERTY(encoding, locale_info, node,
          *cursor) == wanted)
            ++cursor;

        return cursor - base;
    }
    case 4:
    {
        Py_UCS4* const base = (Py_UCS4*)state->text;
        Py_UCS4* const stop = base + limit;
        Py_UCS4* cursor = base + text_pos;

        while (cursor < stop && matches_PROPERTY(encoding, locale_info, node,
          *cursor) == wanted)
            ++cursor;

        return cursor - base;
    }
    }

    /* Any other character width leaves the position where it was. */
    return text_pos;
}

/* Matches many PROPERTYs, up to a limit, ignoring case.
*/
/* Advances text_pos forwards for as long as it is below limit and
 * matches_PROPERTY_IGN(), applied to the character at the current position,
 * agrees with the effective polarity (node->match == match).
 *
 * Character widths of 1, 2 and 4 bytes are handled, selected by
 * state->charsize; any other width leaves text_pos untouched.
 *
 * Returns the position at which the scan stopped.
 */
Py_LOCAL_INLINE(Py_ssize_t) match_many_PROPERTY_IGN(RE_State* state, RE_Node*
  node, Py_ssize_t text_pos, Py_ssize_t limit, BOOL match) {
    RE_EncodingTable* enc = state->encoding;
    RE_LocaleInfo* loc = state->locale_info;
    BOOL polarity = node->match == match;

/* Forward scan over the text viewed as an array of TYPE. */
#define MATCH_MANY_PROPERTY_IGN_SCAN(TYPE) \
    { \
        TYPE* chars = (TYPE*)state->text; \
        \
        for (; text_pos < limit; ++text_pos) { \
            if (matches_PROPERTY_IGN(enc, loc, node, chars[text_pos]) != \
              polarity) \
                break; \
        } \
    }

    switch (state->charsize) {
    case 1:
        MATCH_MANY_PROPERTY_IGN_SCAN(Py_UCS1)
        break;
    case 2:
        MATCH_MANY_PROPERTY_IGN_SCAN(Py_UCS2)
        break;
    case 4:
        MATCH_MANY_PROPERTY_IGN_SCAN(Py_UCS4)
        break;
    }

#undef MATCH_MANY_PROPERTY_IGN_SCAN

    return text_pos;
}

/* Matches many PROPERTYs, up to a limit, backwards, ignoring case.
*/
/* Steps text_pos backwards for as long as it is above limit and
 * matches_PROPERTY_IGN(), applied to the character just before the current
 * position, agrees with the effective polarity (node->match == match).
 *
 * Character widths of 1, 2 and 4 bytes are handled, selected by
 * state->charsize; any other width leaves text_pos untouched.
 *
 * Returns the position at which the scan stopped.
 */
Py_LOCAL_INLINE(Py_ssize_t) match_many_PROPERTY_IGN_REV(RE_State* state,
  RE_Node* node, Py_ssize_t text_pos, Py_ssize_t limit, BOOL match) {
    RE_EncodingTable* enc = state->encoding;
    RE_LocaleInfo* loc = state->locale_info;
    BOOL polarity = node->match == match;

/* Backward scan over the text viewed as an array of TYPE. */
#define MATCH_MANY_PROPERTY_IGN_REV_SCAN(TYPE) \
    { \
        TYPE* chars = (TYPE*)state->text; \
        \
        for (; text_pos > limit; --text_pos) { \
            if (matches_PROPERTY_IGN(enc, loc, node, chars[text_pos - 1]) != \
              polarity) \
                break; \
        } \
    }

    switch (state->charsize) {
    case 1:
        MATCH_MANY_PROPERTY_IGN_REV_SCAN(Py_UCS1)
        break;
    case 2:
        MATCH_MANY_PROPERTY_IGN_REV_SCAN(Py_UCS2)
        break;
    case 4:
        MATCH_MANY_PROPERTY_IGN_REV_SCAN(Py_UCS4)
        break;
    }

#undef MATCH_MANY_PROPERTY_IGN_REV_SCAN

    return text_pos;
}

/* Matches many PROPERTYs, up to a limit, backwards.
*/
/* Steps text_pos backwards for as long as it is above limit and
 * matches_PROPERTY(), applied to the character just before the current
 * position, agrees with the effective polarity (node->match == match).
 *
 * Character widths of 1, 2 and 4 bytes are handled, selected by
 * state->charsize; any other width leaves text_pos untouched.
 *
 * Returns the position at which the scan stopped.
 */
Py_LOCAL_INLINE(Py_ssize_t) match_many_PROPERTY_REV(RE_State* state, RE_Node*
  node, Py_ssize_t text_pos, Py_ssize_t limit, BOOL match) {
    RE_EncodingTable* enc = state->encoding;
    RE_LocaleInfo* loc = state->locale_info;
    BOOL polarity = node->match == match;

/* Backward scan over the text viewed as an array of TYPE. */
#define MATCH_MANY_PROPERTY_REV_SCAN(TYPE) \
    { \
        TYPE* chars = (TYPE*)state->text; \
        \
        for (; text_pos > limit; --text_pos) { \
            if (matches_PROPERTY(enc, loc, node, chars[text_pos - 1]) != \
              polarity) \
                break; \
        } \
    }

    switch (state->charsize) {
    case 1:
        MATCH_MANY_PROPERTY_REV_SCAN(Py_UCS1)
        break;
    case 2:
        MATCH_MANY_PROPERTY_REV_SCAN(Py_UCS2)
        break;
    case 4:
        MATCH_MANY_PROPERTY_REV_SCAN(Py_UCS4)
        break;
    }

#undef MATCH_MANY_PROPERTY_REV_SCAN

    return text_pos;
}

/* Matches many RANGEs, up to a limit.
*/
/* Advances text_pos forwards for as long as it is below limit and
 * matches_RANGE(), applied to the character at the current position, agrees
 * with the effective polarity (node->match == match).
 *
 * Character widths of 1, 2 and 4 bytes are handled, selected by
 * state->charsize; any other width leaves text_pos untouched.
 *
 * Returns the position at which the scan stopped.
 */
Py_LOCAL_INLINE(Py_ssize_t) match_many_RANGE(RE_State* state, RE_Node* node,
  Py_ssize_t text_pos, Py_ssize_t limit, BOOL match) {
    RE_EncodingTable* enc = state->encoding;
    RE_LocaleInfo* loc = state->locale_info;
    BOOL polarity = node->match == match;

/* Forward scan over the text viewed as an array of TYPE. */
#define MATCH_MANY_RANGE_SCAN(TYPE) \
    { \
        TYPE* chars = (TYPE*)state->text; \
        \
        for (; text_pos < limit; ++text_pos) { \
            if (matches_RANGE(enc, loc, node, chars[text_pos]) != polarity) \
                break; \
        } \
    }

    switch (state->charsize) {
    case 1:
        MATCH_MANY_RANGE_SCAN(Py_UCS1)
        break;
    case 2:
        MATCH_MANY_RANGE_SCAN(Py_UCS2)
        break;
    case 4:
        MATCH_MANY_RANGE_SCAN(Py_UCS4)
        break;
    }

#undef MATCH_MANY_RANGE_SCAN

    return text_pos;
}

/* Matches many RANGEs, up to a limit, ignoring case.
*/
/* Advances text_pos forwards for as long as it is below limit and
 * matches_RANGE_IGN(), applied to the character at the current position,
 * agrees with the effective polarity (node->match == match).
 *
 * Character widths of 1, 2 and 4 bytes are handled, selected by
 * state->charsize; any other width leaves text_pos untouched.
 *
 * Returns the position at which the scan stopped.
 */
Py_LOCAL_INLINE(Py_ssize_t) match_many_RANGE_IGN(RE_State* state, RE_Node*
  node, Py_ssize_t text_pos, Py_ssize_t limit, BOOL match) {
    RE_EncodingTable* enc = state->encoding;
    RE_LocaleInfo* loc = state->locale_info;
    BOOL polarity = node->match == match;

/* Forward scan over the text viewed as an array of TYPE. */
#define MATCH_MANY_RANGE_IGN_SCAN(TYPE) \
    { \
        TYPE* chars = (TYPE*)state->text; \
        \
        for (; text_pos < limit; ++text_pos) { \
            if (matches_RANGE_IGN(enc, loc, node, chars[text_pos]) != \
              polarity) \
                break; \
        } \
    }

    switch (state->charsize) {
    case 1:
        MATCH_MANY_RANGE_IGN_SCAN(Py_UCS1)
        break;
    case 2:
        MATCH_MANY_RANGE_IGN_SCAN(Py_UCS2)
        break;
    case 4:
        MATCH_MANY_RANGE_IGN_SCAN(Py_UCS4)
        break;
    }

#undef MATCH_MANY_RANGE_IGN_SCAN

    return text_pos;
}

/* Matches many RANGEs, up to a limit, backwards, ignoring case.
*/
/* Steps text_pos backwards for as long as it is above limit and
 * matches_RANGE_IGN(), applied to the character just before the current
 * position, agrees with the effective polarity (node->match == match).
 *
 * Character widths of 1, 2 and 4 bytes are handled, selected by
 * state->charsize; any other width leaves text_pos untouched.
 *
 * Returns the position at which the scan stopped.
 */
Py_LOCAL_INLINE(Py_ssize_t) match_many_RANGE_IGN_REV(RE_State* state, RE_Node*
  node, Py_ssize_t text_pos, Py_ssize_t limit, BOOL match) {
    RE_EncodingTable* enc = state->encoding;
    RE_LocaleInfo* loc = state->locale_info;
    BOOL polarity = node->match == match;

/* Backward scan over the text viewed as an array of TYPE. */
#define MATCH_MANY_RANGE_IGN_REV_SCAN(TYPE) \
    { \
        TYPE* chars = (TYPE*)state->text; \
        \
        for (; text_pos > limit; --text_pos) { \
            if (matches_RANGE_IGN(enc, loc, node, chars[text_pos - 1]) != \
              polarity) \
                break; \
        } \
    }

    switch (state->charsize) {
    case 1:
        MATCH_MANY_RANGE_IGN_REV_SCAN(Py_UCS1)
        break;
    case 2:
        MATCH_MANY_RANGE_IGN_REV_SCAN(Py_UCS2)
        break;
    case 4:
        MATCH_MANY_RANGE_IGN_REV_SCAN(Py_UCS4)
        break;
    }

#undef MATCH_MANY_RANGE_IGN_REV_SCAN

    return text_pos;
}

/* Matches many RANGEs, up to a limit, backwards.
*/
/* Steps text_pos backwards for as long as it is above limit and
 * matches_RANGE(), applied to the character just before the current position,
 * agrees with the effective polarity (node->match == match).
 *
 * Character widths of 1, 2 and 4 bytes are handled, selected by
 * state->charsize; any other width leaves text_pos untouched.
 *
 * Returns the position at which the scan stopped.
 */
Py_LOCAL_INLINE(Py_ssize_t) match_many_RANGE_REV(RE_State* state, RE_Node*
  node, Py_ssize_t text_pos, Py_ssize_t limit, BOOL match) {
    RE_EncodingTable* enc = state->encoding;
    RE_LocaleInfo* loc = state->locale_info;
    BOOL polarity = node->match == match;

/* Backward scan over the text viewed as an array of TYPE. */
#define MATCH_MANY_RANGE_REV_SCAN(TYPE) \
    { \
        TYPE* chars = (TYPE*)state->text; \
        \
        for (; text_pos > limit; --text_pos) { \
            if (matches_RANGE(enc, loc, node, chars[text_pos - 1]) != \
              polarity) \
                break; \
        } \
    }

    switch (state->charsize) {
    case 1:
        MATCH_MANY_RANGE_REV_SCAN(Py_UCS1)
        break;
    case 2:
        MATCH_MANY_RANGE_REV_SCAN(Py_UCS2)
        break;
    case 4:
        MATCH_MANY_RANGE_REV_SCAN(Py_UCS4)
        break;
    }

#undef MATCH_MANY_RANGE_REV_SCAN

    return text_pos;
}

/* Matches many SETs, up to a limit.
*/
/* Advances text_pos forwards for as long as it is below limit and
 * matches_SET(), applied to the character at the current position, agrees
 * with the effective polarity (node->match == match).
 *
 * Character widths of 1, 2 and 4 bytes are handled, selected by
 * state->charsize; any other width leaves text_pos untouched.
 *
 * Returns the position at which the scan stopped.
 */
Py_LOCAL_INLINE(Py_ssize_t) match_many_SET(RE_State* state, RE_Node* node,
  Py_ssize_t text_pos, Py_ssize_t limit, BOOL match) {
    RE_EncodingTable* enc = state->encoding;
    RE_LocaleInfo* loc = state->locale_info;
    BOOL polarity = node->match == match;

/* Forward scan over the text viewed as an array of TYPE. */
#define MATCH_MANY_SET_SCAN(TYPE) \
    { \
        TYPE* chars = (TYPE*)state->text; \
        \
        for (; text_pos < limit; ++text_pos) { \
            if (matches_SET(enc, loc, node, chars[text_pos]) != polarity) \
                break; \
        } \
    }

    switch (state->charsize) {
    case 1:
        MATCH_MANY_SET_SCAN(Py_UCS1)
        break;
    case 2:
        MATCH_MANY_SET_SCAN(Py_UCS2)
        break;
    case 4:
        MATCH_MANY_SET_SCAN(Py_UCS4)
        break;
    }

#undef MATCH_MANY_SET_SCAN

    return text_pos;
}

/* Matches many SETs, up to a limit, ignoring case.
*/
/* Advances text_pos forwards for as long as it is below limit and
 * matches_SET_IGN(), applied to the character at the current position, agrees
 * with the effective polarity (node->match == match).
 *
 * Character widths of 1, 2 and 4 bytes are handled, selected by
 * state->charsize; any other width leaves text_pos untouched.
 *
 * Returns the position at which the scan stopped.
 */
Py_LOCAL_INLINE(Py_ssize_t) match_many_SET_IGN(RE_State* state, RE_Node* node,
  Py_ssize_t text_pos, Py_ssize_t limit, BOOL match) {
    RE_EncodingTable* enc = state->encoding;
    RE_LocaleInfo* loc = state->locale_info;
    BOOL polarity = node->match == match;

/* Forward scan over the text viewed as an array of TYPE. */
#define MATCH_MANY_SET_IGN_SCAN(TYPE) \
    { \
        TYPE* chars = (TYPE*)state->text; \
        \
        for (; text_pos < limit; ++text_pos) { \
            if (matches_SET_IGN(enc, loc, node, chars[text_pos]) != \
              polarity) \
                break; \
        } \
    }

    switch (state->charsize) {
    case 1:
        MATCH_MANY_SET_IGN_SCAN(Py_UCS1)
        break;
    case 2:
        MATCH_MANY_SET_IGN_SCAN(Py_UCS2)
        break;
    case 4:
        MATCH_MANY_SET_IGN_SCAN(Py_UCS4)
        break;
    }

#undef MATCH_MANY_SET_IGN_SCAN

    return text_pos;
}

/* Matches many SETs, up to a limit, backwards, ignoring case.
*/
/* Steps text_pos backwards for as long as it is above limit and
 * matches_SET_IGN(), applied to the character just before the current
 * position, agrees with the effective polarity (node->match == match).
 *
 * Character widths of 1, 2 and 4 bytes are handled, selected by
 * state->charsize; any other width leaves text_pos untouched.
 *
 * Returns the position at which the scan stopped.
 */
Py_LOCAL_INLINE(Py_ssize_t) match_many_SET_IGN_REV(RE_State* state, RE_Node*
  node, Py_ssize_t text_pos, Py_ssize_t limit, BOOL match) {
    RE_EncodingTable* enc = state->encoding;
    RE_LocaleInfo* loc = state->locale_info;
    BOOL polarity = node->match == match;

/* Backward scan over the text viewed as an array of TYPE. */
#define MATCH_MANY_SET_IGN_REV_SCAN(TYPE) \
    { \
        TYPE* chars = (TYPE*)state->text; \
        \
        for (; text_pos > limit; --text_pos) { \
            if (matches_SET_IGN(enc, loc, node, chars[text_pos - 1]) != \
              polarity) \
                break; \
        } \
    }

    switch (state->charsize) {
    case 1:
        MATCH_MANY_SET_IGN_REV_SCAN(Py_UCS1)
        break;
    case 2:
        MATCH_MANY_SET_IGN_REV_SCAN(Py_UCS2)
        break;
    case 4:
        MATCH_MANY_SET_IGN_REV_SCAN(Py_UCS4)
        break;
    }

#undef MATCH_MANY_SET_IGN_REV_SCAN

    return text_pos;
}

/* Matches many SETs, up to a limit, backwards.
*/
/* Steps text_pos backwards for as long as it is above limit and
 * matches_SET(), applied to the character just before the current position,
 * agrees with the effective polarity (node->match == match).
 *
 * Character widths of 1, 2 and 4 bytes are handled, selected by
 * state->charsize; any other width leaves text_pos untouched.
 *
 * Returns the position at which the scan stopped.
 */
Py_LOCAL_INLINE(Py_ssize_t) match_many_SET_REV(RE_State* state, RE_Node* node,
  Py_ssize_t text_pos, Py_ssize_t limit, BOOL match) {
    RE_EncodingTable* enc = state->encoding;
    RE_LocaleInfo* loc = state->locale_info;
    BOOL polarity = node->match == match;

/* Backward scan over the text viewed as an array of TYPE. */
#define MATCH_MANY_SET_REV_SCAN(TYPE) \
    { \
        TYPE* chars = (TYPE*)state->text; \
        \
        for (; text_pos > limit; --text_pos) { \
            if (matches_SET(enc, loc, node, chars[text_pos - 1]) != polarity) \
                break; \
        } \
    }

    switch (state->charsize) {
    case 1:
        MATCH_MANY_SET_REV_SCAN(Py_UCS1)
        break;
    case 2:
        MATCH_MANY_SET_REV_SCAN(Py_UCS2)
        break;
    case 4:
        MATCH_MANY_SET_REV_SCAN(Py_UCS4)
        break;
    }

#undef MATCH_MANY_SET_REV_SCAN

    return text_pos;
}

/* Counts a repeated character pattern.
*/ +Py_LOCAL_INLINE(size_t) count_one(RE_State* state, RE_Node* node, Py_ssize_t + text_pos, size_t max_count, BOOL* is_partial) { + size_t count; + + *is_partial = FALSE; + + if (max_count < 1) + return 0; + + switch (node->op) { + case RE_OP_ANY: + count = min_size_t((size_t)(state->slice_end - text_pos), max_count); + + count = (size_t)(match_many_ANY(state, node, text_pos, text_pos + + (Py_ssize_t)count, TRUE) - text_pos); + + *is_partial = count == (size_t)(state->text_length - text_pos) && count + < max_count && state->partial_side == RE_PARTIAL_RIGHT; + + return count; + case RE_OP_ANY_ALL: + count = min_size_t((size_t)(state->slice_end - text_pos), max_count); + + *is_partial = count == (size_t)(state->text_length - text_pos) && count + < max_count && state->partial_side == RE_PARTIAL_RIGHT; + + return count; + case RE_OP_ANY_ALL_REV: + count = min_size_t((size_t)(text_pos - state->slice_start), max_count); + + *is_partial = count == (size_t)(text_pos) && count < max_count && + state->partial_side == RE_PARTIAL_LEFT; + + return count; + case RE_OP_ANY_REV: + count = min_size_t((size_t)(text_pos - state->slice_start), max_count); + + count = (size_t)(text_pos - match_many_ANY_REV(state, node, text_pos, + text_pos - (Py_ssize_t)count, TRUE)); + + *is_partial = count == (size_t)(text_pos) && count < max_count && + state->partial_side == RE_PARTIAL_LEFT; + + return count; + case RE_OP_ANY_U: + count = min_size_t((size_t)(state->slice_end - text_pos), max_count); + + count = (size_t)(match_many_ANY_U(state, node, text_pos, text_pos + + (Py_ssize_t)count, TRUE) - text_pos); + + *is_partial = count == (size_t)(state->text_length - text_pos) && count + < max_count && state->partial_side == RE_PARTIAL_RIGHT; + + return count; + case RE_OP_ANY_U_REV: + count = min_size_t((size_t)(text_pos - state->slice_start), max_count); + + count = (size_t)(text_pos - match_many_ANY_U_REV(state, node, text_pos, + text_pos - (Py_ssize_t)count, TRUE)); + + *is_partial = count == 
(size_t)(text_pos) && count < max_count && + state->partial_side == RE_PARTIAL_LEFT; + + return count; + case RE_OP_CHARACTER: + count = min_size_t((size_t)(state->slice_end - text_pos), max_count); + + count = (size_t)(match_many_CHARACTER(state, node, text_pos, text_pos + + (Py_ssize_t)count, TRUE) - text_pos); + + *is_partial = count == (size_t)(state->text_length - text_pos) && count + < max_count && state->partial_side == RE_PARTIAL_RIGHT; + + return count; + case RE_OP_CHARACTER_IGN: + count = min_size_t((size_t)(state->slice_end - text_pos), max_count); + + count = (size_t)(match_many_CHARACTER_IGN(state, node, text_pos, + text_pos + (Py_ssize_t)count, TRUE) - text_pos); + + *is_partial = count == (size_t)(state->text_length - text_pos) && count + < max_count && state->partial_side == RE_PARTIAL_RIGHT; + + return count; + case RE_OP_CHARACTER_IGN_REV: + count = min_size_t((size_t)(text_pos - state->slice_start), max_count); + + count = (size_t)(text_pos - match_many_CHARACTER_IGN_REV(state, node, + text_pos, text_pos - (Py_ssize_t)count, TRUE)); + + *is_partial = count == (size_t)(text_pos) && count < max_count && + state->partial_side == RE_PARTIAL_LEFT; + + return count; + case RE_OP_CHARACTER_REV: + count = min_size_t((size_t)(text_pos - state->slice_start), max_count); + + count = (size_t)(text_pos - match_many_CHARACTER_REV(state, node, + text_pos, text_pos - (Py_ssize_t)count, TRUE)); + + *is_partial = count == (size_t)(text_pos) && count < max_count && + state->partial_side == RE_PARTIAL_LEFT; + + return count; + case RE_OP_PROPERTY: + count = min_size_t((size_t)(state->slice_end - text_pos), max_count); + + count = (size_t)(match_many_PROPERTY(state, node, text_pos, text_pos + + (Py_ssize_t)count, TRUE) - text_pos); + + *is_partial = count == (size_t)(state->text_length - text_pos) && count + < max_count && state->partial_side == RE_PARTIAL_RIGHT; + + return count; + case RE_OP_PROPERTY_IGN: + count = min_size_t((size_t)(state->slice_end - text_pos), 
max_count); + + count = (size_t)(match_many_PROPERTY_IGN(state, node, text_pos, + text_pos + (Py_ssize_t)count, TRUE) - text_pos); + + *is_partial = count == (size_t)(state->text_length - text_pos) && count + < max_count && state->partial_side == RE_PARTIAL_RIGHT; + + return count; + case RE_OP_PROPERTY_IGN_REV: + count = min_size_t((size_t)(text_pos - state->slice_start), max_count); + + count = (size_t)(text_pos - match_many_PROPERTY_IGN_REV(state, node, + text_pos, text_pos - (Py_ssize_t)count, TRUE)); + + *is_partial = count == (size_t)(text_pos) && count < max_count && + state->partial_side == RE_PARTIAL_LEFT; + + return count; + case RE_OP_PROPERTY_REV: + count = min_size_t((size_t)(text_pos - state->slice_start), max_count); + + count = (size_t)(text_pos - match_many_PROPERTY_REV(state, node, + text_pos, text_pos - (Py_ssize_t)count, TRUE)); + + *is_partial = count == (size_t)(text_pos) && count < max_count && + state->partial_side == RE_PARTIAL_LEFT; + + return count; + case RE_OP_RANGE: + count = min_size_t((size_t)(state->slice_end - text_pos), max_count); + + count = (size_t)(match_many_RANGE(state, node, text_pos, text_pos + + (Py_ssize_t)count, TRUE) - text_pos); + + *is_partial = count == (size_t)(state->text_length - text_pos) && count + < max_count && state->partial_side == RE_PARTIAL_RIGHT; + + return count; + case RE_OP_RANGE_IGN: + count = min_size_t((size_t)(state->slice_end - text_pos), max_count); + + count = (size_t)(match_many_RANGE_IGN(state, node, text_pos, text_pos + + (Py_ssize_t)count, TRUE) - text_pos); + + *is_partial = count == (size_t)(state->text_length - text_pos) && count + < max_count && state->partial_side == RE_PARTIAL_RIGHT; + + return count; + case RE_OP_RANGE_IGN_REV: + count = min_size_t((size_t)(text_pos - state->slice_start), max_count); + + count = (size_t)(text_pos - match_many_RANGE_IGN_REV(state, node, + text_pos, text_pos - (Py_ssize_t)count, TRUE)); + + *is_partial = count == (size_t)(text_pos) && count < max_count 
&& + state->partial_side == RE_PARTIAL_LEFT; + + return count; + case RE_OP_RANGE_REV: + count = min_size_t((size_t)(text_pos - state->slice_start), max_count); + + count = (size_t)(text_pos - match_many_RANGE_REV(state, node, text_pos, + text_pos - (Py_ssize_t)count, TRUE)); + + *is_partial = count == (size_t)(text_pos) && count < max_count && + state->partial_side == RE_PARTIAL_LEFT; + + return count; + case RE_OP_SET_DIFF: + case RE_OP_SET_INTER: + case RE_OP_SET_SYM_DIFF: + case RE_OP_SET_UNION: + count = min_size_t((size_t)(state->slice_end - text_pos), max_count); + + count = (size_t)(match_many_SET(state, node, text_pos, text_pos + + (Py_ssize_t)count, TRUE) - text_pos); + + *is_partial = count == (size_t)(state->text_length - text_pos) && count + < max_count && state->partial_side == RE_PARTIAL_RIGHT; + + return count; + case RE_OP_SET_DIFF_IGN: + case RE_OP_SET_INTER_IGN: + case RE_OP_SET_SYM_DIFF_IGN: + case RE_OP_SET_UNION_IGN: + count = min_size_t((size_t)(state->slice_end - text_pos), max_count); + + count = (size_t)(match_many_SET_IGN(state, node, text_pos, text_pos + + (Py_ssize_t)count, TRUE) - text_pos); + + *is_partial = count == (size_t)(state->text_length - text_pos) && count + < max_count && state->partial_side == RE_PARTIAL_RIGHT; + + return count; + case RE_OP_SET_DIFF_IGN_REV: + case RE_OP_SET_INTER_IGN_REV: + case RE_OP_SET_SYM_DIFF_IGN_REV: + case RE_OP_SET_UNION_IGN_REV: + count = min_size_t((size_t)(text_pos - state->slice_start), max_count); + + count = (size_t)(text_pos - match_many_SET_IGN_REV(state, node, + text_pos, text_pos - (Py_ssize_t)count, TRUE)); + + *is_partial = count == (size_t)(text_pos) && count < max_count && + state->partial_side == RE_PARTIAL_LEFT; + + return count; + case RE_OP_SET_DIFF_REV: + case RE_OP_SET_INTER_REV: + case RE_OP_SET_SYM_DIFF_REV: + case RE_OP_SET_UNION_REV: + count = min_size_t((size_t)(text_pos - state->slice_start), max_count); + + count = (size_t)(text_pos - match_many_SET_REV(state, node, 
text_pos, + text_pos - (Py_ssize_t)count, TRUE)); + + *is_partial = count == (size_t)(text_pos) && count < max_count && + state->partial_side == RE_PARTIAL_LEFT; + + return count; + } + + return 0; +} + +/* Performs a simple string search. */ +Py_LOCAL_INLINE(Py_ssize_t) simple_string_search(RE_State* state, RE_Node* + node, Py_ssize_t text_pos, Py_ssize_t limit, BOOL* is_partial) { + Py_ssize_t length; + RE_CODE* values; + Py_UCS4 check_char; + + length = (Py_ssize_t)node->value_count; + values = node->values; + check_char = values[0]; + + *is_partial = FALSE; + + switch (state->charsize) { + case 1: + { + Py_UCS1* text = (Py_UCS1*)state->text; + Py_UCS1* text_ptr = text + text_pos; + Py_UCS1* limit_ptr = text + limit; + + while (text_ptr < limit_ptr) { + if (text_ptr[0] == check_char) { + Py_ssize_t s_pos; + + s_pos = 1; + + for (;;) { + if (s_pos >= length) + /* End of search string. */ + return text_ptr - text; + + if (text_ptr + s_pos >= limit_ptr) { + /* Off the end of the text. */ + if (state->partial_side == RE_PARTIAL_RIGHT) { + /* Partial match. */ + *is_partial = TRUE; + return text_ptr - text; + } + + return -1; + + } + + if (!same_char(text_ptr[s_pos], values[s_pos])) + break; + + ++s_pos; + } + } + + ++text_ptr; + } + text_pos = text_ptr - text; + break; + } + case 2: + { + Py_UCS2* text = (Py_UCS2*)state->text; + Py_UCS2* text_ptr = text + text_pos; + Py_UCS2* limit_ptr = text + limit; + + while (text_ptr < limit_ptr) { + if (text_ptr[0] == check_char) { + Py_ssize_t s_pos; + + s_pos = 1; + + for (;;) { + if (s_pos >= length) + /* End of search string. */ + return text_ptr - text; + + if (text_ptr + s_pos >= limit_ptr) { + /* Off the end of the text. */ + if (state->partial_side == RE_PARTIAL_RIGHT) { + /* Partial match. 
*/ + *is_partial = TRUE; + return text_ptr - text; + } + + return -1; + + } + + if (!same_char(text_ptr[s_pos], values[s_pos])) + break; + + ++s_pos; + } + } + + ++text_ptr; + } + text_pos = text_ptr - text; + break; + } + case 4: + { + Py_UCS4* text = (Py_UCS4*)state->text; + Py_UCS4* text_ptr = text + text_pos; + Py_UCS4* limit_ptr = text + limit; + + while (text_ptr < limit_ptr) { + if (text_ptr[0] == check_char) { + Py_ssize_t s_pos; + + s_pos = 1; + + for (;;) { + if (s_pos >= length) + /* End of search string. */ + return text_ptr - text; + + if (text_ptr + s_pos >= limit_ptr) { + /* Off the end of the text. */ + if (state->partial_side == RE_PARTIAL_RIGHT) { + /* Partial match. */ + *is_partial = TRUE; + return text_ptr - text; + } + + return -1; + + } + + if (!same_char(text_ptr[s_pos], values[s_pos])) + break; + + ++s_pos; + } + } + + ++text_ptr; + } + text_pos = text_ptr - text; + break; + } + } + + /* Off the end of the text. */ + if (state->partial_side == RE_PARTIAL_RIGHT) { + /* Partial match. */ + *is_partial = TRUE; + return text_pos; + } + + return -1; +} + +/* Performs a simple string search, ignoring case. 
*/
/* Searches forwards from text_pos, stopping before limit, for the node's
 * literal string (node->values, node->value_count characters), comparing
 * without regard to case.
 *
 * The first character is tested against the case variants that
 * encoding->all_cases() produces for node->values[0]; the remaining
 * characters are compared with same_char_ign().
 *
 * Returns the position of the first occurrence. If a candidate runs into
 * limit before the whole string has been compared, or no candidate is found
 * at all, the outcome depends on state->partial_side: with RE_PARTIAL_RIGHT,
 * *is_partial is set and the candidate position (or the position where the
 * scan ended) is returned; otherwise -1 is returned.
 */
Py_LOCAL_INLINE(Py_ssize_t) simple_string_search_ign(RE_State* state, RE_Node*
  node, Py_ssize_t text_pos, Py_ssize_t limit, BOOL* is_partial) {
    Py_ssize_t needle_len = (Py_ssize_t)node->value_count;
    RE_CODE* needle = node->values;
    RE_EncodingTable* enc = state->encoding;
    RE_LocaleInfo* loc = state->locale_info;
    Py_UCS4 cases[RE_MAX_CASES];
    int case_count;

    case_count = enc->all_cases(loc, needle[0], cases);

    *is_partial = FALSE;

/* Forward search over the text viewed as an array of TYPE. */
#define SIMPLE_STRING_SEARCH_IGN_SCAN(TYPE) \
    { \
        TYPE* chars = (TYPE*)state->text; \
        \
        for (; text_pos < limit; ++text_pos) { \
            Py_ssize_t k; \
            \
            if (!any_case(chars[text_pos], case_count, cases)) \
                continue; \
            \
            for (k = 1; ; ++k) { \
                if (k >= needle_len) \
                    return text_pos; \
                \
                if (text_pos + k >= limit) { \
                    if (state->partial_side == RE_PARTIAL_RIGHT) { \
                        *is_partial = TRUE; \
                        return text_pos; \
                    } \
                    \
                    return -1; \
                } \
                \
                if (!same_char_ign(enc, loc, chars[text_pos + k], \
                  needle[k])) \
                    break; \
            } \
        } \
    }

    switch (state->charsize) {
    case 1:
        SIMPLE_STRING_SEARCH_IGN_SCAN(Py_UCS1)
        break;
    case 2:
        SIMPLE_STRING_SEARCH_IGN_SCAN(Py_UCS2)
        break;
    case 4:
        SIMPLE_STRING_SEARCH_IGN_SCAN(Py_UCS4)
        break;
    }

#undef SIMPLE_STRING_SEARCH_IGN_SCAN

    /* Ran off the end of the text without finding the string. */
    if (state->partial_side != RE_PARTIAL_RIGHT)
        return -1;

    *is_partial = TRUE;

    return text_pos;
}

/* Performs a simple string search, backwards, ignoring case.
*/ +Py_LOCAL_INLINE(Py_ssize_t) simple_string_search_ign_rev(RE_State* state, + RE_Node* node, Py_ssize_t text_pos, Py_ssize_t limit, BOOL* is_partial) { + Py_ssize_t length; + RE_CODE* values; + RE_EncodingTable* encoding; + RE_LocaleInfo* locale_info; + Py_UCS4 cases[RE_MAX_CASES]; + int case_count; + + length = (Py_ssize_t)node->value_count; + values = node->values; + encoding = state->encoding; + locale_info = state->locale_info; + case_count = encoding->all_cases(locale_info, values[length - 1], cases); + + *is_partial = FALSE; + + switch (state->charsize) { + case 1: + { + Py_UCS1* text = (Py_UCS1*)state->text; + Py_UCS1* text_ptr = text + text_pos; + Py_UCS1* limit_ptr = text + limit; + + while (text_ptr > limit_ptr) { + if (any_case(text_ptr[-1], case_count, cases)) { + Py_ssize_t s_pos; + + s_pos = 1; + + for (;;) { + if (s_pos >= length) + /* End of search string. */ + return text_ptr - text; + + if (text_ptr - s_pos <= limit_ptr) { + /* Off the end of the text. */ + if (state->partial_side == RE_PARTIAL_LEFT) { + /* Partial match. */ + *is_partial = TRUE; + return text_ptr - text; + } + + return -1; + + } + + if (!same_char_ign(encoding, locale_info, text_ptr[- s_pos + - 1], values[length - s_pos - 1])) + break; + + ++s_pos; + } + } + + --text_ptr; + } + text_pos = text_ptr - text; + break; + } + case 2: + { + Py_UCS2* text = (Py_UCS2*)state->text; + Py_UCS2* text_ptr = text + text_pos; + Py_UCS2* limit_ptr = text + limit; + + while (text_ptr > limit_ptr) { + if (any_case(text_ptr[-1], case_count, cases)) { + Py_ssize_t s_pos; + + s_pos = 1; + + for (;;) { + if (s_pos >= length) + /* End of search string. */ + return text_ptr - text; + + if (text_ptr - s_pos <= limit_ptr) { + /* Off the end of the text. */ + if (state->partial_side == RE_PARTIAL_LEFT) { + /* Partial match. 
*/ + *is_partial = TRUE; + return text_ptr - text; + } + + return -1; + + } + + if (!same_char_ign(encoding, locale_info, text_ptr[- s_pos + - 1], values[length - s_pos - 1])) + break; + + ++s_pos; + } + } + + --text_ptr; + } + text_pos = text_ptr - text; + break; + } + case 4: + { + Py_UCS4* text = (Py_UCS4*)state->text; + Py_UCS4* text_ptr = text + text_pos; + Py_UCS4* limit_ptr = text + limit; + + while (text_ptr > limit_ptr) { + if (any_case(text_ptr[-1], case_count, cases)) { + Py_ssize_t s_pos; + + s_pos = 1; + + for (;;) { + if (s_pos >= length) + /* End of search string. */ + return text_ptr - text; + + if (text_ptr - s_pos <= limit_ptr) { + /* Off the end of the text. */ + if (state->partial_side == RE_PARTIAL_LEFT) { + /* Partial match. */ + *is_partial = TRUE; + return text_ptr - text; + } + + return -1; + + } + + if (!same_char_ign(encoding, locale_info, text_ptr[- s_pos + - 1], values[length - s_pos - 1])) + break; + + ++s_pos; + } + } + + --text_ptr; + } + text_pos = text_ptr - text; + break; + } + } + + /* Off the end of the text. */ + if (state->partial_side == RE_PARTIAL_LEFT) { + /* Partial match. */ + *is_partial = TRUE; + return text_pos; + } + + return -1; +} + +/* Performs a simple string search, backwards. */ +Py_LOCAL_INLINE(Py_ssize_t) simple_string_search_rev(RE_State* state, RE_Node* + node, Py_ssize_t text_pos, Py_ssize_t limit, BOOL* is_partial) { + Py_ssize_t length; + RE_CODE* values; + Py_UCS4 check_char; + + length = (Py_ssize_t)node->value_count; + values = node->values; + check_char = values[length - 1]; + + *is_partial = FALSE; + + switch (state->charsize) { + case 1: + { + Py_UCS1* text = (Py_UCS1*)state->text; + Py_UCS1* text_ptr = text + text_pos; + Py_UCS1* limit_ptr = text + limit; + + while (text_ptr > limit_ptr) { + if (text_ptr[-1] == check_char) { + Py_ssize_t s_pos; + + s_pos = 1; + + for (;;) { + if (s_pos >= length) + /* End of search string. 
*/ + return text_ptr - text; + + if (text_ptr - s_pos <= limit_ptr) { + /* Off the end of the text. */ + if (state->partial_side == RE_PARTIAL_LEFT) { + /* Partial match. */ + *is_partial = TRUE; + return text_ptr - text; + } + + return -1; + + } + + if (!same_char(text_ptr[- s_pos - 1], values[length - s_pos + - 1])) + break; + + ++s_pos; + } + } + + --text_ptr; + } + text_pos = text_ptr - text; + break; + } + case 2: + { + Py_UCS2* text = (Py_UCS2*)state->text; + Py_UCS2* text_ptr = text + text_pos; + Py_UCS2* limit_ptr = text + limit; + + while (text_ptr > limit_ptr) { + if (text_ptr[-1] == check_char) { + Py_ssize_t s_pos; + + s_pos = 1; + + for (;;) { + if (s_pos >= length) + /* End of search string. */ + return text_ptr - text; + + if (text_ptr - s_pos <= limit_ptr) { + /* Off the end of the text. */ + if (state->partial_side == RE_PARTIAL_LEFT) { + /* Partial match. */ + *is_partial = TRUE; + return text_ptr - text; + } + + return -1; + + } + + if (!same_char(text_ptr[- s_pos - 1], values[length - s_pos + - 1])) + break; + + ++s_pos; + } + } + + --text_ptr; + } + text_pos = text_ptr - text; + break; + } + case 4: + { + Py_UCS4* text = (Py_UCS4*)state->text; + Py_UCS4* text_ptr = text + text_pos; + Py_UCS4* limit_ptr = text + limit; + + while (text_ptr > limit_ptr) { + if (text_ptr[-1] == check_char) { + Py_ssize_t s_pos; + + s_pos = 1; + + for (;;) { + if (s_pos >= length) + /* End of search string. */ + return text_ptr - text; + + if (text_ptr - s_pos <= limit_ptr) { + /* Off the end of the text. */ + if (state->partial_side == RE_PARTIAL_LEFT) { + /* Partial match. */ + *is_partial = TRUE; + return text_ptr - text; + } + + return -1; + + } + + if (!same_char(text_ptr[- s_pos - 1], values[length - s_pos + - 1])) + break; + + ++s_pos; + } + } + + --text_ptr; + } + text_pos = text_ptr - text; + break; + } + } + + /* Off the end of the text. */ + if (state->partial_side == RE_PARTIAL_LEFT) { + /* Partial match. 
*/ + *is_partial = TRUE; + return text_pos; + } + + return -1; +} + +/* Performs a Boyer-Moore fast string search. */ +Py_LOCAL_INLINE(Py_ssize_t) fast_string_search(RE_State* state, RE_Node* node, + Py_ssize_t text_pos, Py_ssize_t limit) { + void* text; + Py_ssize_t length; + RE_CODE* values; + Py_ssize_t* bad_character_offset; + Py_ssize_t* good_suffix_offset; + Py_ssize_t last_pos; + Py_UCS4 check_char; + + text = state->text; + length = (Py_ssize_t)node->value_count; + values = node->values; + good_suffix_offset = node->string.good_suffix_offset; + bad_character_offset = node->string.bad_character_offset; + last_pos = length - 1; + check_char = values[last_pos]; + limit -= length; + + switch (state->charsize) { + case 1: + { + Py_UCS1* text_ptr; + Py_UCS1* limit_ptr; + + text_ptr = (Py_UCS1*)text + text_pos; + limit_ptr = (Py_UCS1*)text + limit; + + while (text_ptr <= limit_ptr) { + Py_UCS4 ch; + + ch = text_ptr[last_pos]; + if (ch == check_char) { + Py_ssize_t pos; + + pos = last_pos - 1; + while (pos >= 0 && same_char(text_ptr[pos], values[pos])) + --pos; + + if (pos < 0) + return text_ptr - (Py_UCS1*)text; + + text_ptr += good_suffix_offset[pos]; + } else + text_ptr += bad_character_offset[ch & 0xFF]; + } + break; + } + case 2: + { + Py_UCS2* text_ptr; + Py_UCS2* limit_ptr; + + text_ptr = (Py_UCS2*)text + text_pos; + limit_ptr = (Py_UCS2*)text + limit; + + while (text_ptr <= limit_ptr) { + Py_UCS4 ch; + + ch = text_ptr[last_pos]; + if (ch == check_char) { + Py_ssize_t pos; + + pos = last_pos - 1; + while (pos >= 0 && same_char(text_ptr[pos], values[pos])) + --pos; + + if (pos < 0) + return text_ptr - (Py_UCS2*)text; + + text_ptr += good_suffix_offset[pos]; + } else + text_ptr += bad_character_offset[ch & 0xFF]; + } + break; + } + case 4: + { + Py_UCS4* text_ptr; + Py_UCS4* limit_ptr; + + text_ptr = (Py_UCS4*)text + text_pos; + limit_ptr = (Py_UCS4*)text + limit; + + while (text_ptr <= limit_ptr) { + Py_UCS4 ch; + + ch = text_ptr[last_pos]; + if (ch == 
check_char) { + Py_ssize_t pos; + + pos = last_pos - 1; + while (pos >= 0 && same_char(text_ptr[pos], values[pos])) + --pos; + + if (pos < 0) + return text_ptr - (Py_UCS4*)text; + + text_ptr += good_suffix_offset[pos]; + } else + text_ptr += bad_character_offset[ch & 0xFF]; + } + break; + } + } + + return -1; +} + +/* Performs a Boyer-Moore fast string search, ignoring case. */ +Py_LOCAL_INLINE(Py_ssize_t) fast_string_search_ign(RE_State* state, RE_Node* + node, Py_ssize_t text_pos, Py_ssize_t limit) { + RE_EncodingTable* encoding; + RE_LocaleInfo* locale_info; + void* text; + Py_ssize_t length; + RE_CODE* values; + Py_ssize_t* bad_character_offset; + Py_ssize_t* good_suffix_offset; + Py_ssize_t last_pos; + Py_UCS4 cases[RE_MAX_CASES]; + int case_count; + + encoding = state->encoding; + locale_info = state->locale_info; + text = state->text; + length = (Py_ssize_t)node->value_count; + values = node->values; + good_suffix_offset = node->string.good_suffix_offset; + bad_character_offset = node->string.bad_character_offset; + last_pos = length - 1; + case_count = encoding->all_cases(locale_info, values[last_pos], cases); + limit -= length; + + switch (state->charsize) { + case 1: + { + Py_UCS1* text_ptr; + Py_UCS1* limit_ptr; + + text_ptr = (Py_UCS1*)text + text_pos; + limit_ptr = (Py_UCS1*)text + limit; + + while (text_ptr <= limit_ptr) { + Py_UCS4 ch; + + ch = text_ptr[last_pos]; + if (any_case(ch, case_count, cases)) { + Py_ssize_t pos; + + pos = last_pos - 1; + while (pos >= 0 && same_char_ign(encoding, locale_info, + text_ptr[pos], values[pos])) + --pos; + + if (pos < 0) + return text_ptr - (Py_UCS1*)text; + + text_ptr += good_suffix_offset[pos]; + } else + text_ptr += bad_character_offset[ch & 0xFF]; + } + break; + } + case 2: + { + Py_UCS2* text_ptr; + Py_UCS2* limit_ptr; + + text_ptr = (Py_UCS2*)text + text_pos; + limit_ptr = (Py_UCS2*)text + limit; + + while (text_ptr <= limit_ptr) { + Py_UCS4 ch; + + ch = text_ptr[last_pos]; + if (any_case(ch, case_count, 
cases)) { + Py_ssize_t pos; + + pos = last_pos - 1; + while (pos >= 0 && same_char_ign(encoding, locale_info, + text_ptr[pos], values[pos])) + --pos; + + if (pos < 0) + return text_ptr - (Py_UCS2*)text; + + text_ptr += good_suffix_offset[pos]; + } else + text_ptr += bad_character_offset[ch & 0xFF]; + } + break; + } + case 4: + { + Py_UCS4* text_ptr; + Py_UCS4* limit_ptr; + + text_ptr = (Py_UCS4*)text + text_pos; + limit_ptr = (Py_UCS4*)text + limit; + + while (text_ptr <= limit_ptr) { + Py_UCS4 ch; + + ch = text_ptr[last_pos]; + if (any_case(ch, case_count, cases)) { + Py_ssize_t pos; + + pos = last_pos - 1; + while (pos >= 0 && same_char_ign(encoding, locale_info, + text_ptr[pos], values[pos])) + --pos; + + if (pos < 0) + return text_ptr - (Py_UCS4*)text; + + text_ptr += good_suffix_offset[pos]; + } else + text_ptr += bad_character_offset[ch & 0xFF]; + } + break; + } + } + + return -1; +} + +/* Performs a Boyer-Moore fast string search, backwards, ignoring case. */ +Py_LOCAL_INLINE(Py_ssize_t) fast_string_search_ign_rev(RE_State* state, + RE_Node* node, Py_ssize_t text_pos, Py_ssize_t limit) { + RE_EncodingTable* encoding; + RE_LocaleInfo* locale_info; + void* text; + Py_ssize_t length; + RE_CODE* values; + Py_ssize_t* bad_character_offset; + Py_ssize_t* good_suffix_offset; + Py_UCS4 cases[RE_MAX_CASES]; + int case_count; + + encoding = state->encoding; + locale_info = state->locale_info; + text = state->text; + length = (Py_ssize_t)node->value_count; + values = node->values; + good_suffix_offset = node->string.good_suffix_offset; + bad_character_offset = node->string.bad_character_offset; + case_count = encoding->all_cases(locale_info, values[0], cases); + text_pos -= length; + + switch (state->charsize) { + case 1: + { + Py_UCS1* text_ptr; + Py_UCS1* limit_ptr; + + text_ptr = (Py_UCS1*)text + text_pos; + limit_ptr = (Py_UCS1*)text + limit; + + while (text_ptr >= limit_ptr) { + Py_UCS4 ch; + + ch = text_ptr[0]; + if (any_case(ch, case_count, cases)) { + 
Py_ssize_t pos; + + pos = 1; + while (pos < length && same_char_ign(encoding, locale_info, + text_ptr[pos], values[pos])) + ++pos; + + if (pos >= length) + return text_ptr - (Py_UCS1*)text + length; + + text_ptr += good_suffix_offset[pos]; + } else + text_ptr += bad_character_offset[ch & 0xFF]; + } + break; + } + case 2: + { + Py_UCS2* text_ptr; + Py_UCS2* limit_ptr; + + text_ptr = (Py_UCS2*)text + text_pos; + limit_ptr = (Py_UCS2*)text + limit; + + while (text_ptr >= limit_ptr) { + Py_UCS4 ch; + + ch = text_ptr[0]; + if (any_case(ch, case_count, cases)) { + Py_ssize_t pos; + + pos = 1; + while (pos < length && same_char_ign(encoding, locale_info, + text_ptr[pos], values[pos])) + ++pos; + + if (pos >= length) + return text_ptr - (Py_UCS2*)text + length; + + text_ptr += good_suffix_offset[pos]; + } else + text_ptr += bad_character_offset[ch & 0xFF]; + } + break; + } + case 4: + { + Py_UCS4* text_ptr; + Py_UCS4* limit_ptr; + + text_ptr = (Py_UCS4*)text + text_pos; + limit_ptr = (Py_UCS4*)text + limit; + + while (text_ptr >= limit_ptr) { + Py_UCS4 ch; + + ch = text_ptr[0]; + if (any_case(ch, case_count, cases)) { + Py_ssize_t pos; + + pos = 1; + while (pos < length && same_char_ign(encoding, locale_info, + text_ptr[pos], values[pos])) + ++pos; + + if (pos >= length) + return text_ptr - (Py_UCS4*)text + length; + + text_ptr += good_suffix_offset[pos]; + } else + text_ptr += bad_character_offset[ch & 0xFF]; + } + break; + } + } + + return -1; +} + +/* Performs a Boyer-Moore fast string search, backwards. 
*/ +Py_LOCAL_INLINE(Py_ssize_t) fast_string_search_rev(RE_State* state, RE_Node* + node, Py_ssize_t text_pos, Py_ssize_t limit) { + void* text; + Py_ssize_t length; + RE_CODE* values; + Py_ssize_t* bad_character_offset; + Py_ssize_t* good_suffix_offset; + Py_UCS4 check_char; + + text = state->text; + length = (Py_ssize_t)node->value_count; + values = node->values; + good_suffix_offset = node->string.good_suffix_offset; + bad_character_offset = node->string.bad_character_offset; + check_char = values[0]; + text_pos -= length; + + switch (state->charsize) { + case 1: + { + Py_UCS1* text_ptr; + Py_UCS1* limit_ptr; + + text_ptr = (Py_UCS1*)text + text_pos; + limit_ptr = (Py_UCS1*)text + limit; + + while (text_ptr >= limit_ptr) { + Py_UCS4 ch; + + ch = text_ptr[0]; + if (ch == check_char) { + Py_ssize_t pos; + + pos = 1; + while (pos < length && same_char(text_ptr[pos], values[pos])) + ++pos; + + if (pos >= length) + return text_ptr - (Py_UCS1*)text + length; + + text_ptr += good_suffix_offset[pos]; + } else + text_ptr += bad_character_offset[ch & 0xFF]; + } + break; + } + case 2: + { + Py_UCS2* text_ptr; + Py_UCS2* limit_ptr; + + text_ptr = (Py_UCS2*)text + text_pos; + limit_ptr = (Py_UCS2*)text + limit; + + while (text_ptr >= limit_ptr) { + Py_UCS4 ch; + + ch = text_ptr[0]; + if (ch == check_char) { + Py_ssize_t pos; + + pos = 1; + while (pos < length && same_char(text_ptr[pos], values[pos])) + ++pos; + + if (pos >= length) + return text_ptr - (Py_UCS2*)text + length; + + text_ptr += good_suffix_offset[pos]; + } else + text_ptr += bad_character_offset[ch & 0xFF]; + } + break; + } + case 4: + { + Py_UCS4* text_ptr; + Py_UCS4* limit_ptr; + + text_ptr = (Py_UCS4*)text + text_pos; + limit_ptr = (Py_UCS4*)text + limit; + + while (text_ptr >= limit_ptr) { + Py_UCS4 ch; + + ch = text_ptr[0]; + if (ch == check_char) { + Py_ssize_t pos; + + pos = 1; + while (pos < length && same_char(text_ptr[pos], values[pos])) + ++pos; + + if (pos >= length) + return text_ptr - 
(Py_UCS4*)text + length; + + text_ptr += good_suffix_offset[pos]; + } else + text_ptr += bad_character_offset[ch & 0xFF]; + } + break; + } + } + + return -1; +} + +/* Builds the tables for a Boyer-Moore fast string search. */ +Py_LOCAL_INLINE(BOOL) build_fast_tables(RE_State* state, RE_Node* node, BOOL + ignore) { + Py_ssize_t length; + RE_CODE* values; + Py_ssize_t* bad; + Py_ssize_t* good; + Py_UCS4 ch; + Py_ssize_t last_pos; + Py_ssize_t pos; + BOOL (*is_same_char)(RE_EncodingTable* encoding, RE_LocaleInfo* + locale_info, Py_UCS4 ch1, Py_UCS4 ch2); + Py_ssize_t suffix_len; + BOOL saved_start; + Py_ssize_t s; + Py_ssize_t i; + Py_ssize_t s_start; + Py_UCS4 codepoints[RE_MAX_CASES]; + + length = (Py_ssize_t)node->value_count; + + if (length < RE_MIN_FAST_LENGTH) + return TRUE; + + values = node->values; + bad = (Py_ssize_t*)re_alloc(256 * sizeof(bad[0])); + good = (Py_ssize_t*)re_alloc((size_t)length * sizeof(good[0])); + + if (!bad || !good) { + re_dealloc(bad); + re_dealloc(good); + + return FALSE; + } + + for (ch = 0; ch < 0x100; ch++) + bad[ch] = length; + + last_pos = length - 1; + + for (pos = 0; pos < last_pos; pos++) { + Py_ssize_t offset; + + offset = last_pos - pos; + ch = values[pos]; + if (ignore) { + int count; + int i; + + count = state->encoding->all_cases(state->locale_info, ch, + codepoints); + + for (i = 0; i < count; i++) + bad[codepoints[i] & 0xFF] = offset; + } else + bad[ch & 0xFF] = offset; + } + + is_same_char = ignore ? same_char_ign_wrapper : same_char_wrapper; + + suffix_len = 2; + pos = length - suffix_len; + saved_start = FALSE; + s = pos - 1; + i = suffix_len - 1; + s_start = s; + + while (pos >= 0) { + /* Look for another occurrence of the suffix. */ + while (i > 0) { + /* Have we dropped off the end of the string? */ + if (s + i < 0) + break; + + if (is_same_char(state->encoding, state->locale_info, values[s + + i], values[pos + i])) + /* It still matches. */ + --i; + else { + /* Start again further along. 
*/ + --s; + i = suffix_len - 1; + } + } + + if (s >= 0 && is_same_char(state->encoding, state->locale_info, + values[s], values[pos])) { + /* We haven't dropped off the end of the string, and the suffix has + * matched this far, so this is a good starting point for the next + * iteration. + */ + --s; + if (!saved_start) { + s_start = s; + saved_start = TRUE; + } + } else { + /* Calculate the suffix offset. */ + good[pos] = pos - s; + + /* Extend the suffix and start searching for _this_ one. */ + --pos; + ++suffix_len; + + /* Where's a good place to start searching? */ + if (saved_start) { + s = s_start; + saved_start = FALSE; + } else + --s; + + /* Can we short-circuit the searching? */ + if (s < 0) + break; + } + + i = suffix_len - 1; + } + + /* Fill-in any remaining entries. */ + while (pos >= 0) { + good[pos] = pos - s; + --pos; + --s; + } + + node->string.bad_character_offset = bad; + node->string.good_suffix_offset = good; + + return TRUE; +} + +/* Builds the tables for a Boyer-Moore fast string search, backwards. 
*/ +Py_LOCAL_INLINE(BOOL) build_fast_tables_rev(RE_State* state, RE_Node* node, + BOOL ignore) { + Py_ssize_t length; + RE_CODE* values; + Py_ssize_t* bad; + Py_ssize_t* good; + Py_UCS4 ch; + Py_ssize_t last_pos; + Py_ssize_t pos; + BOOL (*is_same_char)(RE_EncodingTable* encoding, RE_LocaleInfo* + locale_info, Py_UCS4 ch1, Py_UCS4 ch2); + Py_ssize_t suffix_len; + BOOL saved_start; + Py_ssize_t s; + Py_ssize_t i; + Py_ssize_t s_start; + Py_UCS4 codepoints[RE_MAX_CASES]; + + length = (Py_ssize_t)node->value_count; + + if (length < RE_MIN_FAST_LENGTH) + return TRUE; + + values = node->values; + bad = (Py_ssize_t*)re_alloc(256 * sizeof(bad[0])); + good = (Py_ssize_t*)re_alloc((size_t)length * sizeof(good[0])); + + if (!bad || !good) { + re_dealloc(bad); + re_dealloc(good); + + return FALSE; + } + + for (ch = 0; ch < 0x100; ch++) + bad[ch] = -length; + + last_pos = length - 1; + + for (pos = last_pos; pos > 0; pos--) { + Py_ssize_t offset; + + offset = -pos; + ch = values[pos]; + if (ignore) { + int count; + int i; + + count = state->encoding->all_cases(state->locale_info, ch, + codepoints); + + for (i = 0; i < count; i++) + bad[codepoints[i] & 0xFF] = offset; + } else + bad[ch & 0xFF] = offset; + } + + is_same_char = ignore ? same_char_ign_wrapper : same_char_wrapper; + + suffix_len = 2; + pos = suffix_len - 1; + saved_start = FALSE; + s = pos + 1; + i = suffix_len - 1; + s_start = s; + + while (pos < length) { + /* Look for another occurrence of the suffix. */ + while (i > 0) { + /* Have we dropped off the end of the string? */ + if (s - i >= length) + break; + + if (is_same_char(state->encoding, state->locale_info, values[s - + i], values[pos - i])) + /* It still matches. */ + --i; + else { + /* Start again further along. 
*/ + ++s; + i = suffix_len - 1; + } + } + + if (s < length && is_same_char(state->encoding, state->locale_info, + values[s], values[pos])) { + /* We haven't dropped off the end of the string, and the suffix has + * matched this far, so this is a good starting point for the next + * iteration. + */ + ++s; + if (!saved_start) { + s_start = s; + saved_start = TRUE; + } + } else { + /* Calculate the suffix offset. */ + good[pos] = pos - s; + + /* Extend the suffix and start searching for _this_ one. */ + ++pos; + ++suffix_len; + + /* Where's a good place to start searching? */ + if (saved_start) { + s = s_start; + saved_start = FALSE; + } else + ++s; + + /* Can we short-circuit the searching? */ + if (s >= length) + break; + } + + i = suffix_len - 1; + } + + /* Fill-in any remaining entries. */ + while (pos < length) { + good[pos] = pos - s; + ++pos; + ++s; + } + + node->string.bad_character_offset = bad; + node->string.good_suffix_offset = good; + + return TRUE; +} + +/* Performs a string search. */ +Py_LOCAL_INLINE(Py_ssize_t) string_search(RE_SafeState* safe_state, RE_Node* + node, Py_ssize_t text_pos, Py_ssize_t limit, BOOL* is_partial) { + RE_State* state; + Py_ssize_t found_pos; + + state = safe_state->re_state; + + *is_partial = FALSE; + + /* Has the node been initialised for fast searching, if necessary? */ + if (!(node->status & RE_STATUS_FAST_INIT)) { + /* Ideally the pattern should immutable and shareable across threads. + * Internally, however, it isn't. For safety we need to hold the GIL. + */ + acquire_GIL(safe_state); + + /* Double-check because of multithreading. */ + if (!(node->status & RE_STATUS_FAST_INIT)) { + build_fast_tables(state, node, FALSE); + node->status |= RE_STATUS_FAST_INIT; + } + + release_GIL(safe_state); + } + + if (node->string.bad_character_offset) { + /* Start with a fast search. This will find the string if it's complete + * (i.e. not truncated). 
+ */ + found_pos = fast_string_search(state, node, text_pos, limit); + if (found_pos < 0 && state->partial_side == RE_PARTIAL_RIGHT) + /* We didn't find the string, but it could've been truncated, so + * try again, starting close to the end. + */ + found_pos = simple_string_search(state, node, limit - + (Py_ssize_t)(node->value_count - 1), limit, is_partial); + } else + found_pos = simple_string_search(state, node, text_pos, limit, + is_partial); + + return found_pos; +} + +/* Performs a string search, ignoring case. */ +Py_LOCAL_INLINE(Py_ssize_t) string_search_fld(RE_SafeState* safe_state, + RE_Node* node, Py_ssize_t text_pos, Py_ssize_t limit, Py_ssize_t* new_pos, + BOOL* is_partial) { + RE_State* state; + RE_EncodingTable* encoding; + RE_LocaleInfo* locale_info; + int (*full_case_fold)(RE_LocaleInfo* locale_info, Py_UCS4 ch, Py_UCS4* + folded); + Py_UCS4 (*char_at)(void* text, Py_ssize_t pos); + void* text; + RE_CODE* values; + Py_ssize_t start_pos; + int f_pos; + int folded_len; + Py_ssize_t length; + Py_ssize_t s_pos; + Py_UCS4 folded[RE_MAX_FOLDED]; + + state = safe_state->re_state; + encoding = state->encoding; + locale_info = state->locale_info; + full_case_fold = encoding->full_case_fold; + char_at = state->char_at; + text = state->text; + + values = node->values; + start_pos = text_pos; + f_pos = 0; + folded_len = 0; + length = (Py_ssize_t)node->value_count; + s_pos = 0; + + *is_partial = FALSE; + + while (s_pos < length || f_pos < folded_len) { + if (f_pos >= folded_len) { + /* Fetch and casefold another character. 
*/ + if (text_pos >= limit) { + if (text_pos >= state->text_length && state->partial_side == + RE_PARTIAL_RIGHT) { + *is_partial = TRUE; + return start_pos; + } + + return -1; + } + + folded_len = full_case_fold(locale_info, char_at(text, text_pos), + folded); + f_pos = 0; + } + + if (s_pos < length && same_char_ign(encoding, locale_info, + values[s_pos], folded[f_pos])) { + ++s_pos; + ++f_pos; + + if (f_pos >= folded_len) + ++text_pos; + } else { + ++start_pos; + text_pos = start_pos; + f_pos = 0; + folded_len = 0; + s_pos = 0; + } + } + + /* We found the string. */ + if (new_pos) + *new_pos = text_pos; + + return start_pos; +} + +/* Performs a string search, backwards, ignoring case. */ +Py_LOCAL_INLINE(Py_ssize_t) string_search_fld_rev(RE_SafeState* safe_state, + RE_Node* node, Py_ssize_t text_pos, Py_ssize_t limit, Py_ssize_t* new_pos, + BOOL* is_partial) { + RE_State* state; + RE_EncodingTable* encoding; + RE_LocaleInfo* locale_info; + int (*full_case_fold)(RE_LocaleInfo* locale_info, Py_UCS4 ch, Py_UCS4* + folded); + Py_UCS4 (*char_at)(void* text, Py_ssize_t pos); + void* text; + RE_CODE* values; + Py_ssize_t start_pos; + int f_pos; + int folded_len; + Py_ssize_t length; + Py_ssize_t s_pos; + Py_UCS4 folded[RE_MAX_FOLDED]; + + state = safe_state->re_state; + encoding = state->encoding; + locale_info = state->locale_info; + full_case_fold = encoding->full_case_fold; + char_at = state->char_at; + text = state->text; + + values = node->values; + start_pos = text_pos; + f_pos = 0; + folded_len = 0; + length = (Py_ssize_t)node->value_count; + s_pos = 0; + + *is_partial = FALSE; + + while (s_pos < length || f_pos < folded_len) { + if (f_pos >= folded_len) { + /* Fetch and casefold another character. 
*/ + if (text_pos <= limit) { + if (text_pos <= 0 && state->partial_side == RE_PARTIAL_LEFT) { + *is_partial = TRUE; + return start_pos; + } + + return -1; + } + + folded_len = full_case_fold(locale_info, char_at(text, text_pos - + 1), folded); + f_pos = 0; + } + + if (s_pos < length && same_char_ign(encoding, locale_info, + values[length - s_pos - 1], folded[folded_len - f_pos - 1])) { + ++s_pos; + ++f_pos; + + if (f_pos >= folded_len) + --text_pos; + } else { + --start_pos; + text_pos = start_pos; + f_pos = 0; + folded_len = 0; + s_pos = 0; + } + } + + /* We found the string. */ + if (new_pos) + *new_pos = text_pos; + + return start_pos; +} + +/* Performs a string search, ignoring case. */ +Py_LOCAL_INLINE(Py_ssize_t) string_search_ign(RE_SafeState* safe_state, + RE_Node* node, Py_ssize_t text_pos, Py_ssize_t limit, BOOL* is_partial) { + RE_State* state; + Py_ssize_t found_pos; + + state = safe_state->re_state; + + *is_partial = FALSE; + + /* Has the node been initialised for fast searching, if necessary? */ + if (!(node->status & RE_STATUS_FAST_INIT)) { + /* Ideally the pattern should immutable and shareable across threads. + * Internally, however, it isn't. For safety we need to hold the GIL. + */ + acquire_GIL(safe_state); + + /* Double-check because of multithreading. */ + if (!(node->status & RE_STATUS_FAST_INIT)) { + build_fast_tables(state, node, TRUE); + node->status |= RE_STATUS_FAST_INIT; + } + + release_GIL(safe_state); + } + + if (node->string.bad_character_offset) { + /* Start with a fast search. This will find the string if it's complete + * (i.e. not truncated). + */ + found_pos = fast_string_search_ign(state, node, text_pos, limit); + if (found_pos < 0 && state->partial_side == RE_PARTIAL_RIGHT) + /* We didn't find the string, but it could've been truncated, so + * try again, starting close to the end. 
+ */ + found_pos = simple_string_search_ign(state, node, limit - + (Py_ssize_t)(node->value_count - 1), limit, is_partial); + } else + found_pos = simple_string_search_ign(state, node, text_pos, limit, + is_partial); + + return found_pos; +} + +/* Performs a string search, backwards, ignoring case. */ +Py_LOCAL_INLINE(Py_ssize_t) string_search_ign_rev(RE_SafeState* safe_state, + RE_Node* node, Py_ssize_t text_pos, Py_ssize_t limit, BOOL* is_partial) { + RE_State* state; + Py_ssize_t found_pos; + + state = safe_state->re_state; + + *is_partial = FALSE; + + /* Has the node been initialised for fast searching, if necessary? */ + if (!(node->status & RE_STATUS_FAST_INIT)) { + /* Ideally the pattern should immutable and shareable across threads. + * Internally, however, it isn't. For safety we need to hold the GIL. + */ + acquire_GIL(safe_state); + + /* Double-check because of multithreading. */ + if (!(node->status & RE_STATUS_FAST_INIT)) { + build_fast_tables_rev(state, node, TRUE); + node->status |= RE_STATUS_FAST_INIT; + } + + release_GIL(safe_state); + } + + if (node->string.bad_character_offset) { + /* Start with a fast search. This will find the string if it's complete + * (i.e. not truncated). + */ + found_pos = fast_string_search_ign_rev(state, node, text_pos, limit); + if (found_pos < 0 && state->partial_side == RE_PARTIAL_LEFT) + /* We didn't find the string, but it could've been truncated, so + * try again, starting close to the end. + */ + found_pos = simple_string_search_ign_rev(state, node, limit + + (Py_ssize_t)(node->value_count - 1), limit, is_partial); + } else + found_pos = simple_string_search_ign_rev(state, node, text_pos, limit, + is_partial); + + return found_pos; +} + +/* Performs a string search, backwards. 
*/ +Py_LOCAL_INLINE(Py_ssize_t) string_search_rev(RE_SafeState* safe_state, + RE_Node* node, Py_ssize_t text_pos, Py_ssize_t limit, BOOL* is_partial) { + RE_State* state; + Py_ssize_t found_pos; + + state = safe_state->re_state; + + *is_partial = FALSE; + + /* Has the node been initialised for fast searching, if necessary? */ + if (!(node->status & RE_STATUS_FAST_INIT)) { + /* Ideally the pattern should immutable and shareable across threads. + * Internally, however, it isn't. For safety we need to hold the GIL. + */ + acquire_GIL(safe_state); + + /* Double-check because of multithreading. */ + if (!(node->status & RE_STATUS_FAST_INIT)) { + build_fast_tables_rev(state, node, FALSE); + node->status |= RE_STATUS_FAST_INIT; + } + + release_GIL(safe_state); + } + + if (node->string.bad_character_offset) { + /* Start with a fast search. This will find the string if it's complete + * (i.e. not truncated). + */ + found_pos = fast_string_search_rev(state, node, text_pos, limit); + if (found_pos < 0 && state->partial_side == RE_PARTIAL_LEFT) + /* We didn't find the string, but it could've been truncated, so + * try again, starting close to the end. + */ + found_pos = simple_string_search_rev(state, node, limit + + (Py_ssize_t)(node->value_count - 1), limit, is_partial); + } else + found_pos = simple_string_search_rev(state, node, text_pos, limit, + is_partial); + + return found_pos; +} + +/* Returns how many characters there could be before full case-folding. */ +Py_LOCAL_INLINE(Py_ssize_t) possible_unfolded_length(Py_ssize_t length) { + if (length == 0) + return 0; + + if (length < RE_MAX_FOLDED) + return 1; + + return length / RE_MAX_FOLDED; +} + +/* Checks whether there's any character except a newline at a position. 
*/ +Py_LOCAL_INLINE(int) try_match_ANY(RE_State* state, RE_Node* node, Py_ssize_t + text_pos) { + if (text_pos >= state->text_length) { + if (state->partial_side == RE_PARTIAL_RIGHT) + return RE_ERROR_PARTIAL; + + return RE_ERROR_FAILURE; + } + + return bool_as_status(text_pos < state->slice_end && + matches_ANY(state->encoding, node, state->char_at(state->text, + text_pos))); +} + +/* Checks whether there's any character at all at a position. */ +Py_LOCAL_INLINE(int) try_match_ANY_ALL(RE_State* state, RE_Node* node, + Py_ssize_t text_pos) { + if (text_pos >= state->text_length) { + if (state->partial_side == RE_PARTIAL_RIGHT) + return RE_ERROR_PARTIAL; + + return RE_ERROR_FAILURE; + } + + return bool_as_status(text_pos < state->slice_end); +} + +/* Checks whether there's any character at all at a position, backwards. */ +Py_LOCAL_INLINE(int) try_match_ANY_ALL_REV(RE_State* state, RE_Node* node, + Py_ssize_t text_pos) { + if (text_pos <= 0) { + if (state->partial_side == RE_PARTIAL_LEFT) + return RE_ERROR_PARTIAL; + + return RE_ERROR_FAILURE; + } + + return bool_as_status(text_pos > state->slice_start); +} + +/* Checks whether there's any character except a newline at a position, + * backwards. + */ +Py_LOCAL_INLINE(int) try_match_ANY_REV(RE_State* state, RE_Node* node, + Py_ssize_t text_pos) { + if (text_pos <= 0) { + if (state->partial_side == RE_PARTIAL_LEFT) + return RE_ERROR_PARTIAL; + + return RE_ERROR_FAILURE; + } + + return bool_as_status(text_pos > state->slice_start && + matches_ANY(state->encoding, node, state->char_at(state->text, text_pos - + 1))); +} + +/* Checks whether there's any character except a line separator at a position. 
+ */ +Py_LOCAL_INLINE(int) try_match_ANY_U(RE_State* state, RE_Node* node, Py_ssize_t + text_pos) { + if (text_pos >= state->text_length) { + if (state->partial_side == RE_PARTIAL_RIGHT) + return RE_ERROR_PARTIAL; + + return RE_ERROR_FAILURE; + } + + return bool_as_status(text_pos < state->slice_end && + matches_ANY_U(state->encoding, node, state->char_at(state->text, + text_pos))); +} + +/* Checks whether there's any character except a line separator at a position, + * backwards. + */ +Py_LOCAL_INLINE(int) try_match_ANY_U_REV(RE_State* state, RE_Node* node, + Py_ssize_t text_pos) { + if (text_pos <= 0) { + if (state->partial_side == RE_PARTIAL_LEFT) + return RE_ERROR_PARTIAL; + + return RE_ERROR_FAILURE; + } + + return bool_as_status(text_pos > state->slice_start && + matches_ANY_U(state->encoding, node, state->char_at(state->text, text_pos + - 1))); +} + +/* Checks whether a position is on a word boundary. */ +Py_LOCAL_INLINE(int) try_match_BOUNDARY(RE_State* state, RE_Node* node, + Py_ssize_t text_pos) { + return bool_as_status(state->encoding->at_boundary(state, text_pos) == + node->match); +} + +/* Checks whether there's a character at a position. */ +Py_LOCAL_INLINE(int) try_match_CHARACTER(RE_State* state, RE_Node* node, + Py_ssize_t text_pos) { + if (text_pos >= state->text_length) { + if (state->partial_side == RE_PARTIAL_RIGHT) + return RE_ERROR_PARTIAL; + + return RE_ERROR_FAILURE; + } + + return bool_as_status(text_pos < state->slice_end && + matches_CHARACTER(state->encoding, state->locale_info, node, + state->char_at(state->text, text_pos)) == node->match); +} + +/* Checks whether there's a character at a position, ignoring case. 
*/ +Py_LOCAL_INLINE(int) try_match_CHARACTER_IGN(RE_State* state, RE_Node* node, + Py_ssize_t text_pos) { + if (text_pos >= state->text_length) { + if (state->partial_side == RE_PARTIAL_RIGHT) + return RE_ERROR_PARTIAL; + + return RE_ERROR_FAILURE; + } + + return bool_as_status(text_pos < state->slice_end && + matches_CHARACTER_IGN(state->encoding, state->locale_info, node, + state->char_at(state->text, text_pos)) == node->match); +} + +/* Checks whether there's a character at a position, ignoring case, backwards. + */ +Py_LOCAL_INLINE(int) try_match_CHARACTER_IGN_REV(RE_State* state, RE_Node* + node, Py_ssize_t text_pos) { + if (text_pos <= 0) { + if (state->partial_side == RE_PARTIAL_LEFT) + return RE_ERROR_PARTIAL; + + return RE_ERROR_FAILURE; + } + + return bool_as_status(text_pos > state->slice_start && + matches_CHARACTER_IGN(state->encoding, state->locale_info, node, + state->char_at(state->text, text_pos - 1)) == node->match); +} + +/* Checks whether there's a character at a position, backwards. */ +Py_LOCAL_INLINE(int) try_match_CHARACTER_REV(RE_State* state, RE_Node* node, + Py_ssize_t text_pos) { + if (text_pos <= 0) { + if (state->partial_side == RE_PARTIAL_LEFT) + return RE_ERROR_PARTIAL; + + return RE_ERROR_FAILURE; + } + + return bool_as_status(text_pos > state->slice_start && + matches_CHARACTER(state->encoding, state->locale_info, node, + state->char_at(state->text, text_pos - 1)) == node->match); +} + +/* Checks whether a position is on a default word boundary. */ +Py_LOCAL_INLINE(int) try_match_DEFAULT_BOUNDARY(RE_State* state, RE_Node* node, + Py_ssize_t text_pos) { + return bool_as_status(state->encoding->at_default_boundary(state, text_pos) + == node->match); +} + +/* Checks whether a position is at the default end of a word. 
*/ +Py_LOCAL_INLINE(int) try_match_DEFAULT_END_OF_WORD(RE_State* state, RE_Node* + node, Py_ssize_t text_pos) { + return bool_as_status(state->encoding->at_default_word_end(state, + text_pos)); +} + +/* Checks whether a position is at the default start of a word. */ +Py_LOCAL_INLINE(int) try_match_DEFAULT_START_OF_WORD(RE_State* state, RE_Node* + node, Py_ssize_t text_pos) { + return bool_as_status(state->encoding->at_default_word_start(state, + text_pos)); +} + +/* Checks whether a position is at the end of a line. */ +Py_LOCAL_INLINE(int) try_match_END_OF_LINE(RE_State* state, RE_Node* node, + Py_ssize_t text_pos) { + return bool_as_status(text_pos >= state->slice_end || + state->char_at(state->text, text_pos) == '\n'); +} + +/* Checks whether a position is at the end of a line. */ +Py_LOCAL_INLINE(int) try_match_END_OF_LINE_U(RE_State* state, RE_Node* node, + Py_ssize_t text_pos) { + return bool_as_status(state->encoding->at_line_end(state, text_pos)); +} + +/* Checks whether a position is at the end of the string. */ +Py_LOCAL_INLINE(int) try_match_END_OF_STRING(RE_State* state, RE_Node* node, + Py_ssize_t text_pos) { + return bool_as_status(text_pos >= state->text_length); +} + +/* Checks whether a position is at the end of a line or the string. */ +Py_LOCAL_INLINE(int) try_match_END_OF_STRING_LINE(RE_State* state, RE_Node* + node, Py_ssize_t text_pos) { + return bool_as_status(text_pos >= state->text_length || text_pos == + state->final_newline); +} + +/* Checks whether a position is at the end of the string. */ +Py_LOCAL_INLINE(int) try_match_END_OF_STRING_LINE_U(RE_State* state, RE_Node* + node, Py_ssize_t text_pos) { + return bool_as_status(text_pos >= state->text_length || text_pos == + state->final_line_sep); +} + +/* Checks whether a position is at the end of a word. 
*/ +Py_LOCAL_INLINE(int) try_match_END_OF_WORD(RE_State* state, RE_Node* node, + Py_ssize_t text_pos) { + return bool_as_status(state->encoding->at_word_end(state, text_pos)); +} + +/* Checks whether a position is on a grapheme boundary. */ +Py_LOCAL_INLINE(int) try_match_GRAPHEME_BOUNDARY(RE_State* state, RE_Node* + node, Py_ssize_t text_pos) { + return bool_as_status(state->encoding->at_grapheme_boundary(state, + text_pos)); +} + +/* Checks whether there's a character with a certain property at a position. */ +Py_LOCAL_INLINE(int) try_match_PROPERTY(RE_State* state, RE_Node* node, + Py_ssize_t text_pos) { + if (text_pos >= state->text_length) { + if (state->partial_side == RE_PARTIAL_RIGHT) + return RE_ERROR_PARTIAL; + + return RE_ERROR_FAILURE; + } + + return bool_as_status(text_pos < state->slice_end && + matches_PROPERTY(state->encoding, state->locale_info, node, + state->char_at(state->text, text_pos)) == node->match); +} + +/* Checks whether there's a character with a certain property at a position, + * ignoring case. + */ +Py_LOCAL_INLINE(int) try_match_PROPERTY_IGN(RE_State* state, RE_Node* node, + Py_ssize_t text_pos) { + if (text_pos >= state->text_length) { + if (state->partial_side == RE_PARTIAL_RIGHT) + return RE_ERROR_PARTIAL; + + return RE_ERROR_FAILURE; + } + + return bool_as_status(text_pos < state->slice_end && + matches_PROPERTY_IGN(state->encoding, state->locale_info, node, + state->char_at(state->text, text_pos)) == node->match); +} + +/* Checks whether there's a character with a certain property at a position, + * ignoring case, backwards. 
+ */ +Py_LOCAL_INLINE(int) try_match_PROPERTY_IGN_REV(RE_State* state, RE_Node* node, + Py_ssize_t text_pos) { + if (text_pos <= 0) { + if (state->partial_side == RE_PARTIAL_LEFT) + return RE_ERROR_PARTIAL; + + return RE_ERROR_FAILURE; + } + + return bool_as_status(text_pos > state->slice_start && + matches_PROPERTY_IGN(state->encoding, state->locale_info, node, + state->char_at(state->text, text_pos - 1)) == node->match); +} + +/* Checks whether there's a character with a certain property at a position, + * backwards. + */ +Py_LOCAL_INLINE(int) try_match_PROPERTY_REV(RE_State* state, RE_Node* node, + Py_ssize_t text_pos) { + if (text_pos <= 0) { + if (state->partial_side == RE_PARTIAL_LEFT) + return RE_ERROR_PARTIAL; + + return RE_ERROR_FAILURE; + } + + return bool_as_status(text_pos > state->slice_start && + matches_PROPERTY(state->encoding, state->locale_info, node, + state->char_at(state->text, text_pos - 1)) == node->match); +} + +/* Checks whether there's a character in a certain range at a position. */ +Py_LOCAL_INLINE(int) try_match_RANGE(RE_State* state, RE_Node* node, Py_ssize_t + text_pos) { + if (text_pos >= state->text_length) { + if (state->partial_side == RE_PARTIAL_RIGHT) + return RE_ERROR_PARTIAL; + + return RE_ERROR_FAILURE; + } + + return bool_as_status(text_pos < state->slice_end && + matches_RANGE(state->encoding, state->locale_info, node, + state->char_at(state->text, text_pos)) == node->match); +} + +/* Checks whether there's a character in a certain range at a position, + * ignoring case. 
+ */ +Py_LOCAL_INLINE(int) try_match_RANGE_IGN(RE_State* state, RE_Node* node, + Py_ssize_t text_pos) { + if (text_pos >= state->text_length) { + if (state->partial_side == RE_PARTIAL_RIGHT) + return RE_ERROR_PARTIAL; + + return RE_ERROR_FAILURE; + } + + return bool_as_status(text_pos < state->slice_end && + matches_RANGE_IGN(state->encoding, state->locale_info, node, + state->char_at(state->text, text_pos)) == node->match); +} + +/* Checks whether there's a character in a certain range at a position, + * ignoring case, backwards. + */ +Py_LOCAL_INLINE(int) try_match_RANGE_IGN_REV(RE_State* state, RE_Node* node, + Py_ssize_t text_pos) { + if (text_pos <= 0) { + if (state->partial_side == RE_PARTIAL_LEFT) + return RE_ERROR_PARTIAL; + + return RE_ERROR_FAILURE; + } + + return bool_as_status(text_pos > state->slice_start && + matches_RANGE_IGN(state->encoding, state->locale_info, node, + state->char_at(state->text, text_pos - 1)) == node->match); +} + +/* Checks whether there's a character in a certain range at a position, + * backwards. + */ +Py_LOCAL_INLINE(int) try_match_RANGE_REV(RE_State* state, RE_Node* node, + Py_ssize_t text_pos) { + if (text_pos <= 0) { + if (state->partial_side == RE_PARTIAL_LEFT) + return RE_ERROR_PARTIAL; + + return RE_ERROR_FAILURE; + } + + return bool_as_status(text_pos > state->slice_start && + matches_RANGE(state->encoding, state->locale_info, node, + state->char_at(state->text, text_pos - 1)) == node->match); +} + +/* Checks whether a position is at the search anchor. */ +Py_LOCAL_INLINE(int) try_match_SEARCH_ANCHOR(RE_State* state, RE_Node* node, + Py_ssize_t text_pos) { + return bool_as_status(text_pos == state->search_anchor); +} + +/* Checks whether there's a character in a certain set at a position. 
*/ +Py_LOCAL_INLINE(int) try_match_SET(RE_State* state, RE_Node* node, Py_ssize_t + text_pos) { + if (text_pos >= state->text_length) { + if (state->partial_side == RE_PARTIAL_RIGHT) + return RE_ERROR_PARTIAL; + + return RE_ERROR_FAILURE; + } + + return bool_as_status(text_pos < state->slice_end && + matches_SET(state->encoding, state->locale_info, node, + state->char_at(state->text, text_pos)) == node->match); +} + +/* Checks whether there's a character in a certain set at a position, ignoring + * case. + */ +Py_LOCAL_INLINE(int) try_match_SET_IGN(RE_State* state, RE_Node* node, + Py_ssize_t text_pos) { + if (text_pos >= state->text_length) { + if (state->partial_side == RE_PARTIAL_RIGHT) + return RE_ERROR_PARTIAL; + + return RE_ERROR_FAILURE; + } + + return bool_as_status(text_pos < state->slice_end && + matches_SET_IGN(state->encoding, state->locale_info, node, + state->char_at(state->text, text_pos)) == node->match); +} + +/* Checks whether there's a character in a certain set at a position, ignoring + * case, backwards. + */ +Py_LOCAL_INLINE(int) try_match_SET_IGN_REV(RE_State* state, RE_Node* node, + Py_ssize_t text_pos) { + if (text_pos <= 0) { + if (state->partial_side == RE_PARTIAL_LEFT) + return RE_ERROR_PARTIAL; + + return RE_ERROR_FAILURE; + } + + return bool_as_status(text_pos > state->slice_start && + matches_SET_IGN(state->encoding, state->locale_info, node, + state->char_at(state->text, text_pos - 1)) == node->match); +} + +/* Checks whether there's a character in a certain set at a position, + * backwards. 
+ */ +Py_LOCAL_INLINE(int) try_match_SET_REV(RE_State* state, RE_Node* node, + Py_ssize_t text_pos) { + if (text_pos <= 0) { + if (state->partial_side == RE_PARTIAL_LEFT) + return RE_ERROR_PARTIAL; + + return RE_ERROR_FAILURE; + } + + return bool_as_status(text_pos > state->slice_start && + matches_SET(state->encoding, state->locale_info, node, + state->char_at(state->text, text_pos - 1)) == node->match); +} + +/* Checks whether a position is at the start of a line. */ +Py_LOCAL_INLINE(int) try_match_START_OF_LINE(RE_State* state, RE_Node* node, + Py_ssize_t text_pos) { + return bool_as_status(text_pos <= 0 || state->char_at(state->text, text_pos + - 1) == '\n'); +} + +/* Checks whether a position is at the start of a line. */ +Py_LOCAL_INLINE(int) try_match_START_OF_LINE_U(RE_State* state, RE_Node* node, + Py_ssize_t text_pos) { + return bool_as_status(state->encoding->at_line_start(state, text_pos)); +} + +/* Checks whether a position is at the start of the string. */ +Py_LOCAL_INLINE(int) try_match_START_OF_STRING(RE_State* state, RE_Node* node, + Py_ssize_t text_pos) { + return bool_as_status(text_pos <= 0); +} + +/* Checks whether a position is at the start of a word. */ +Py_LOCAL_INLINE(int) try_match_START_OF_WORD(RE_State* state, RE_Node* node, + Py_ssize_t text_pos) { + return bool_as_status(state->encoding->at_word_start(state, text_pos)); +} + +/* Checks whether there's a certain string at a position. 
*/ +Py_LOCAL_INLINE(int) try_match_STRING(RE_State* state, RE_NextNode* next, + RE_Node* node, Py_ssize_t text_pos, RE_Position* next_position) { + Py_ssize_t length; + Py_UCS4 (*char_at)(void* text, Py_ssize_t pos); + RE_CODE* values; + Py_ssize_t s_pos; + + length = (Py_ssize_t)node->value_count; + char_at = state->char_at; + values = node->values; + + for (s_pos = 0; s_pos < length; s_pos++) { + if (text_pos + s_pos >= state->slice_end) { + if (state->partial_side == RE_PARTIAL_RIGHT) + return RE_ERROR_PARTIAL; + + return RE_ERROR_FAILURE; + } + + if (!same_char(char_at(state->text, text_pos + s_pos), values[s_pos])) + return RE_ERROR_FAILURE; + } + + next_position->node = next->match_next; + next_position->text_pos = text_pos + next->match_step; + + return RE_ERROR_SUCCESS; +} + +/* Checks whether there's a certain string at a position, ignoring case. */ +Py_LOCAL_INLINE(int) try_match_STRING_FLD(RE_State* state, RE_NextNode* next, + RE_Node* node, Py_ssize_t text_pos, RE_Position* next_position) { + Py_ssize_t length; + Py_UCS4 (*char_at)(void* text, Py_ssize_t pos); + RE_EncodingTable* encoding; + RE_LocaleInfo* locale_info; + int (*full_case_fold)(RE_LocaleInfo* locale_info, Py_UCS4 ch, Py_UCS4* + folded); + Py_ssize_t s_pos; + RE_CODE* values; + int folded_len; + int f_pos; + Py_ssize_t start_pos; + Py_UCS4 folded[RE_MAX_FOLDED]; + + length = (Py_ssize_t)node->value_count; + char_at = state->char_at; + encoding = state->encoding; + locale_info = state->locale_info; + full_case_fold = encoding->full_case_fold; + + s_pos = 0; + values = node->values; + folded_len = 0; + f_pos = 0; + start_pos = text_pos; + + while (s_pos < length) { + if (f_pos >= folded_len) { + /* Fetch and casefold another character. 
*/ + if (text_pos >= state->slice_end) { + if (state->partial_side == RE_PARTIAL_RIGHT) + return RE_ERROR_PARTIAL; + + return RE_ERROR_FAILURE; + } + + folded_len = full_case_fold(locale_info, char_at(state->text, + text_pos), folded); + f_pos = 0; + } + + if (!same_char_ign(encoding, locale_info, folded[f_pos], + values[s_pos])) + return RE_ERROR_FAILURE; + + ++s_pos; + ++f_pos; + + if (f_pos >= folded_len) + ++text_pos; + } + + if (f_pos < folded_len) + return RE_ERROR_FAILURE; + + next_position->node = next->match_next; + if (next->match_step == 0) + next_position->text_pos = start_pos; + else + next_position->text_pos = text_pos; + + return RE_ERROR_SUCCESS; +} + +/* Checks whether there's a certain string at a position, ignoring case, + * backwards. + */ +Py_LOCAL_INLINE(int) try_match_STRING_FLD_REV(RE_State* state, RE_NextNode* + next, RE_Node* node, Py_ssize_t text_pos, RE_Position* next_position) { + Py_ssize_t length; + Py_UCS4 (*char_at)(void* text, Py_ssize_t pos); + RE_EncodingTable* encoding; + RE_LocaleInfo* locale_info; + int (*full_case_fold)(RE_LocaleInfo* locale_info, Py_UCS4 ch, Py_UCS4* + folded); + Py_ssize_t s_pos; + RE_CODE* values; + int folded_len; + int f_pos; + Py_ssize_t start_pos; + Py_UCS4 folded[RE_MAX_FOLDED]; + + length = (Py_ssize_t)node->value_count; + char_at = state->char_at; + encoding = state->encoding; + locale_info = state->locale_info; + full_case_fold = encoding->full_case_fold; + + s_pos = 0; + values = node->values; + folded_len = 0; + f_pos = 0; + start_pos = text_pos; + + while (s_pos < length) { + if (f_pos >= folded_len) { + /* Fetch and casefold another character. 
*/ + if (text_pos <= state->slice_start) { + if (state->partial_side == RE_PARTIAL_LEFT) + return RE_ERROR_PARTIAL; + + return RE_ERROR_FAILURE; + } + + folded_len = full_case_fold(locale_info, char_at(state->text, + text_pos - 1), folded); + f_pos = 0; + } + + if (!same_char_ign(encoding, locale_info, folded[folded_len - f_pos - + 1], values[length - s_pos - 1])) + return RE_ERROR_FAILURE; + + ++s_pos; + ++f_pos; + + if (f_pos >= folded_len) + --text_pos; + } + + if (f_pos < folded_len) + return RE_ERROR_FAILURE; + + next_position->node = next->match_next; + if (next->match_step == 0) + next_position->text_pos = start_pos; + else + next_position->text_pos = text_pos; + + return RE_ERROR_SUCCESS; +} + +/* Checks whether there's a certain string at a position, ignoring case. */ +Py_LOCAL_INLINE(int) try_match_STRING_IGN(RE_State* state, RE_NextNode* next, + RE_Node* node, Py_ssize_t text_pos, RE_Position* next_position) { + Py_ssize_t length; + Py_UCS4 (*char_at)(void* text, Py_ssize_t pos); + RE_EncodingTable* encoding; + RE_LocaleInfo* locale_info; + RE_CODE* values; + Py_ssize_t s_pos; + + length = (Py_ssize_t)node->value_count; + char_at = state->char_at; + encoding = state->encoding; + locale_info = state->locale_info; + values = node->values; + + for (s_pos = 0; s_pos < length; s_pos++) { + if (text_pos + s_pos >= state->slice_end) { + if (state->partial_side == RE_PARTIAL_RIGHT) + return RE_ERROR_PARTIAL; + + return RE_ERROR_FAILURE; + } + + if (!same_char_ign(encoding, locale_info, char_at(state->text, text_pos + + s_pos), values[s_pos])) + return RE_ERROR_FAILURE; + } + + next_position->node = next->match_next; + next_position->text_pos = text_pos + next->match_step; + + return RE_ERROR_SUCCESS; +} + +/* Checks whether there's a certain string at a position, ignoring case, + * backwards. 
+ */ +Py_LOCAL_INLINE(int) try_match_STRING_IGN_REV(RE_State* state, RE_NextNode* + next, RE_Node* node, Py_ssize_t text_pos, RE_Position* next_position) { + Py_ssize_t length; + Py_UCS4 (*char_at)(void* text, Py_ssize_t pos); + RE_EncodingTable* encoding; + RE_LocaleInfo* locale_info; + RE_CODE* values; + Py_ssize_t s_pos; + + length = (Py_ssize_t)node->value_count; + char_at = state->char_at; + encoding = state->encoding; + locale_info = state->locale_info; + values = node->values; + + for (s_pos = 0; s_pos < length; s_pos++) { + if (text_pos - s_pos <= state->slice_start) { + if (state->partial_side == RE_PARTIAL_LEFT) + return RE_ERROR_PARTIAL; + + return RE_ERROR_FAILURE; + } + + if (!same_char_ign(encoding, locale_info, char_at(state->text, text_pos + - s_pos - 1), values[length - s_pos - 1])) + return RE_ERROR_FAILURE; + } + + next_position->node = next->match_next; + next_position->text_pos = text_pos + next->match_step; + + return RE_ERROR_SUCCESS; +} + +/* Checks whether there's a certain string at a position, backwards. */ +Py_LOCAL_INLINE(int) try_match_STRING_REV(RE_State* state, RE_NextNode* next, + RE_Node* node, Py_ssize_t text_pos, RE_Position* next_position) { + Py_ssize_t length; + Py_UCS4 (*char_at)(void* text, Py_ssize_t pos); + RE_CODE* values; + Py_ssize_t s_pos; + + length = (Py_ssize_t)node->value_count; + char_at = state->char_at; + values = node->values; + + for (s_pos = 0; s_pos < length; s_pos++) { + if (text_pos - s_pos <= state->slice_start) { + if (state->partial_side == RE_PARTIAL_LEFT) + return RE_ERROR_PARTIAL; + + return RE_ERROR_FAILURE; + } + + if (!same_char(char_at(state->text, text_pos - s_pos - 1), + values[length - s_pos - 1])) + return RE_ERROR_FAILURE; + } + + next_position->node = next->match_next; + next_position->text_pos = text_pos + next->match_step; + + return RE_ERROR_SUCCESS; +} + +/* Tries a match at the current text position. + * + * Returns the next node and text position if the match succeeds. 
+ */ +Py_LOCAL_INLINE(int) try_match(RE_State* state, RE_NextNode* next, Py_ssize_t + text_pos, RE_Position* next_position) { + RE_Node* test; + int status; + + test = next->test; + + if (test->status & RE_STATUS_FUZZY) { + next_position->node = next->node; + next_position->text_pos = text_pos; + return RE_ERROR_SUCCESS; + } + + switch (test->op) { + case RE_OP_ANY: + status = try_match_ANY(state, test, text_pos); + break; + case RE_OP_ANY_ALL: + status = try_match_ANY_ALL(state, test, text_pos); + break; + case RE_OP_ANY_ALL_REV: + status = try_match_ANY_ALL_REV(state, test, text_pos); + break; + case RE_OP_ANY_REV: + status = try_match_ANY_REV(state, test, text_pos); + break; + case RE_OP_ANY_U: + status = try_match_ANY_U(state, test, text_pos); + break; + case RE_OP_ANY_U_REV: + status = try_match_ANY_U_REV(state, test, text_pos); + break; + case RE_OP_BOUNDARY: + status = try_match_BOUNDARY(state, test, text_pos); + break; + case RE_OP_BRANCH: + status = try_match(state, &test->next_1, text_pos, next_position); + if (status == RE_ERROR_FAILURE) + status = try_match(state, &test->nonstring.next_2, text_pos, + next_position); + break; + case RE_OP_CHARACTER: + status = try_match_CHARACTER(state, test, text_pos); + break; + case RE_OP_CHARACTER_IGN: + status = try_match_CHARACTER_IGN(state, test, text_pos); + break; + case RE_OP_CHARACTER_IGN_REV: + status = try_match_CHARACTER_IGN_REV(state, test, text_pos); + break; + case RE_OP_CHARACTER_REV: + status = try_match_CHARACTER_REV(state, test, text_pos); + break; + case RE_OP_DEFAULT_BOUNDARY: + status = try_match_DEFAULT_BOUNDARY(state, test, text_pos); + break; + case RE_OP_DEFAULT_END_OF_WORD: + status = try_match_DEFAULT_END_OF_WORD(state, test, text_pos); + break; + case RE_OP_DEFAULT_START_OF_WORD: + status = try_match_DEFAULT_START_OF_WORD(state, test, text_pos); + break; + case RE_OP_END_OF_LINE: + status = try_match_END_OF_LINE(state, test, text_pos); + break; + case RE_OP_END_OF_LINE_U: + status = 
try_match_END_OF_LINE_U(state, test, text_pos); + break; + case RE_OP_END_OF_STRING: + status = try_match_END_OF_STRING(state, test, text_pos); + break; + case RE_OP_END_OF_STRING_LINE: + status = try_match_END_OF_STRING_LINE(state, test, text_pos); + break; + case RE_OP_END_OF_STRING_LINE_U: + status = try_match_END_OF_STRING_LINE_U(state, test, text_pos); + break; + case RE_OP_END_OF_WORD: + status = try_match_END_OF_WORD(state, test, text_pos); + break; + case RE_OP_GRAPHEME_BOUNDARY: + status = try_match_GRAPHEME_BOUNDARY(state, test, text_pos); + break; + case RE_OP_PROPERTY: + status = try_match_PROPERTY(state, test, text_pos); + break; + case RE_OP_PROPERTY_IGN: + status = try_match_PROPERTY_IGN(state, test, text_pos); + break; + case RE_OP_PROPERTY_IGN_REV: + status = try_match_PROPERTY_IGN_REV(state, test, text_pos); + break; + case RE_OP_PROPERTY_REV: + status = try_match_PROPERTY_REV(state, test, text_pos); + break; + case RE_OP_RANGE: + status = try_match_RANGE(state, test, text_pos); + break; + case RE_OP_RANGE_IGN: + status = try_match_RANGE_IGN(state, test, text_pos); + break; + case RE_OP_RANGE_IGN_REV: + status = try_match_RANGE_IGN_REV(state, test, text_pos); + break; + case RE_OP_RANGE_REV: + status = try_match_RANGE_REV(state, test, text_pos); + break; + case RE_OP_SEARCH_ANCHOR: + status = try_match_SEARCH_ANCHOR(state, test, text_pos); + break; + case RE_OP_SET_DIFF: + case RE_OP_SET_INTER: + case RE_OP_SET_SYM_DIFF: + case RE_OP_SET_UNION: + status = try_match_SET(state, test, text_pos); + break; + case RE_OP_SET_DIFF_IGN: + case RE_OP_SET_INTER_IGN: + case RE_OP_SET_SYM_DIFF_IGN: + case RE_OP_SET_UNION_IGN: + status = try_match_SET_IGN(state, test, text_pos); + break; + case RE_OP_SET_DIFF_IGN_REV: + case RE_OP_SET_INTER_IGN_REV: + case RE_OP_SET_SYM_DIFF_IGN_REV: + case RE_OP_SET_UNION_IGN_REV: + status = try_match_SET_IGN_REV(state, test, text_pos); + break; + case RE_OP_SET_DIFF_REV: + case RE_OP_SET_INTER_REV: + case 
RE_OP_SET_SYM_DIFF_REV: + case RE_OP_SET_UNION_REV: + status = try_match_SET_REV(state, test, text_pos); + break; + case RE_OP_START_OF_LINE: + status = try_match_START_OF_LINE(state, test, text_pos); + break; + case RE_OP_START_OF_LINE_U: + status = try_match_START_OF_LINE_U(state, test, text_pos); + break; + case RE_OP_START_OF_STRING: + status = try_match_START_OF_STRING(state, test, text_pos); + break; + case RE_OP_START_OF_WORD: + status = try_match_START_OF_WORD(state, test, text_pos); + break; + case RE_OP_STRING: + return try_match_STRING(state, next, test, text_pos, next_position); + case RE_OP_STRING_FLD: + return try_match_STRING_FLD(state, next, test, text_pos, + next_position); + case RE_OP_STRING_FLD_REV: + return try_match_STRING_FLD_REV(state, next, test, text_pos, + next_position); + case RE_OP_STRING_IGN: + return try_match_STRING_IGN(state, next, test, text_pos, + next_position); + case RE_OP_STRING_IGN_REV: + return try_match_STRING_IGN_REV(state, next, test, text_pos, + next_position); + case RE_OP_STRING_REV: + return try_match_STRING_REV(state, next, test, text_pos, + next_position); + default: + next_position->node = next->node; + next_position->text_pos = text_pos; + return RE_ERROR_SUCCESS; + } + + if (status != RE_ERROR_SUCCESS) + return status; + + next_position->node = next->match_next; + next_position->text_pos = text_pos + next->match_step; + + return RE_ERROR_SUCCESS; +} + +/* Searches for a word boundary. */ +Py_LOCAL_INLINE(Py_ssize_t) search_start_BOUNDARY(RE_State* state, RE_Node* + node, Py_ssize_t text_pos, BOOL* is_partial) { + BOOL (*at_boundary)(RE_State* state, Py_ssize_t text_pos); + + at_boundary = state->encoding->at_boundary; + + *is_partial = FALSE; + + for (;;) { + if (at_boundary(state, text_pos) == node->match) + return text_pos; + + if (text_pos >= state->slice_end) + return -1; + + ++text_pos; + } +} + +/* Searches for a word boundary, backwards. 
*/ +Py_LOCAL_INLINE(Py_ssize_t) search_start_BOUNDARY_rev(RE_State* state, RE_Node* + node, Py_ssize_t text_pos, BOOL* is_partial) { + BOOL (*at_boundary)(RE_State* state, Py_ssize_t text_pos); + + at_boundary = state->encoding->at_boundary; + + *is_partial = FALSE; + + for (;;) { + if (at_boundary(state, text_pos) == node->match) + return text_pos; + + if (text_pos <= state->slice_start) + return -1; + + --text_pos; + } +} + +/* Searches for a default word boundary. */ +Py_LOCAL_INLINE(Py_ssize_t) search_start_DEFAULT_BOUNDARY(RE_State* state, + RE_Node* node, Py_ssize_t text_pos, BOOL* is_partial) { + BOOL (*at_default_boundary)(RE_State* state, Py_ssize_t text_pos); + + at_default_boundary = state->encoding->at_default_boundary; + + *is_partial = FALSE; + + for (;;) { + if (at_default_boundary(state, text_pos) == node->match) + return text_pos; + + if (text_pos >= state->slice_end) + return -1; + + ++text_pos; + } +} + +/* Searches for a default word boundary, backwards. */ +Py_LOCAL_INLINE(Py_ssize_t) search_start_DEFAULT_BOUNDARY_rev(RE_State* state, + RE_Node* node, Py_ssize_t text_pos, BOOL* is_partial) { + BOOL (*at_default_boundary)(RE_State* state, Py_ssize_t text_pos); + + at_default_boundary = state->encoding->at_default_boundary; + + *is_partial = FALSE; + + for (;;) { + if (at_default_boundary(state, text_pos) == node->match) + return text_pos; + + if (text_pos <= state->slice_start) + return -1; + + --text_pos; + } +} + +/* Searches for the default end of a word. 
*/ +Py_LOCAL_INLINE(Py_ssize_t) search_start_DEFAULT_END_OF_WORD(RE_State* state, + RE_Node* node, Py_ssize_t text_pos, BOOL* is_partial) { + BOOL (*at_default_word_end)(RE_State* state, Py_ssize_t text_pos); + + at_default_word_end = state->encoding->at_default_word_end; + + *is_partial = FALSE; + + for (;;) { + if (at_default_word_end(state, text_pos) == node->match) + return text_pos; + + if (text_pos >= state->slice_end) + return -1; + + ++text_pos; + } +} + +/* Searches for the default end of a word, backwards. */ +Py_LOCAL_INLINE(Py_ssize_t) search_start_DEFAULT_END_OF_WORD_rev(RE_State* + state, RE_Node* node, Py_ssize_t text_pos, BOOL* is_partial) { + BOOL (*at_default_word_end)(RE_State* state, Py_ssize_t text_pos); + + at_default_word_end = state->encoding->at_default_word_end; + + *is_partial = FALSE; + + for (;;) { + if (at_default_word_end(state, text_pos) == node->match) + return text_pos; + + if (text_pos <= state->slice_start) + return -1; + + --text_pos; + } +} + +/* Searches for the default start of a word. */ +Py_LOCAL_INLINE(Py_ssize_t) search_start_DEFAULT_START_OF_WORD(RE_State* state, + RE_Node* node, Py_ssize_t text_pos, BOOL* is_partial) { + BOOL (*at_default_word_start)(RE_State* state, Py_ssize_t text_pos); + + at_default_word_start = state->encoding->at_default_word_start; + + *is_partial = FALSE; + + for (;;) { + if (at_default_word_start(state, text_pos) == node->match) + return text_pos; + + if (text_pos >= state->slice_end) + return -1; + + ++text_pos; + } +} + +/* Searches for the default start of a word, backwards. 
*/ +Py_LOCAL_INLINE(Py_ssize_t) search_start_DEFAULT_START_OF_WORD_rev(RE_State* + state, RE_Node* node, Py_ssize_t text_pos, BOOL* is_partial) { + BOOL (*at_default_word_start)(RE_State* state, Py_ssize_t text_pos); + + at_default_word_start = state->encoding->at_default_word_start; + + *is_partial = FALSE; + + for (;;) { + if (at_default_word_start(state, text_pos) == node->match) + return text_pos; + + if (text_pos <= state->slice_start) + return -1; + + --text_pos; + } +} + +/* Searches for the end of line. */ +Py_LOCAL_INLINE(Py_ssize_t) search_start_END_OF_LINE(RE_State* state, RE_Node* + node, Py_ssize_t text_pos, BOOL* is_partial) { + *is_partial = FALSE; + + for (;;) { + if (text_pos >= state->text_length || state->char_at(state->text, + text_pos) == '\n') + return text_pos; + + if (text_pos >= state->slice_end) + return -1; + + ++text_pos; + } +} + +/* Searches for the end of line, backwards. */ +Py_LOCAL_INLINE(Py_ssize_t) search_start_END_OF_LINE_rev(RE_State* state, + RE_Node* node, Py_ssize_t text_pos, BOOL* is_partial) { + *is_partial = FALSE; + + for (;;) { + if (text_pos >= state->text_length || state->char_at(state->text, + text_pos) == '\n') + return text_pos; + + if (text_pos <= state->slice_start) + return -1; + + --text_pos; + } +} + +/* Searches for the end of the string. */ +Py_LOCAL_INLINE(Py_ssize_t) search_start_END_OF_STRING(RE_State* state, + RE_Node* node, Py_ssize_t text_pos, BOOL* is_partial) { + *is_partial = FALSE; + + if (state->slice_end >= state->text_length) + return state->text_length; + + return -1; +} + +/* Searches for the end of the string, backwards. */ +Py_LOCAL_INLINE(Py_ssize_t) search_start_END_OF_STRING_rev(RE_State* state, + RE_Node* node, Py_ssize_t text_pos, BOOL* is_partial) { + *is_partial = FALSE; + + if (text_pos >= state->text_length) + return text_pos; + + return -1; +} + +/* Searches for the end of the string or line. 
*/ +Py_LOCAL_INLINE(Py_ssize_t) search_start_END_OF_STRING_LINE(RE_State* state, + RE_Node* node, Py_ssize_t text_pos, BOOL* is_partial) { + *is_partial = FALSE; + + if (text_pos <= state->final_newline) + text_pos = state->final_newline; + else if (text_pos <= state->text_length) + text_pos = state->text_length; + + if (text_pos > state->slice_end) + return -1; + + if (text_pos >= state->text_length) + return text_pos; + + return text_pos; +} + +/* Searches for the end of the string or line, backwards. */ +Py_LOCAL_INLINE(Py_ssize_t) search_start_END_OF_STRING_LINE_rev(RE_State* + state, RE_Node* node, Py_ssize_t text_pos, BOOL* is_partial) { + *is_partial = FALSE; + + if (text_pos >= state->text_length) + text_pos = state->text_length; + else if (text_pos >= state->final_newline) + text_pos = state->final_newline; + else + return -1; + + if (text_pos < state->slice_start) + return -1; + + if (text_pos <= 0) + return text_pos; + + return text_pos; +} + +/* Searches for the end of a word. */ +Py_LOCAL_INLINE(Py_ssize_t) search_start_END_OF_WORD(RE_State* state, RE_Node* + node, Py_ssize_t text_pos, BOOL* is_partial) { + BOOL (*at_word_end)(RE_State* state, Py_ssize_t text_pos); + + at_word_end = state->encoding->at_word_end; + + *is_partial = FALSE; + + for (;;) { + if (at_word_end(state, text_pos) == node->match) + return text_pos; + + if (text_pos >= state->slice_end) + return -1; + + ++text_pos; + } +} + +/* Searches for the end of a word, backwards. */ +Py_LOCAL_INLINE(Py_ssize_t) search_start_END_OF_WORD_rev(RE_State* state, + RE_Node* node, Py_ssize_t text_pos, BOOL* is_partial) { + BOOL (*at_word_end)(RE_State* state, Py_ssize_t text_pos); + + at_word_end = state->encoding->at_word_end; + + *is_partial = FALSE; + + for (;;) { + if (at_word_end(state, text_pos) == node->match) + return text_pos; + + if (text_pos <= state->slice_start) + return -1; + + --text_pos; + } +} + +/* Searches for a grapheme boundary. 
*/ +Py_LOCAL_INLINE(Py_ssize_t) search_start_GRAPHEME_BOUNDARY(RE_State* state, + RE_Node* node, Py_ssize_t text_pos, BOOL* is_partial) { + BOOL (*at_grapheme_boundary)(RE_State* state, Py_ssize_t text_pos); + + at_grapheme_boundary = state->encoding->at_grapheme_boundary; + + *is_partial = FALSE; + + for (;;) { + if (at_grapheme_boundary(state, text_pos) == node->match) + return text_pos; + + if (text_pos >= state->slice_end) + return -1; + + ++text_pos; + } +} + +/* Searches for a grapheme boundary, backwards. */ +Py_LOCAL_INLINE(Py_ssize_t) search_start_GRAPHEME_BOUNDARY_rev(RE_State* state, + RE_Node* node, Py_ssize_t text_pos, BOOL* is_partial) { + BOOL (*at_grapheme_boundary)(RE_State* state, Py_ssize_t text_pos); + + at_grapheme_boundary = state->encoding->at_grapheme_boundary; + + *is_partial = FALSE; + + for (;;) { + if (at_grapheme_boundary(state, text_pos) == node->match) + return text_pos; + + if (text_pos <= state->slice_start) + return -1; + + --text_pos; + } +} + +/* Searches for the start of line. */ +Py_LOCAL_INLINE(Py_ssize_t) search_start_START_OF_LINE(RE_State* state, + RE_Node* node, Py_ssize_t text_pos, BOOL* is_partial) { + *is_partial = FALSE; + + for (;;) { + if (text_pos <= 0 || state->char_at(state->text, text_pos - 1) == '\n') + return text_pos; + + if (text_pos >= state->slice_end) + return -1; + + ++text_pos; + } +} + +/* Searches for the start of line, backwards. */ +Py_LOCAL_INLINE(Py_ssize_t) search_start_START_OF_LINE_rev(RE_State* state, + RE_Node* node, Py_ssize_t text_pos, BOOL* is_partial) { + *is_partial = FALSE; + + for (;;) { + if (text_pos <= 0 || state->char_at(state->text, text_pos - 1) == '\n') + return text_pos; + + if (text_pos <= state->slice_start) + return -1; + + --text_pos; + } +} + +/* Searches for the start of the string. 
*/ +Py_LOCAL_INLINE(Py_ssize_t) search_start_START_OF_STRING(RE_State* state, + RE_Node* node, Py_ssize_t text_pos, BOOL* is_partial) { + *is_partial = FALSE; + + if (text_pos <= 0) + return text_pos; + + return -1; +} + +/* Searches for the start of the string, backwards. */ +Py_LOCAL_INLINE(Py_ssize_t) search_start_START_OF_STRING_rev(RE_State* state, + RE_Node* node, Py_ssize_t text_pos, BOOL* is_partial) { + *is_partial = FALSE; + + if (state->slice_start <= 0) + return 0; + + return -1; +} + +/* Searches for the start of a word. */ +Py_LOCAL_INLINE(Py_ssize_t) search_start_START_OF_WORD(RE_State* state, + RE_Node* node, Py_ssize_t text_pos, BOOL* is_partial) { + BOOL (*at_word_start)(RE_State* state, Py_ssize_t text_pos); + + at_word_start = state->encoding->at_word_start; + + *is_partial = FALSE; + + for (;;) { + if (at_word_start(state, text_pos) == node->match) + return text_pos; + + if (text_pos >= state->slice_end) + return -1; + + ++text_pos; + } +} + +/* Searches for the start of a word, backwards. */ +Py_LOCAL_INLINE(Py_ssize_t) search_start_START_OF_WORD_rev(RE_State* state, + RE_Node* node, Py_ssize_t text_pos, BOOL* is_partial) { + BOOL (*at_word_start)(RE_State* state, Py_ssize_t text_pos); + + at_word_start = state->encoding->at_word_start; + + *is_partial = FALSE; + + for (;;) { + if (at_word_start(state, text_pos) == node->match) + return text_pos; + + if (text_pos <= state->slice_start) + return -1; + + --text_pos; + } +} + +/* Searches for a string. */ +Py_LOCAL_INLINE(Py_ssize_t) search_start_STRING(RE_SafeState* safe_state, + RE_Node* node, Py_ssize_t text_pos, BOOL* is_partial) { + RE_State* state; + + state = safe_state->re_state; + + *is_partial = FALSE; + + if ((node->status & RE_STATUS_REQUIRED) && text_pos == state->req_pos) + return text_pos; + + return string_search(safe_state, node, text_pos, state->slice_end, + is_partial); +} + +/* Searches for a string, ignoring case. 
*/ +Py_LOCAL_INLINE(Py_ssize_t) search_start_STRING_FLD(RE_SafeState* safe_state, + RE_Node* node, Py_ssize_t text_pos, Py_ssize_t* new_pos, BOOL* is_partial) { + RE_State* state; + + state = safe_state->re_state; + + *is_partial = FALSE; + + if ((node->status & RE_STATUS_REQUIRED) && text_pos == state->req_pos) { + *new_pos = state->req_end; + return text_pos; + } + + return string_search_fld(safe_state, node, text_pos, state->slice_end, + new_pos, is_partial); +} + +/* Searches for a string, ignoring case, backwards. */ +Py_LOCAL_INLINE(Py_ssize_t) search_start_STRING_FLD_REV(RE_SafeState* + safe_state, RE_Node* node, Py_ssize_t text_pos, Py_ssize_t* new_pos, BOOL* + is_partial) { + RE_State* state; + + state = safe_state->re_state; + + *is_partial = FALSE; + + if ((node->status & RE_STATUS_REQUIRED) && text_pos == state->req_pos) { + *new_pos = state->req_end; + return text_pos; + } + + return string_search_fld_rev(safe_state, node, text_pos, + state->slice_start, new_pos, is_partial); +} + +/* Searches for a string, ignoring case. */ +Py_LOCAL_INLINE(Py_ssize_t) search_start_STRING_IGN(RE_SafeState* safe_state, + RE_Node* node, Py_ssize_t text_pos, BOOL* is_partial) { + RE_State* state; + + state = safe_state->re_state; + + *is_partial = FALSE; + + if ((node->status & RE_STATUS_REQUIRED) && text_pos == state->req_pos) + return text_pos; + + return string_search_ign(safe_state, node, text_pos, state->slice_end, + is_partial); +} + +/* Searches for a string, ignoring case, backwards. */ +Py_LOCAL_INLINE(Py_ssize_t) search_start_STRING_IGN_REV(RE_SafeState* + safe_state, RE_Node* node, Py_ssize_t text_pos, BOOL* is_partial) { + RE_State* state; + + state = safe_state->re_state; + + *is_partial = FALSE; + + if ((node->status & RE_STATUS_REQUIRED) && text_pos == state->req_pos) + return text_pos; + + return string_search_ign_rev(safe_state, node, text_pos, + state->slice_start, is_partial); +} + +/* Searches for a string, backwards. 
*/ +Py_LOCAL_INLINE(Py_ssize_t) search_start_STRING_REV(RE_SafeState* safe_state, + RE_Node* node, Py_ssize_t text_pos, BOOL* is_partial) { + RE_State* state; + + state = safe_state->re_state; + + *is_partial = FALSE; + + if ((node->status & RE_STATUS_REQUIRED) && text_pos == state->req_pos) + return text_pos; + + return string_search_rev(safe_state, node, text_pos, state->slice_start, + is_partial); +} + +/* Searches for the start of a match. */ +Py_LOCAL_INLINE(int) search_start(RE_SafeState* safe_state, RE_NextNode* next, + RE_Position* new_position, int search_index) { + RE_State* state; + Py_ssize_t start_pos; + RE_Node* test; + RE_Node* node; + RE_SearchPosition* info; + Py_ssize_t text_pos; + + state = safe_state->re_state; + + start_pos = state->text_pos; + TRACE(("<<search_start>> at %d\n", start_pos)) + + test = next->test; + node = next->node; + + if (state->reverse) { + if (start_pos < state->slice_start) { + if (state->partial_side == RE_PARTIAL_LEFT) { + new_position->text_pos = state->slice_start; + return RE_ERROR_PARTIAL; + } + + return RE_ERROR_FAILURE; + } + } else { + if (start_pos > state->slice_end) { + if (state->partial_side == RE_PARTIAL_RIGHT) { + new_position->text_pos = state->slice_end; + return RE_ERROR_PARTIAL; + } + } + } + + if (test->status & RE_STATUS_FUZZY) { + /* Don't call 'search_start' again. 
*/ + state->pattern->do_search_start = FALSE; + + state->match_pos = start_pos; + new_position->node = node; + new_position->text_pos = start_pos; + + return RE_ERROR_SUCCESS; + } + +again: + if (!state->pattern->is_fuzzy && state->partial_side == RE_PARTIAL_NONE) { + if (state->reverse) { + if (start_pos - state->min_width < state->slice_start) + return RE_ERROR_FAILURE; + } else { + if (start_pos + state->min_width > state->slice_end) + return RE_ERROR_FAILURE; + } + } + + if (search_index < MAX_SEARCH_POSITIONS) { + info = &state->search_positions[search_index]; + if (state->reverse) { + if (info->start_pos >= 0 && info->start_pos >= start_pos && + start_pos >= info->match_pos) { + state->match_pos = info->match_pos; + + new_position->text_pos = state->match_pos; + new_position->node = node; + + return RE_ERROR_SUCCESS; + } + } else { + if (info->start_pos >= 0 && info->start_pos <= start_pos && + start_pos <= info->match_pos) { + state->match_pos = info->match_pos; + + new_position->text_pos = state->match_pos; + new_position->node = node; + + return RE_ERROR_SUCCESS; + } + } + } else + info = NULL; + + switch (test->op) { + case RE_OP_ANY: + start_pos = match_many_ANY(state, test, start_pos, state->slice_end, + FALSE); + + if (start_pos >= state->text_length) { + if (state->partial_side == RE_PARTIAL_RIGHT) { + new_position->text_pos = start_pos; + return RE_ERROR_PARTIAL; + } + } + + if (start_pos >= state->slice_end) + return RE_ERROR_FAILURE; + break; + case RE_OP_ANY_ALL: + case RE_OP_ANY_ALL_REV: + break; + case RE_OP_ANY_REV: + start_pos = match_many_ANY_REV(state, test, start_pos, + state->slice_start, FALSE); + + if (start_pos <= 0) { + if (state->partial_side == RE_PARTIAL_LEFT) { + new_position->text_pos = start_pos; + return RE_ERROR_PARTIAL; + } + } + + if (start_pos <= state->slice_start) + return RE_ERROR_FAILURE; + break; + case RE_OP_ANY_U: + start_pos = match_many_ANY_U(state, test, start_pos, state->slice_end, + FALSE); + + if (start_pos >= 
state->text_length) { + if (state->partial_side == RE_PARTIAL_RIGHT) { + new_position->text_pos = start_pos; + return RE_ERROR_PARTIAL; + } + } + + if (start_pos >= state->slice_end) + return RE_ERROR_FAILURE; + break; + case RE_OP_ANY_U_REV: + start_pos = match_many_ANY_U_REV(state, test, start_pos, + state->slice_start, FALSE); + + if (start_pos <= 0) { + if (state->partial_side == RE_PARTIAL_LEFT) { + new_position->text_pos = start_pos; + return RE_ERROR_PARTIAL; + } + } + + if (start_pos <= state->slice_start) + return RE_ERROR_FAILURE; + break; + case RE_OP_BOUNDARY: + { + BOOL is_partial; + + if (state->reverse) + start_pos = search_start_BOUNDARY_rev(state, test, start_pos, + &is_partial); + else + start_pos = search_start_BOUNDARY(state, test, start_pos, + &is_partial); + + if (start_pos < 0) + return RE_ERROR_FAILURE; + + if (is_partial) { + new_position->text_pos = start_pos; + return RE_ERROR_PARTIAL; + } + break; + } + case RE_OP_CHARACTER: + start_pos = match_many_CHARACTER(state, test, start_pos, + state->slice_end, FALSE); + + if (start_pos >= state->text_length) { + if (state->partial_side == RE_PARTIAL_RIGHT) { + new_position->text_pos = start_pos; + return RE_ERROR_PARTIAL; + } + } + + if (start_pos >= state->slice_end) + return RE_ERROR_FAILURE; + break; + case RE_OP_CHARACTER_IGN: + start_pos = match_many_CHARACTER_IGN(state, test, start_pos, + state->slice_end, FALSE); + + if (start_pos >= state->text_length) { + if (state->partial_side == RE_PARTIAL_RIGHT) { + new_position->text_pos = start_pos; + return RE_ERROR_PARTIAL; + } + } + + if (start_pos >= state->slice_end) + return RE_ERROR_FAILURE; + break; + case RE_OP_CHARACTER_IGN_REV: + start_pos = match_many_CHARACTER_IGN_REV(state, test, start_pos, + state->slice_start, FALSE); + + if (start_pos <= 0) { + if (state->partial_side == RE_PARTIAL_LEFT) { + new_position->text_pos = start_pos; + return RE_ERROR_PARTIAL; + } + } + + if (start_pos <= state->slice_start) + return RE_ERROR_FAILURE; + 
break; + case RE_OP_CHARACTER_REV: + start_pos = match_many_CHARACTER_REV(state, test, start_pos, + state->slice_start, FALSE); + + if (start_pos <= 0) { + if (state->partial_side == RE_PARTIAL_LEFT) { + new_position->text_pos = start_pos; + return RE_ERROR_PARTIAL; + } + } + + if (start_pos <= state->slice_start) + return RE_ERROR_FAILURE; + break; + case RE_OP_DEFAULT_BOUNDARY: + { + BOOL is_partial; + + if (state->reverse) + start_pos = search_start_DEFAULT_BOUNDARY_rev(state, test, + start_pos, &is_partial); + else + start_pos = search_start_DEFAULT_BOUNDARY(state, test, start_pos, + &is_partial); + + if (start_pos < 0) + return RE_ERROR_FAILURE; + + if (is_partial) { + new_position->text_pos = start_pos; + return RE_ERROR_PARTIAL; + } + break; + } + case RE_OP_DEFAULT_END_OF_WORD: + { + BOOL is_partial; + + if (state->reverse) + start_pos = search_start_DEFAULT_END_OF_WORD_rev(state, test, + start_pos, &is_partial); + else + start_pos = search_start_DEFAULT_END_OF_WORD(state, test, + start_pos, &is_partial); + + if (start_pos < 0) + return RE_ERROR_FAILURE; + + if (is_partial) { + new_position->text_pos = start_pos; + return RE_ERROR_PARTIAL; + } + break; + } + case RE_OP_DEFAULT_START_OF_WORD: + { + BOOL is_partial; + + if (state->reverse) + start_pos = search_start_DEFAULT_START_OF_WORD_rev(state, test, + start_pos, &is_partial); + else + start_pos = search_start_DEFAULT_START_OF_WORD(state, test, + start_pos, &is_partial); + + if (start_pos < 0) + return RE_ERROR_FAILURE; + + if (is_partial) { + new_position->text_pos = start_pos; + return RE_ERROR_PARTIAL; + } + break; + } + case RE_OP_END_OF_LINE: + { + BOOL is_partial; + + if (state->reverse) + start_pos = search_start_END_OF_LINE_rev(state, test, start_pos, + &is_partial); + else + start_pos = search_start_END_OF_LINE(state, test, start_pos, + &is_partial); + + if (start_pos < 0) + return RE_ERROR_FAILURE; + + if (is_partial) { + new_position->text_pos = start_pos; + return RE_ERROR_PARTIAL; + } + 
break; + } + case RE_OP_END_OF_STRING: + { + BOOL is_partial; + + if (state->reverse) + start_pos = search_start_END_OF_STRING_rev(state, test, start_pos, + &is_partial); + else + start_pos = search_start_END_OF_STRING(state, test, start_pos, + &is_partial); + + if (start_pos < 0) + return RE_ERROR_FAILURE; + + if (is_partial) { + new_position->text_pos = start_pos; + return RE_ERROR_PARTIAL; + } + break; + } + case RE_OP_END_OF_STRING_LINE: + { + BOOL is_partial; + + if (state->reverse) + start_pos = search_start_END_OF_STRING_LINE_rev(state, test, + start_pos, &is_partial); + else + start_pos = search_start_END_OF_STRING_LINE(state, test, start_pos, + &is_partial); + + if (start_pos < 0) + return RE_ERROR_FAILURE; + + if (is_partial) { + new_position->text_pos = start_pos; + return RE_ERROR_PARTIAL; + } + break; + } + case RE_OP_END_OF_WORD: + { + BOOL is_partial; + + if (state->reverse) + start_pos = search_start_END_OF_WORD_rev(state, test, start_pos, + &is_partial); + else + start_pos = search_start_END_OF_WORD(state, test, start_pos, + &is_partial); + + if (start_pos < 0) + return RE_ERROR_FAILURE; + + if (is_partial) { + new_position->text_pos = start_pos; + return RE_ERROR_PARTIAL; + } + break; + } + case RE_OP_GRAPHEME_BOUNDARY: + { + BOOL is_partial; + + if (state->reverse) + start_pos = search_start_GRAPHEME_BOUNDARY_rev(state, test, + start_pos, &is_partial); + else + start_pos = search_start_GRAPHEME_BOUNDARY(state, test, start_pos, + &is_partial); + + if (start_pos < 0) + return RE_ERROR_FAILURE; + + if (is_partial) { + new_position->text_pos = start_pos; + return RE_ERROR_PARTIAL; + } + break; + } + case RE_OP_PROPERTY: + start_pos = match_many_PROPERTY(state, test, start_pos, + state->slice_end, FALSE); + + if (start_pos >= state->text_length) { + if (state->partial_side == RE_PARTIAL_RIGHT) { + new_position->text_pos = start_pos; + return RE_ERROR_PARTIAL; + } + } + + if (start_pos >= state->slice_end) + return RE_ERROR_FAILURE; + break; + case 
RE_OP_PROPERTY_IGN: + start_pos = match_many_PROPERTY_IGN(state, test, start_pos, + state->slice_end, FALSE); + + if (start_pos >= state->text_length) { + if (state->partial_side == RE_PARTIAL_RIGHT) { + new_position->text_pos = start_pos; + return RE_ERROR_PARTIAL; + } + } + + if (start_pos >= state->slice_end) + return RE_ERROR_FAILURE; + break; + case RE_OP_PROPERTY_IGN_REV: + start_pos = match_many_PROPERTY_IGN_REV(state, test, start_pos, + state->slice_start, FALSE); + + if (start_pos <= 0) { + if (state->partial_side == RE_PARTIAL_LEFT) { + new_position->text_pos = start_pos; + return RE_ERROR_PARTIAL; + } + } + + if (start_pos <= state->slice_start) + return RE_ERROR_FAILURE; + break; + case RE_OP_PROPERTY_REV: + start_pos = match_many_PROPERTY_REV(state, test, start_pos, + state->slice_start, FALSE); + + if (start_pos <= 0) { + if (state->partial_side == RE_PARTIAL_LEFT) { + new_position->text_pos = start_pos; + return RE_ERROR_PARTIAL; + } + } + + if (start_pos <= state->slice_start) + return RE_ERROR_FAILURE; + break; + case RE_OP_RANGE: + start_pos = match_many_RANGE(state, test, start_pos, state->slice_end, + FALSE); + + if (start_pos >= state->text_length) { + if (state->partial_side == RE_PARTIAL_RIGHT) { + new_position->text_pos = start_pos; + return RE_ERROR_PARTIAL; + } + } + + if (start_pos >= state->slice_end) + return RE_ERROR_FAILURE; + break; + case RE_OP_RANGE_IGN: + start_pos = match_many_RANGE_IGN(state, test, start_pos, + state->slice_end, FALSE); + + if (start_pos >= state->text_length) { + if (state->partial_side == RE_PARTIAL_RIGHT) { + new_position->text_pos = start_pos; + return RE_ERROR_PARTIAL; + } + } + + if (start_pos >= state->slice_end) + return RE_ERROR_FAILURE; + break; + case RE_OP_RANGE_IGN_REV: + start_pos = match_many_RANGE_IGN_REV(state, test, start_pos, + state->slice_start, FALSE); + + if (start_pos <= 0) { + if (state->partial_side == RE_PARTIAL_LEFT) { + new_position->text_pos = start_pos; + return RE_ERROR_PARTIAL; + 
} + } + + if (start_pos <= state->slice_start) + return RE_ERROR_FAILURE; + break; + case RE_OP_RANGE_REV: + start_pos = match_many_RANGE_REV(state, test, start_pos, + state->slice_start, FALSE); + + if (start_pos <= 0) { + if (state->partial_side == RE_PARTIAL_LEFT) { + new_position->text_pos = start_pos; + return RE_ERROR_PARTIAL; + } + } + + if (start_pos <= state->slice_start) + return RE_ERROR_FAILURE; + break; + case RE_OP_SEARCH_ANCHOR: + if (state->reverse) { + if (start_pos < state->search_anchor) + return RE_ERROR_FAILURE; + } else { + if (start_pos > state->search_anchor) + return RE_ERROR_FAILURE; + } + + start_pos = state->search_anchor; + break; + case RE_OP_SET_DIFF: + case RE_OP_SET_INTER: + case RE_OP_SET_SYM_DIFF: + case RE_OP_SET_UNION: + start_pos = match_many_SET(state, test, start_pos, state->slice_end, + FALSE); + + if (start_pos >= state->text_length) { + if (state->partial_side == RE_PARTIAL_RIGHT) { + new_position->text_pos = start_pos; + return RE_ERROR_PARTIAL; + } + } + + if (start_pos >= state->slice_end) + return FALSE; + break; + case RE_OP_SET_DIFF_IGN: + case RE_OP_SET_INTER_IGN: + case RE_OP_SET_SYM_DIFF_IGN: + case RE_OP_SET_UNION_IGN: + start_pos = match_many_SET_IGN(state, test, start_pos, + state->slice_end, FALSE); + + if (start_pos >= state->text_length) { + if (state->partial_side == RE_PARTIAL_RIGHT) { + new_position->text_pos = start_pos; + return RE_ERROR_PARTIAL; + } + } + + if (start_pos >= state->slice_end) + return FALSE; + break; + case RE_OP_SET_DIFF_IGN_REV: + case RE_OP_SET_INTER_IGN_REV: + case RE_OP_SET_SYM_DIFF_IGN_REV: + case RE_OP_SET_UNION_IGN_REV: + start_pos = match_many_SET_IGN_REV(state, test, start_pos, + state->slice_start, FALSE); + + if (start_pos <= 0) { + if (state->partial_side == RE_PARTIAL_LEFT) { + new_position->text_pos = start_pos; + return RE_ERROR_PARTIAL; + } + } + + if (start_pos <= state->slice_start) + return FALSE; + break; + case RE_OP_SET_DIFF_REV: + case RE_OP_SET_INTER_REV: + case 
RE_OP_SET_SYM_DIFF_REV: + case RE_OP_SET_UNION_REV: + start_pos = match_many_SET_REV(state, test, start_pos, + state->slice_start, FALSE); + + if (start_pos <= 0) { + if (state->partial_side == RE_PARTIAL_LEFT) { + new_position->text_pos = start_pos; + return RE_ERROR_PARTIAL; + } + } + + if (start_pos <= state->slice_start) + return FALSE; + break; + case RE_OP_START_OF_LINE: + { + BOOL is_partial; + + if (state->reverse) + start_pos = search_start_START_OF_LINE_rev(state, test, start_pos, + &is_partial); + else + start_pos = search_start_START_OF_LINE(state, test, start_pos, + &is_partial); + + if (start_pos < 0) + return RE_ERROR_FAILURE; + + if (is_partial) { + new_position->text_pos = start_pos; + return RE_ERROR_PARTIAL; + } + break; + } + case RE_OP_START_OF_STRING: + { + BOOL is_partial; + + if (state->reverse) + start_pos = search_start_START_OF_STRING_rev(state, test, + start_pos, &is_partial); + else + start_pos = search_start_START_OF_STRING(state, test, start_pos, + &is_partial); + + if (start_pos < 0) + return RE_ERROR_FAILURE; + + if (is_partial) { + new_position->text_pos = start_pos; + return RE_ERROR_PARTIAL; + } + break; + } + case RE_OP_START_OF_WORD: + { + BOOL is_partial; + + if (state->reverse) + start_pos = search_start_START_OF_WORD_rev(state, test, start_pos, + &is_partial); + else + start_pos = search_start_START_OF_WORD(state, test, start_pos, + &is_partial); + + if (start_pos < 0) + return RE_ERROR_FAILURE; + + if (is_partial) { + new_position->text_pos = start_pos; + return RE_ERROR_PARTIAL; + } + break; + } + case RE_OP_STRING: + { + BOOL is_partial; + + start_pos = search_start_STRING(safe_state, test, start_pos, + &is_partial); + if (start_pos < 0) + return RE_ERROR_FAILURE; + + if (is_partial) { + new_position->text_pos = start_pos; + return RE_ERROR_PARTIAL; + } + break; + } + case RE_OP_STRING_FLD: + { + Py_ssize_t new_pos; + BOOL is_partial; + + start_pos = search_start_STRING_FLD(safe_state, test, start_pos, + &new_pos, 
&is_partial); + if (start_pos < 0) + return RE_ERROR_FAILURE; + + if (is_partial) { + new_position->text_pos = start_pos; + return RE_ERROR_PARTIAL; + } + + /* Can we look further ahead? */ + if (test == node) { + if (test->next_1.node) { + int status; + + status = try_match(state, &test->next_1, new_pos, + new_position); + if (status < 0) + return status; + + if (status == RE_ERROR_FAILURE) { + ++start_pos; + + if (start_pos >= state->slice_end) { + if (state->partial_side == RE_PARTIAL_RIGHT) { + new_position->text_pos = state->slice_start; + return RE_ERROR_PARTIAL; + } + + return RE_ERROR_FAILURE; + } + + goto again; + } + } + + /* It's a possible match. */ + state->match_pos = start_pos; + + if (info) { + info->start_pos = state->text_pos; + info->match_pos = state->match_pos; + } + + return RE_ERROR_SUCCESS; + } + break; + } + case RE_OP_STRING_FLD_REV: + { + Py_ssize_t new_pos; + BOOL is_partial; + + start_pos = search_start_STRING_FLD_REV(safe_state, test, start_pos, + &new_pos, &is_partial); + if (start_pos < 0) + return RE_ERROR_FAILURE; + + if (is_partial) { + new_position->text_pos = start_pos; + return RE_ERROR_PARTIAL; + } + + /* Can we look further ahead? */ + if (test == node) { + if (test->next_1.node) { + int status; + + status = try_match(state, &test->next_1, new_pos, + new_position); + if (status < 0) + return status; + + if (status == RE_ERROR_FAILURE) { + --start_pos; + + if (start_pos <= state->slice_start) { + if (state->partial_side == RE_PARTIAL_LEFT) { + new_position->text_pos = state->slice_start; + return RE_ERROR_PARTIAL; + } + + return RE_ERROR_FAILURE; + } + + goto again; + } + } + + /* It's a possible match. 
*/ + state->match_pos = start_pos; + + if (info) { + info->start_pos = state->text_pos; + info->match_pos = state->match_pos; + } + + return RE_ERROR_SUCCESS; + } + break; + } + case RE_OP_STRING_IGN: + { + BOOL is_partial; + + start_pos = search_start_STRING_IGN(safe_state, test, start_pos, + &is_partial); + if (start_pos < 0) + return RE_ERROR_FAILURE; + + if (is_partial) { + new_position->text_pos = start_pos; + return RE_ERROR_PARTIAL; + } + break; + } + case RE_OP_STRING_IGN_REV: + { + BOOL is_partial; + + start_pos = search_start_STRING_IGN_REV(safe_state, test, start_pos, + &is_partial); + if (start_pos < 0) + return RE_ERROR_FAILURE; + + if (is_partial) { + new_position->text_pos = start_pos; + return RE_ERROR_PARTIAL; + } + break; + } + case RE_OP_STRING_REV: + { + BOOL is_partial; + + start_pos = search_start_STRING_REV(safe_state, test, start_pos, + &is_partial); + if (start_pos < 0) + return RE_ERROR_FAILURE; + + if (is_partial) { + new_position->text_pos = start_pos; + return RE_ERROR_PARTIAL; + } + break; + } + default: + /* Don't call 'search_start' again. */ + state->pattern->do_search_start = FALSE; + + state->match_pos = start_pos; + new_position->node = node; + new_position->text_pos = start_pos; + return RE_ERROR_SUCCESS; + } + + /* Can we look further ahead? 
*/ + if (test == node) { + text_pos = start_pos + test->step; + + if (test->next_1.node) { + int status; + + status = try_match(state, &test->next_1, text_pos, new_position); + if (status < 0) + return status; + + if (status == RE_ERROR_FAILURE) { + if (state->reverse) { + --start_pos; + + if (start_pos < state->slice_start) { + if (state->partial_side == RE_PARTIAL_LEFT) { + new_position->text_pos = state->slice_start; + return RE_ERROR_PARTIAL; + } + + return RE_ERROR_FAILURE; + } + } else { + ++start_pos; + + if (start_pos > state->slice_end) { + if (state->partial_side == RE_PARTIAL_RIGHT) { + new_position->text_pos = state->slice_end; + return RE_ERROR_PARTIAL; + } + + return RE_ERROR_FAILURE; + } + } + + goto again; + } + } + } else { + new_position->node = node; + new_position->text_pos = start_pos; + } + + /* It's a possible match. */ + state->match_pos = start_pos; + + if (info) { + info->start_pos = state->text_pos; + info->match_pos = state->match_pos; + } + + return RE_ERROR_SUCCESS; +} + +/* Saves a capture group. */ +Py_LOCAL_INLINE(BOOL) save_capture(RE_SafeState* safe_state, size_t + private_index, size_t public_index) { + RE_State* state; + RE_GroupData* private_group; + RE_GroupData* public_group; + + state = safe_state->re_state; + + /* Capture group indexes are 1-based (excluding group 0, which is the + * entire matched string). + */ + private_group = &state->groups[private_index - 1]; + public_group = &state->groups[public_index - 1]; + + /* Will the repeated captures ever be visible? 
*/ + if (!state->visible_captures) { + public_group->captures[0] = private_group->span; + public_group->capture_count = 1; + + return TRUE; + } + + if (public_group->capture_count >= public_group->capture_capacity) { + size_t new_capacity; + RE_GroupSpan* new_captures; + + new_capacity = public_group->capture_capacity * 2; + new_capacity = max_size_t(new_capacity, RE_INIT_CAPTURE_SIZE); + new_captures = (RE_GroupSpan*)safe_realloc(safe_state, + public_group->captures, new_capacity * sizeof(RE_GroupSpan)); + if (!new_captures) + return FALSE; + + public_group->captures = new_captures; + public_group->capture_capacity = new_capacity; + } + + public_group->captures[public_group->capture_count++] = + private_group->span; + + return TRUE; +} + +/* Unsaves a capture group. */ +Py_LOCAL_INLINE(void) unsave_capture(RE_State* state, size_t private_index, + size_t public_index) { + /* Capture group indexes are 1-based (excluding group 0, which is the + * entire matched string). + */ + if (state->groups[public_index - 1].capture_count > 0) + --state->groups[public_index - 1].capture_count; +} + +/* Pushes the groups for backtracking. 
*/ +Py_LOCAL_INLINE(BOOL) push_groups(RE_SafeState* safe_state) { + RE_State* state; + size_t group_count; + RE_SavedGroups* current; + size_t g; + + state = safe_state->re_state; + + group_count = state->pattern->true_group_count; + if (group_count == 0) + return TRUE; + + current = state->current_saved_groups; + + if (current && current->next) + current = current->next; + else if (!current && state->first_saved_groups) + current = state->first_saved_groups; + else { + RE_SavedGroups* new_block; + + new_block = (RE_SavedGroups*)safe_alloc(safe_state, + sizeof(RE_SavedGroups)); + if (!new_block) + return FALSE; + + new_block->spans = (RE_GroupSpan*)safe_alloc(safe_state, group_count * + sizeof(RE_GroupSpan)); + new_block->counts = (size_t*)safe_alloc(safe_state, group_count * + sizeof(Py_ssize_t)); + if (!new_block->spans || !new_block->counts) { + safe_dealloc(safe_state, new_block->spans); + safe_dealloc(safe_state, new_block->counts); + safe_dealloc(safe_state, new_block); + return FALSE; + } + + new_block->previous = current; + new_block->next = NULL; + + if (new_block->previous) + new_block->previous->next = new_block; + else + state->first_saved_groups = new_block; + + current = new_block; + } + + for (g = 0; g < group_count; g++) { + current->spans[g] = state->groups[g].span; + current->counts[g] = state->groups[g].capture_count; + } + + state->current_saved_groups = current; + + return TRUE; +} + +/* Pops the groups for backtracking. */ +Py_LOCAL_INLINE(void) pop_groups(RE_State* state) { + size_t group_count; + RE_SavedGroups* current; + size_t g; + + group_count = state->pattern->true_group_count; + if (group_count == 0) + return; + + current = state->current_saved_groups; + + for (g = 0; g < group_count; g++) { + state->groups[g].span = current->spans[g]; + state->groups[g].capture_count = current->counts[g]; + } + + state->current_saved_groups = current->previous; +} + +/* Drops the groups for backtracking. 
*/ +Py_LOCAL_INLINE(void) drop_groups(RE_State* state) { + if (state->pattern->true_group_count != 0) + state->current_saved_groups = state->current_saved_groups->previous; +} + +/* Pushes the repeats for backtracking. */ +Py_LOCAL_INLINE(BOOL) push_repeats(RE_SafeState* safe_state) { + RE_State* state; + PatternObject* pattern; + size_t repeat_count; + RE_SavedRepeats* current; + size_t r; + + state = safe_state->re_state; + pattern = state->pattern; + + repeat_count = pattern->repeat_count; + if (repeat_count == 0) + return TRUE; + + current = state->current_saved_repeats; + + if (current && current->next) + current = current->next; + else if (!current && state->first_saved_repeats) + current = state->first_saved_repeats; + else { + RE_SavedRepeats* new_block; + + new_block = (RE_SavedRepeats*)safe_alloc(safe_state, + sizeof(RE_SavedRepeats)); + if (!new_block) + return FALSE; + + new_block->repeats = (RE_RepeatData*)safe_alloc(safe_state, + repeat_count * sizeof(RE_RepeatData)); + if (!new_block->repeats) { + safe_dealloc(safe_state, new_block); + return FALSE; + } + + memset(new_block->repeats, 0, repeat_count * sizeof(RE_RepeatData)); + + new_block->previous = current; + new_block->next = NULL; + + if (new_block->previous) + new_block->previous->next = new_block; + else + state->first_saved_repeats = new_block; + + current = new_block; + } + + for (r = 0; r < repeat_count; r++) { + if (!copy_repeat_data(safe_state, ¤t->repeats[r], + &state->repeats[r])) + return FALSE; + } + + state->current_saved_repeats = current; + + return TRUE; +} + +/* Pops the repeats for backtracking. 
*/ +Py_LOCAL_INLINE(void) pop_repeats(RE_State* state) { + PatternObject* pattern; + size_t repeat_count; + RE_SavedRepeats* current; + size_t r; + + pattern = state->pattern; + + repeat_count = pattern->repeat_count; + if (repeat_count == 0) + return; + + current = state->current_saved_repeats; + + for (r = 0; r < repeat_count; r++) + copy_repeat_data(NULL, &state->repeats[r], ¤t->repeats[r]); + + state->current_saved_repeats = current->previous; +} + +/* Drops the repeats for backtracking. */ +Py_LOCAL_INLINE(void) drop_repeats(RE_State* state) { + PatternObject* pattern; + size_t repeat_count; + RE_SavedRepeats* current; + + pattern = state->pattern; + + repeat_count = pattern->repeat_count; + if (repeat_count == 0) + return; + + current = state->current_saved_repeats; + state->current_saved_repeats = current->previous; +} + +/* Inserts a new span in a guard list. */ +Py_LOCAL_INLINE(BOOL) insert_guard_span(RE_SafeState* safe_state, RE_GuardList* + guard_list, size_t index) { + size_t n; + + if (guard_list->count >= guard_list->capacity) { + size_t new_capacity; + RE_GuardSpan* new_spans; + + new_capacity = guard_list->capacity * 2; + if (new_capacity == 0) + new_capacity = RE_INIT_GUARDS_BLOCK_SIZE; + new_spans = (RE_GuardSpan*)safe_realloc(safe_state, guard_list->spans, + new_capacity * sizeof(RE_GuardSpan)); + if (!new_spans) + return FALSE; + + guard_list->capacity = new_capacity; + guard_list->spans = new_spans; + } + + n = guard_list->count - index; + if (n > 0) + memmove(guard_list->spans + index + 1, guard_list->spans + index, n * + sizeof(RE_GuardSpan)); + ++guard_list->count; + + return TRUE; +} + +/* Deletes a span in a guard list. 
*/ +Py_LOCAL_INLINE(void) delete_guard_span(RE_GuardList* guard_list, size_t index) + { + size_t n; + + n = guard_list->count - index - 1; + if (n > 0) + memmove(guard_list->spans + index, guard_list->spans + index + 1, n * + sizeof(RE_GuardSpan)); + --guard_list->count; +} + +/* Checks whether a position is guarded against further matching. */ +Py_LOCAL_INLINE(BOOL) is_guarded(RE_GuardList* guard_list, Py_ssize_t text_pos) + { + size_t low; + size_t high; + + /* Is this position in the guard list? */ + if (guard_list->count == 0 || text_pos < guard_list->spans[0].low) + guard_list->last_low = 0; + else if (text_pos > guard_list->spans[guard_list->count - 1].high) + guard_list->last_low = guard_list->count; + else { + low = 0; + high = guard_list->count; + while (low < high) { + size_t mid; + RE_GuardSpan* span; + + mid = (low + high) / 2; + span = &guard_list->spans[mid]; + if (text_pos < span->low) + high = mid; + else if (text_pos > span->high) + low = mid + 1; + else + return span->protect; + } + + guard_list->last_low = low; + } + + guard_list->last_text_pos = text_pos; + + return FALSE; +} + +/* Guards a position against further matching. */ +Py_LOCAL_INLINE(BOOL) guard(RE_SafeState* safe_state, RE_GuardList* guard_list, + Py_ssize_t text_pos, BOOL protect) { + size_t low; + size_t high; + + /* Where should be new position be added? */ + if (text_pos == guard_list->last_text_pos) + low = guard_list->last_low; + else { + low = 0; + high = guard_list->count; + while (low < high) { + size_t mid; + RE_GuardSpan* span; + + mid = (low + high) / 2; + span = &guard_list->spans[mid]; + if (text_pos < span->low) + high = mid; + else if (text_pos > span->high) + low = mid + 1; + else + return TRUE; + } + } + + /* Add the position to the guard list. */ + if (low > 0 && guard_list->spans[low - 1].high + 1 == text_pos && + guard_list->spans[low - 1].protect == protect) { + /* The new position is just above this span. 
*/ + if (low < guard_list->count && guard_list->spans[low].low - 1 == + text_pos && guard_list->spans[low].protect == protect) { + /* The new position joins 2 spans */ + guard_list->spans[low - 1].high = guard_list->spans[low].high; + delete_guard_span(guard_list, low); + } else + /* Extend the span. */ + guard_list->spans[low - 1].high = text_pos; + } else if (low < guard_list->count && guard_list->spans[low].low - 1 == + text_pos && guard_list->spans[low].protect == protect) + /* The new position is just below this span. */ + /* Extend the span. */ + guard_list->spans[low].low = text_pos; + else { + /* Insert a new span. */ + if (!insert_guard_span(safe_state, guard_list, low)) + return FALSE; + guard_list->spans[low].low = text_pos; + guard_list->spans[low].high = text_pos; + guard_list->spans[low].protect = protect; + } + + guard_list->last_text_pos = -1; + + return TRUE; +} + +/* Guards a position against further matching for a repeat. */ +Py_LOCAL_INLINE(BOOL) guard_repeat(RE_SafeState* safe_state, size_t index, + Py_ssize_t text_pos, RE_STATUS_T guard_type, BOOL protect) { + RE_State* state; + RE_GuardList* guard_list; + + state = safe_state->re_state; + + /* Is a guard active here? */ + if (!(state->pattern->repeat_info[index].status & guard_type)) + return TRUE; + + /* Which guard list? */ + if (guard_type & RE_STATUS_BODY) + guard_list = &state->repeats[index].body_guard_list; + else + guard_list = &state->repeats[index].tail_guard_list; + + return guard(safe_state, guard_list, text_pos, protect); +} + +/* Guards a range of positions against further matching for a repeat. */ +Py_LOCAL_INLINE(BOOL) guard_repeat_range(RE_SafeState* safe_state, size_t + index, Py_ssize_t lo_pos, Py_ssize_t hi_pos, RE_STATUS_T guard_type, BOOL + protect) { + RE_State* state; + RE_GuardList* guard_list; + Py_ssize_t pos; + + state = safe_state->re_state; + + /* Is a guard active here? 
*/ + if (!(state->pattern->repeat_info[index].status & guard_type)) + return TRUE; + + /* Which guard list? */ + if (guard_type & RE_STATUS_BODY) + guard_list = &state->repeats[index].body_guard_list; + else + guard_list = &state->repeats[index].tail_guard_list; + + for (pos = lo_pos; pos <= hi_pos; pos++) { + if (!guard(safe_state, guard_list, pos, protect)) + return FALSE; + } + + return TRUE; +} + +/* Checks whether a position is guarded against further matching for a repeat. + */ +Py_LOCAL_INLINE(BOOL) is_repeat_guarded(RE_SafeState* safe_state, size_t index, + Py_ssize_t text_pos, RE_STATUS_T guard_type) { + RE_State* state; + RE_GuardList* guard_list; + + state = safe_state->re_state; + + /* Is a guard active here? */ + if (!(state->pattern->repeat_info[index].status & guard_type)) + return FALSE; + + /* Which guard list? */ + if (guard_type == RE_STATUS_BODY) + guard_list = &state->repeats[index].body_guard_list; + else + guard_list = &state->repeats[index].tail_guard_list; + + return is_guarded(guard_list, text_pos); +} + +/* Builds a Unicode string. */ +Py_LOCAL_INLINE(PyObject*) build_unicode_value(void* buffer, Py_ssize_t len, + Py_ssize_t buffer_charsize) { + return PyUnicode_FromUnicode(buffer, len); +} + +/* Builds a bytestring. Returns NULL if any member is too wide. */ +Py_LOCAL_INLINE(PyObject*) build_bytes_value(void* buffer, Py_ssize_t len, + Py_ssize_t buffer_charsize) { + Py_UCS1* byte_buffer; + Py_ssize_t i; + PyObject* result; + + if (buffer_charsize == 1) + return Py_BuildValue("s#", buffer, len); + + byte_buffer = re_alloc((size_t)len); + if (!byte_buffer) + return NULL; + + for (i = 0; i < len; i++) { + Py_UCS2 c = ((Py_UCS2*)buffer)[i]; + if (c > 0xFF) + goto too_wide; + + byte_buffer[i] = (Py_UCS1)c; + } + + result = Py_BuildValue("s#", byte_buffer, len); + + re_dealloc(byte_buffer); + + return result; + +too_wide: + re_dealloc(byte_buffer); + + return NULL; +} + +/* Looks for a string in a string set. 
*/ +Py_LOCAL_INLINE(int) string_set_contains(RE_State* state, PyObject* string_set, + Py_ssize_t first, Py_ssize_t last) { + PyObject* string; + int status; + + if (state->is_unicode) + string = build_unicode_value(state->point_to(state->text, first), last + - first, state->charsize); + else + string = build_bytes_value(state->point_to(state->text, first), last - + first, state->charsize); + if (!string) + return RE_ERROR_INTERNAL; + + status = PySet_Contains(string_set, string); + Py_DECREF(string); + + return status; +} + +/* Looks for a string in a string set, ignoring case. */ +Py_LOCAL_INLINE(int) string_set_contains_ign(RE_State* state, PyObject* + string_set, void* buffer, Py_ssize_t index, Py_ssize_t len, Py_ssize_t + buffer_charsize) { + Py_UCS4 (*char_at)(void* text, Py_ssize_t pos); + void (*set_char_at)(void* text, Py_ssize_t pos, Py_UCS4 ch); + RE_EncodingTable* encoding; + RE_LocaleInfo* locale_info; + BOOL (*possible_turkic)(RE_LocaleInfo* locale_info, Py_UCS4 ch); + Py_UCS4 codepoints[4]; + + switch (buffer_charsize) { + case 1: + char_at = bytes1_char_at; + set_char_at = bytes1_set_char_at; + break; + case 2: + char_at = bytes2_char_at; + set_char_at = bytes2_set_char_at; + break; + case 4: + char_at = bytes4_char_at; + set_char_at = bytes4_set_char_at; + break; + default: + char_at = bytes1_char_at; + set_char_at = bytes1_set_char_at; + break; + } + + encoding = state->encoding; + locale_info = state->locale_info; + possible_turkic = encoding->possible_turkic; + + /* Look for a possible Turkic 'I'. */ + while (index < len && !possible_turkic(locale_info, char_at(buffer, + index))) + ++index; + + if (index < len) { + /* Possible Turkic 'I'. */ + int count; + int i; + + /* Try all the alternatives to the 'I'. */ + count = encoding->all_turkic_i(locale_info, char_at(buffer, index), + codepoints); + + for (i = 0; i < count; i++) { + int status; + + set_char_at(buffer, index, codepoints[i]); + + /* Recurse for the remainder of the string. 
*/ + status = string_set_contains_ign(state, string_set, buffer, index + + 1, len, buffer_charsize); + if (status != 0) + return status; + } + + return 0; + } else { + /* No Turkic 'I'. */ + PyObject* string; + int status; + + if (state->is_unicode) + string = build_unicode_value(buffer, len, buffer_charsize); + else + string = build_bytes_value(buffer, len, buffer_charsize); + if (!string) + return RE_ERROR_MEMORY; + + status = PySet_Contains(string_set, string); + Py_DECREF(string); + + return status; + } +} + +/* Creates a partial string set for truncation at the left or right side. */ +Py_LOCAL_INLINE(int) make_partial_string_set(RE_State* state, RE_Node* node) { + PatternObject* pattern; + int partial_side; + PyObject* string_set; + PyObject* partial_set; + PyObject* iter = NULL; + PyObject* item = NULL; + PyObject* slice = NULL; + + pattern = state->pattern; + partial_side = state->partial_side; + if (partial_side != RE_PARTIAL_LEFT && partial_side != RE_PARTIAL_RIGHT) + return RE_ERROR_INTERNAL; + + /* Fetch the full string set. PyList_GET_ITEM borrows a reference. */ + string_set = PyList_GET_ITEM(pattern->named_list_indexes, node->values[0]); + if (!string_set) + return RE_ERROR_INTERNAL; + + /* Gets the list of partial string sets. */ + if (!pattern->partial_named_lists[partial_side]) { + size_t size; + + size = pattern->named_lists_count * sizeof(PyObject*); + pattern->partial_named_lists[partial_side] = re_alloc(size); + if (!pattern->partial_named_lists[partial_side]) + return RE_ERROR_INTERNAL; + + memset(pattern->partial_named_lists[partial_side], 0, size); + } + + /* Get the partial string set. */ + partial_set = pattern->partial_named_lists[partial_side][node->values[0]]; + if (partial_set) + return 1; + + /* Build the partial string set. 
*/ + partial_set = PySet_New(NULL); + if (!partial_set) + return RE_ERROR_INTERNAL; + + iter = PyObject_GetIter(string_set); + if (!iter) + goto error; + + item = PyIter_Next(iter); + + while (item) { + Py_ssize_t len; + Py_ssize_t first; + Py_ssize_t last; + + len = PySequence_Length(item); + if (len == -1) + goto error; + + first = 0; + last = len; + + while (last - first > 1) { + int status; + + /* Shorten the entry. */ + if (partial_side == RE_PARTIAL_LEFT) + ++first; + else + --last; + + slice = PySequence_GetSlice(item, first, last); + if (!slice) + goto error; + + status = PySet_Add(partial_set, slice); + Py_DECREF(slice); + if (status < 0) + goto error; + } + + Py_DECREF(item); + item = PyIter_Next(iter); + } + + if (PyErr_Occurred()) + goto error; + + Py_DECREF(iter); + + pattern->partial_named_lists[partial_side][node->values[0]] = partial_set; + + return 1; + +error: + Py_XDECREF(item); + Py_XDECREF(iter); + Py_DECREF(partial_set); + + return RE_ERROR_INTERNAL; +} + +/* Tries to match a string at the current position with a member of a string + * set, forwards or backwards. + */ +Py_LOCAL_INLINE(int) string_set_match_fwdrev(RE_SafeState* safe_state, RE_Node* + node, BOOL reverse) { + RE_State* state; + Py_ssize_t min_len; + Py_ssize_t max_len; + Py_ssize_t text_available; + Py_ssize_t slice_available; + int partial_side; + Py_ssize_t len; + Py_ssize_t first; + Py_ssize_t last; + int status; + PyObject* string_set; + + state = safe_state->re_state; + + min_len = (Py_ssize_t)node->values[1]; + max_len = (Py_ssize_t)node->values[2]; + + acquire_GIL(safe_state); + + if (reverse) { + text_available = state->text_pos; + slice_available = state->text_pos - state->slice_start; + partial_side = RE_PARTIAL_LEFT; + } else { + text_available = state->text_length - state->text_pos; + slice_available = state->slice_end - state->text_pos; + partial_side = RE_PARTIAL_RIGHT; + } + + /* Get as many characters as we need for the longest possible match. 
*/ + len = min_ssize_t(max_len, slice_available); + + if (reverse) { + first = state->text_pos - len; + last = state->text_pos; + } else { + first = state->text_pos; + last = state->text_pos + len; + } + + /* If we didn't get all of the characters we need, is a partial match + * allowed? + */ + if (len < max_len && len == text_available && state->partial_side == + partial_side) { + if (len == 0) { + /* An empty string is always a possible partial match. */ + status = RE_ERROR_PARTIAL; + goto finished; + } + + /* Make a set of the possible partial matches. */ + status = make_partial_string_set(state, node); + if (status < 0) + goto finished; + + /* Fetch the partial string set. */ + string_set = + state->pattern->partial_named_lists[partial_side][node->values[0]]; + + /* Is the text we have a partial match? */ + status = string_set_contains(state, string_set, first, last); + if (status < 0) + goto finished; + + if (status == 1) { + /* Advance past the match. */ + if (reverse) + state->text_pos -= len; + else + state->text_pos += len; + + status = RE_ERROR_PARTIAL; + goto finished; + } + } + + /* Fetch the string set. PyList_GET_ITEM borrows a reference. */ + string_set = PyList_GET_ITEM(state->pattern->named_list_indexes, + node->values[0]); + if (!string_set) { + status = RE_ERROR_INTERNAL; + goto finished; + } + + /* We've already looked for a partial match (if allowed), but what about a + * complete match? + */ + while (len >= min_len) { + status = string_set_contains(state, string_set, first, last); + + if (status == 1) { + /* Advance past the match. */ + if (reverse) + state->text_pos -= len; + else + state->text_pos += len; + + status = 1; + goto finished; + } + + /* Look for a shorter match. */ + --len; + if (reverse) + ++first; + else + --last; + } + + /* No match. 
*/ + status = 0; + +finished: + release_GIL(safe_state); + + return status; +} + +/* Tries to match a string at the current position with a member of a string + * set, ignoring case, forwards or backwards. + */ +Py_LOCAL_INLINE(int) string_set_match_fld_fwdrev(RE_SafeState* safe_state, + RE_Node* node, BOOL reverse) { + RE_State* state; + int (*full_case_fold)(RE_LocaleInfo* locale_info, Py_UCS4 ch, Py_UCS4* + folded); + Py_UCS4 (*char_at)(void* text, Py_ssize_t pos); + Py_ssize_t folded_charsize; + void (*set_char_at)(void* text, Py_ssize_t pos, Py_UCS4 ch); + Py_ssize_t min_len; + Py_ssize_t max_len; + Py_ssize_t buf_len; + void* folded; + int status; + BOOL* end_of_fold = NULL; + Py_ssize_t text_available; + Py_ssize_t slice_available; + Py_ssize_t t_pos; + Py_ssize_t f_pos; + int step; + int partial_side; + Py_ssize_t len; + Py_ssize_t consumed; + Py_UCS4 codepoints[RE_MAX_FOLDED]; + Py_ssize_t first; + Py_ssize_t last; + PyObject* string_set; + + state = safe_state->re_state; + full_case_fold = state->encoding->full_case_fold; + char_at = state->char_at; + + /* The folded string will have the same width as the original string. */ + folded_charsize = state->charsize; + + switch (folded_charsize) { + case 1: + set_char_at = bytes1_set_char_at; + break; + case 2: + set_char_at = bytes2_set_char_at; + break; + case 4: + set_char_at = bytes4_set_char_at; + break; + default: + return RE_ERROR_INTERNAL; + } + + min_len = (Py_ssize_t)node->values[1]; + max_len = (Py_ssize_t)node->values[2]; + + acquire_GIL(safe_state); + + /* Allocate a buffer for the folded string. 
*/ + buf_len = max_len + RE_MAX_FOLDED; + folded = re_alloc((size_t)(buf_len * folded_charsize)); + if (!folded) { + status = RE_ERROR_MEMORY; + goto finished; + } + + end_of_fold = re_alloc((size_t)buf_len * sizeof(BOOL)); + if (!end_of_fold) { + status = RE_ERROR_MEMORY; + goto finished; + } + + memset(end_of_fold, 0, (size_t)buf_len * sizeof(BOOL)); + + if (reverse) { + text_available = state->text_pos; + slice_available = state->text_pos - state->slice_start; + t_pos = state->text_pos - 1; + f_pos = buf_len; + step = -1; + partial_side = RE_PARTIAL_LEFT; + } else { + text_available = state->text_length - state->text_pos; + slice_available = state->slice_end - state->text_pos; + t_pos = state->text_pos; + f_pos = 0; + step = 1; + partial_side = RE_PARTIAL_RIGHT; + } + + /* We can stop getting characters as soon as the case-folded string is long + * enough (each codepoint from the text can expand to more than one folded + * codepoint). + */ + len = 0; + end_of_fold[len] = TRUE; + + consumed = 0; + while (len < max_len && consumed < slice_available) { + int count; + int j; + + count = full_case_fold(state->locale_info, char_at(state->text, t_pos), + codepoints); + + if (reverse) + f_pos -= count; + + for (j = 0; j < count; j++) + set_char_at(folded, f_pos + j, codepoints[j]); + + if (!reverse) + f_pos += count; + + len += count; + end_of_fold[len] = TRUE; + ++consumed; + t_pos += step; + } + + if (reverse) { + first = f_pos; + last = buf_len; + } else { + first = 0; + last = f_pos; + } + + /* If we didn't get all of the characters we need, is a partial match + * allowed? + */ + if (len < max_len && len == text_available && state->partial_side == + partial_side) { + if (len == 0) { + /* An empty string is always a possible partial match. */ + status = RE_ERROR_PARTIAL; + goto finished; + } + + /* Make a set of the possible partial matches. */ + status = make_partial_string_set(state, node); + if (status < 0) + goto finished; + + /* Fetch the partial string set. 
*/ + string_set = + state->pattern->partial_named_lists[partial_side][node->values[0]]; + + /* Is the text we have a partial match? */ + status = string_set_contains_ign(state, string_set, folded, first, + last, folded_charsize); + if (status < 0) + goto finished; + + if (status == 1) { + /* Advance past the match. */ + if (reverse) + state->text_pos -= consumed; + else + state->text_pos += consumed; + + status = RE_ERROR_PARTIAL; + goto finished; + } + } + + /* Fetch the string set. PyList_GET_ITEM borrows a reference. */ + string_set = PyList_GET_ITEM(state->pattern->named_list_indexes, + node->values[0]); + if (!string_set) { + status = RE_ERROR_INTERNAL; + goto finished; + } + + /* We've already looked for a partial match (if allowed), but what about a + * complete match? + */ + while (len >= min_len) { + if (end_of_fold[len]) { + status = string_set_contains_ign(state, string_set, folded, first, + last, folded_charsize); + + if (status == 1) { + /* Advance past the match. */ + if (reverse) + state->text_pos -= consumed; + else + state->text_pos += consumed; + + status = 1; + goto finished; + } + + --consumed; + } + + /* Look for a shorter match. */ + --len; + if (reverse) + ++first; + else + --last; + } + + /* No match. */ + status = 0; + +finished: + re_dealloc(end_of_fold); + re_dealloc(folded); + + release_GIL(safe_state); + + return status; +} + +/* Tries to match a string at the current position with a member of a string + * set, ignoring case, forwards or backwards. 
+ */ +Py_LOCAL_INLINE(int) string_set_match_ign_fwdrev(RE_SafeState* safe_state, + RE_Node* node, BOOL reverse) { + RE_State* state; + Py_UCS4 (*simple_case_fold)(RE_LocaleInfo* locale_info, Py_UCS4 ch); + Py_UCS4 (*char_at)(void* text, Py_ssize_t pos); + Py_ssize_t folded_charsize; + void (*set_char_at)(void* text, Py_ssize_t pos, Py_UCS4 ch); + Py_ssize_t min_len; + Py_ssize_t max_len; + void* folded; + int status; + Py_ssize_t text_available; + Py_ssize_t slice_available; + Py_ssize_t t_pos; + Py_ssize_t f_pos; + int step; + int partial_side; + Py_ssize_t len; + Py_ssize_t i; + Py_ssize_t first; + Py_ssize_t last; + PyObject* string_set; + + state = safe_state->re_state; + simple_case_fold = state->encoding->simple_case_fold; + char_at = state->char_at; + + /* The folded string will have the same width as the original string. */ + folded_charsize = state->charsize; + + switch (folded_charsize) { + case 1: + set_char_at = bytes1_set_char_at; + break; + case 2: + set_char_at = bytes2_set_char_at; + break; + case 4: + set_char_at = bytes4_set_char_at; + break; + default: + return RE_ERROR_INTERNAL; + } + + min_len = (Py_ssize_t)node->values[1]; + max_len = (Py_ssize_t)node->values[2]; + + acquire_GIL(safe_state); + + /* Allocate a buffer for the folded string. */ + folded = re_alloc((size_t)(max_len * folded_charsize)); + if (!folded) { + status = RE_ERROR_MEMORY; + goto finished; + } + + if (reverse) { + text_available = state->text_pos; + slice_available = state->text_pos - state->slice_start; + t_pos = state->text_pos - 1; + f_pos = max_len - 1; + step = -1; + partial_side = RE_PARTIAL_LEFT; + } else { + text_available = state->text_length - state->text_pos; + slice_available = state->slice_end - state->text_pos; + t_pos = state->text_pos; + f_pos = 0; + step = 1; + partial_side = RE_PARTIAL_RIGHT; + } + + /* Get as many characters as we need for the longest possible match. 
*/ + len = min_ssize_t(max_len, slice_available); + + for (i = 0; i < len; i ++) { + Py_UCS4 ch; + + ch = simple_case_fold(state->locale_info, char_at(state->text, t_pos)); + set_char_at(folded, f_pos, ch); + t_pos += step; + f_pos += step; + } + + if (reverse) { + first = f_pos; + last = max_len; + } else { + first = 0; + last = f_pos; + } + + /* If we didn't get all of the characters we need, is a partial match + * allowed? + */ + if (len < max_len && len == text_available && state->partial_side == + partial_side) { + if (len == 0) { + /* An empty string is always a possible partial match. */ + status = RE_ERROR_PARTIAL; + goto finished; + } + + /* Make a set of the possible partial matches. */ + status = make_partial_string_set(state, node); + if (status < 0) + goto finished; + + /* Fetch the partial string set. */ + string_set = + state->pattern->partial_named_lists[partial_side][node->values[0]]; + + /* Is the text we have a partial match? */ + status = string_set_contains_ign(state, string_set, folded, first, + last, folded_charsize); + if (status < 0) + goto finished; + + if (status == 1) { + /* Advance past the match. */ + if (reverse) + state->text_pos -= len; + else + state->text_pos += len; + + status = RE_ERROR_PARTIAL; + goto finished; + } + } + + /* Fetch the string set. PyList_GET_ITEM borrows a reference. */ + string_set = PyList_GET_ITEM(state->pattern->named_list_indexes, + node->values[0]); + if (!string_set) { + status = RE_ERROR_INTERNAL; + goto finished; + } + + /* We've already looked for a partial match (if allowed), but what about a + * complete match? + */ + while (len >= min_len) { + status = string_set_contains_ign(state, string_set, folded, first, + last, folded_charsize); + + if (status == 1) { + /* Advance past the match. */ + if (reverse) + state->text_pos -= len; + else + state->text_pos += len; + + status = 1; + goto finished; + } + + /* Look for a shorter match. 
*/ + --len; + if (reverse) + ++first; + else + --last; + } + + /* No match. */ + status = 0; + +finished: + re_dealloc(folded); + + release_GIL(safe_state); + + return status; +} + +/* Checks whether any additional fuzzy error is permitted. */ +Py_LOCAL_INLINE(BOOL) any_error_permitted(RE_State* state) { + RE_FuzzyInfo* fuzzy_info; + RE_CODE* values; + + fuzzy_info = &state->fuzzy_info; + values = fuzzy_info->node->values; + + return fuzzy_info->total_cost <= values[RE_FUZZY_VAL_MAX_COST] && + fuzzy_info->counts[RE_FUZZY_ERR] < values[RE_FUZZY_VAL_MAX_ERR] && + state->total_errors <= state->max_errors; +} + +/* Checks whether this additional fuzzy error is permitted. */ +Py_LOCAL_INLINE(BOOL) this_error_permitted(RE_State* state, int fuzzy_type) { + RE_FuzzyInfo* fuzzy_info; + RE_CODE* values; + + fuzzy_info = &state->fuzzy_info; + values = fuzzy_info->node->values; + + return fuzzy_info->total_cost + values[RE_FUZZY_VAL_COST_BASE + fuzzy_type] + <= values[RE_FUZZY_VAL_MAX_COST] && fuzzy_info->counts[fuzzy_type] < + values[RE_FUZZY_VAL_MAX_BASE + fuzzy_type] && state->total_errors + 1 <= + state->max_errors; +} + +/* Checks whether we've reached the end of the text during a fuzzy partial + * match. + */ +Py_LOCAL_INLINE(int) check_fuzzy_partial(RE_State* state, Py_ssize_t text_pos) + { + switch (state->partial_side) { + case RE_PARTIAL_LEFT: + if (text_pos < 0) + return RE_ERROR_PARTIAL; + break; + case RE_PARTIAL_RIGHT: + if (text_pos > state->text_length) + return RE_ERROR_PARTIAL; + break; + } + + return RE_ERROR_FAILURE; +} + +/* Checks a fuzzy match of an item. */ +Py_LOCAL_INLINE(int) next_fuzzy_match_item(RE_State* state, RE_FuzzyData* data, + BOOL is_string, int step) { + Py_ssize_t new_pos; + + if (this_error_permitted(state, data->fuzzy_type)) { + switch (data->fuzzy_type) { + case RE_FUZZY_DEL: + /* Could a character at text_pos have been deleted? 
*/ + if (is_string) + data->new_string_pos += step; + else + data->new_node = data->new_node->next_1.node; + return RE_ERROR_SUCCESS; + case RE_FUZZY_INS: + /* Could the character at text_pos have been inserted? */ + if (!data->permit_insertion) + return RE_ERROR_FAILURE; + + new_pos = data->new_text_pos + step; + if (state->slice_start <= new_pos && new_pos <= state->slice_end) { + data->new_text_pos = new_pos; + return RE_ERROR_SUCCESS; + } + + return check_fuzzy_partial(state, new_pos); + case RE_FUZZY_SUB: + /* Could the character at text_pos have been substituted? */ + new_pos = data->new_text_pos + step; + if (state->slice_start <= new_pos && new_pos <= state->slice_end) { + data->new_text_pos = new_pos; + if (is_string) + data->new_string_pos += step; + else + data->new_node = data->new_node->next_1.node; + return RE_ERROR_SUCCESS; + } + + return check_fuzzy_partial(state, new_pos); + } + } + + return RE_ERROR_FAILURE; +} + +/* Tries a fuzzy match of an item of width 0 or 1. */ +Py_LOCAL_INLINE(int) fuzzy_match_item(RE_SafeState* safe_state, BOOL search, + Py_ssize_t* text_pos, RE_Node** node, int step) { + RE_State* state; + RE_FuzzyData data; + RE_FuzzyInfo* fuzzy_info; + RE_CODE* values; + RE_BacktrackData* bt_data; + + state = safe_state->re_state; + + if (!any_error_permitted(state)) { + *node = NULL; + return RE_ERROR_SUCCESS; + } + + data.new_text_pos = *text_pos; + data.new_node = *node; + + fuzzy_info = &state->fuzzy_info; + values = fuzzy_info->node->values; + + if (step == 0) { + if (data.new_node->status & RE_STATUS_REVERSE) { + data.step = -1; + data.limit = state->slice_start; + } else { + data.step = 1; + data.limit = state->slice_end; + } + } else + data.step = step; + + /* Permit insertion except initially when searching (it's better just to + * start searching one character later). 
+ */ + data.permit_insertion = !search || data.new_text_pos != + state->search_anchor; + + for (data.fuzzy_type = 0; data.fuzzy_type < RE_FUZZY_COUNT; + data.fuzzy_type++) { + int status; + + status = next_fuzzy_match_item(state, &data, FALSE, step); + if (status < 0) + return status; + + if (status == RE_ERROR_SUCCESS) + goto found; + } + + *node = NULL; + return RE_ERROR_SUCCESS; + +found: + if (!add_backtrack(safe_state, (*node)->op)) + return RE_ERROR_FAILURE; + bt_data = state->backtrack; + bt_data->fuzzy_item.position.text_pos = *text_pos; + bt_data->fuzzy_item.position.node = *node; + bt_data->fuzzy_item.fuzzy_type = (RE_INT8)data.fuzzy_type; + bt_data->fuzzy_item.step = (RE_INT8)step; + + ++fuzzy_info->counts[data.fuzzy_type]; + ++fuzzy_info->counts[RE_FUZZY_ERR]; + fuzzy_info->total_cost += values[RE_FUZZY_VAL_COST_BASE + data.fuzzy_type]; + ++state->total_errors; + + *text_pos = data.new_text_pos; + *node = data.new_node; + + return RE_ERROR_SUCCESS; +} + +/* Retries a fuzzy match of an item of width 0 or 1. */ +Py_LOCAL_INLINE(int) retry_fuzzy_match_item(RE_SafeState* safe_state, BOOL + search, Py_ssize_t* text_pos, RE_Node** node, BOOL advance) { + RE_State* state; + RE_FuzzyInfo* fuzzy_info; + RE_CODE* values; + RE_BacktrackData* bt_data; + RE_FuzzyData data; + int step; + + state = safe_state->re_state; + fuzzy_info = &state->fuzzy_info; + values = fuzzy_info->node->values; + + bt_data = state->backtrack; + data.new_text_pos = bt_data->fuzzy_item.position.text_pos; + data.new_node = bt_data->fuzzy_item.position.node; + data.fuzzy_type = bt_data->fuzzy_item.fuzzy_type; + data.step = bt_data->fuzzy_item.step; + + if (data.fuzzy_type >= 0) { + --fuzzy_info->counts[data.fuzzy_type]; + --fuzzy_info->counts[RE_FUZZY_ERR]; + fuzzy_info->total_cost -= values[RE_FUZZY_VAL_COST_BASE + + data.fuzzy_type]; + --state->total_errors; + } + + /* Permit insertion except initially when searching (it's better just to + * start searching one character later). 
+ */ + data.permit_insertion = !search || data.new_text_pos != + state->search_anchor; + + step = advance ? data.step : 0; + + for (++data.fuzzy_type; data.fuzzy_type < RE_FUZZY_COUNT; + data.fuzzy_type++) { + int status; + + status = next_fuzzy_match_item(state, &data, FALSE, step); + if (status < 0) + return status; + + if (status == RE_ERROR_SUCCESS) + goto found; + } + + discard_backtrack(state); + *node = NULL; + return RE_ERROR_SUCCESS; + +found: + bt_data->fuzzy_item.fuzzy_type = (RE_INT8)data.fuzzy_type; + + ++fuzzy_info->counts[data.fuzzy_type]; + ++fuzzy_info->counts[RE_FUZZY_ERR]; + fuzzy_info->total_cost += values[RE_FUZZY_VAL_COST_BASE + data.fuzzy_type]; + ++state->total_errors; + + *text_pos = data.new_text_pos; + *node = data.new_node; + + return RE_ERROR_SUCCESS; +} + +/* Tries a fuzzy insertion. */ +Py_LOCAL_INLINE(int) fuzzy_insert(RE_SafeState* safe_state, Py_ssize_t + text_pos, RE_Node* node) { + RE_State* state; + RE_BacktrackData* bt_data; + RE_FuzzyInfo* fuzzy_info; + RE_CODE* values; + + state = safe_state->re_state; + + /* No insertion or deletion. */ + if (!add_backtrack(safe_state, node->op)) + return RE_ERROR_FAILURE; + bt_data = state->backtrack; + bt_data->fuzzy_insert.position.text_pos = text_pos; + bt_data->fuzzy_insert.position.node = node; + bt_data->fuzzy_insert.count = 0; + bt_data->fuzzy_insert.too_few_errors = state->too_few_errors; + bt_data->fuzzy_insert.fuzzy_node = node; /* END_FUZZY node. */ + + /* Check whether there are too few errors. */ + fuzzy_info = &state->fuzzy_info; + + /* The node in this case is the END_FUZZY node. 
*/ + values = node->values; + + if (fuzzy_info->counts[RE_FUZZY_DEL] < values[RE_FUZZY_VAL_MIN_DEL] || + fuzzy_info->counts[RE_FUZZY_INS] < values[RE_FUZZY_VAL_MIN_INS] || + fuzzy_info->counts[RE_FUZZY_SUB] < values[RE_FUZZY_VAL_MIN_SUB] || + fuzzy_info->counts[RE_FUZZY_ERR] < values[RE_FUZZY_VAL_MIN_ERR]) + state->too_few_errors = RE_ERROR_SUCCESS; + + return RE_ERROR_SUCCESS; +} + +/* Retries a fuzzy insertion. */ +Py_LOCAL_INLINE(int) retry_fuzzy_insert(RE_SafeState* safe_state, Py_ssize_t* + text_pos, RE_Node** node) { + RE_State* state; + RE_FuzzyInfo* fuzzy_info; + RE_CODE* values; + RE_BacktrackData* bt_data; + Py_ssize_t new_text_pos; + RE_Node* new_node; + int step; + Py_ssize_t limit; + RE_Node* fuzzy_node; + + state = safe_state->re_state; + fuzzy_info = &state->fuzzy_info; + values = fuzzy_info->node->values; + + bt_data = state->backtrack; + new_text_pos = bt_data->fuzzy_insert.position.text_pos; + new_node = bt_data->fuzzy_insert.position.node; + + if (new_node->status & RE_STATUS_REVERSE) { + step = -1; + limit = state->slice_start; + } else { + step = 1; + limit = state->slice_end; + } + + /* Could the character at text_pos have been inserted? */ + if (!this_error_permitted(state, RE_FUZZY_INS) || new_text_pos == limit) { + size_t count; + + count = bt_data->fuzzy_insert.count; + + fuzzy_info->counts[RE_FUZZY_INS] -= count; + fuzzy_info->counts[RE_FUZZY_ERR] -= count; + fuzzy_info->total_cost -= values[RE_FUZZY_VAL_INS_COST] * count; + state->total_errors -= count; + state->too_few_errors = bt_data->fuzzy_insert.too_few_errors; + + discard_backtrack(state); + *node = NULL; + return RE_ERROR_SUCCESS; + } + + ++bt_data->fuzzy_insert.count; + + ++fuzzy_info->counts[RE_FUZZY_INS]; + ++fuzzy_info->counts[RE_FUZZY_ERR]; + fuzzy_info->total_cost += values[RE_FUZZY_VAL_INS_COST]; + ++state->total_errors; + + /* Check whether there are too few errors. 
*/ + state->too_few_errors = bt_data->fuzzy_insert.too_few_errors; + fuzzy_node = bt_data->fuzzy_insert.fuzzy_node; /* END_FUZZY node. */ + values = fuzzy_node->values; + if (fuzzy_info->counts[RE_FUZZY_DEL] < values[RE_FUZZY_VAL_MIN_DEL] || + fuzzy_info->counts[RE_FUZZY_INS] < values[RE_FUZZY_VAL_MIN_INS] || + fuzzy_info->counts[RE_FUZZY_SUB] < values[RE_FUZZY_VAL_MIN_SUB] || + fuzzy_info->counts[RE_FUZZY_ERR] < values[RE_FUZZY_VAL_MIN_ERR]) + state->too_few_errors = RE_ERROR_SUCCESS; + + *text_pos = new_text_pos + step * (Py_ssize_t)bt_data->fuzzy_insert.count; + *node = new_node; + + return RE_ERROR_SUCCESS; +} + +/* Tries a fuzzy match of a string. */ +Py_LOCAL_INLINE(int) fuzzy_match_string(RE_SafeState* safe_state, BOOL search, + Py_ssize_t* text_pos, RE_Node* node, Py_ssize_t* string_pos, BOOL* matched, + int step) { + RE_State* state; + RE_FuzzyData data; + RE_FuzzyInfo* fuzzy_info; + RE_CODE* values; + RE_BacktrackData* bt_data; + + state = safe_state->re_state; + + if (!any_error_permitted(state)) { + *matched = FALSE; + return RE_ERROR_SUCCESS; + } + + data.new_text_pos = *text_pos; + data.new_string_pos = *string_pos; + data.step = step; + + fuzzy_info = &state->fuzzy_info; + values = fuzzy_info->node->values; + + /* Permit insertion except initially when searching (it's better just to + * start searching one character later). 
+ */ + data.permit_insertion = !search || data.new_text_pos != + state->search_anchor; + + for (data.fuzzy_type = 0; data.fuzzy_type < RE_FUZZY_COUNT; + data.fuzzy_type++) { + int status; + + status = next_fuzzy_match_item(state, &data, TRUE, data.step); + if (status < 0) + return status; + + if (status == RE_ERROR_SUCCESS) + goto found; + } + + *matched = FALSE; + return RE_ERROR_SUCCESS; + +found: + if (!add_backtrack(safe_state, node->op)) + return RE_ERROR_FAILURE; + bt_data = state->backtrack; + bt_data->fuzzy_string.position.text_pos = *text_pos; + bt_data->fuzzy_string.position.node = node; + bt_data->fuzzy_string.string_pos = *string_pos; + bt_data->fuzzy_string.fuzzy_type = (RE_INT8)data.fuzzy_type; + bt_data->fuzzy_string.step = (RE_INT8)step; + + ++fuzzy_info->counts[data.fuzzy_type]; + ++fuzzy_info->counts[RE_FUZZY_ERR]; + fuzzy_info->total_cost += values[RE_FUZZY_VAL_COST_BASE + data.fuzzy_type]; + ++state->total_errors; + + *text_pos = data.new_text_pos; + *string_pos = data.new_string_pos; + *matched = TRUE; + + return RE_ERROR_SUCCESS; +} + +/* Retries a fuzzy match of a string. 
*/ +Py_LOCAL_INLINE(int) retry_fuzzy_match_string(RE_SafeState* safe_state, BOOL + search, Py_ssize_t* text_pos, RE_Node** node, Py_ssize_t* string_pos, BOOL* + matched) { + RE_State* state; + RE_FuzzyInfo* fuzzy_info; + RE_CODE* values; + RE_BacktrackData* bt_data; + RE_FuzzyData data; + RE_Node* new_node; + + state = safe_state->re_state; + fuzzy_info = &state->fuzzy_info; + values = fuzzy_info->node->values; + + bt_data = state->backtrack; + data.new_text_pos = bt_data->fuzzy_string.position.text_pos; + new_node = bt_data->fuzzy_string.position.node; + data.new_string_pos = bt_data->fuzzy_string.string_pos; + data.fuzzy_type = bt_data->fuzzy_string.fuzzy_type; + data.step = bt_data->fuzzy_string.step; + + --fuzzy_info->counts[data.fuzzy_type]; + --fuzzy_info->counts[RE_FUZZY_ERR]; + fuzzy_info->total_cost -= values[RE_FUZZY_VAL_COST_BASE + data.fuzzy_type]; + --state->total_errors; + + /* Permit insertion except initially when searching (it's better just to + * start searching one character later). + */ + data.permit_insertion = !search || data.new_text_pos != + state->search_anchor; + + for (++data.fuzzy_type; data.fuzzy_type < RE_FUZZY_COUNT; + data.fuzzy_type++) { + int status; + + status = next_fuzzy_match_item(state, &data, TRUE, data.step); + if (status < 0) + return status; + + if (status == RE_ERROR_SUCCESS) + goto found; + } + + discard_backtrack(state); + *matched = FALSE; + return RE_ERROR_SUCCESS; + +found: + bt_data->fuzzy_string.fuzzy_type = (RE_INT8)data.fuzzy_type; + + ++fuzzy_info->counts[data.fuzzy_type]; + ++fuzzy_info->counts[RE_FUZZY_ERR]; + fuzzy_info->total_cost += values[RE_FUZZY_VAL_COST_BASE + data.fuzzy_type]; + ++state->total_errors; + + *text_pos = data.new_text_pos; + *node = new_node; + *string_pos = data.new_string_pos; + *matched = TRUE; + + return RE_ERROR_SUCCESS; +} + +/* Checks a fuzzy match of a string. 
*/
+Py_LOCAL_INLINE(int) next_fuzzy_match_string_fld(RE_State* state, RE_FuzzyData*
+  data) {
+    int new_pos;
+    /* Tests whether the error type in data->fuzzy_type can be applied at the
+     * current position of a case-folded string match and, if so, advances the
+     * relevant positions in 'data' by data->step.
+     *
+     * Returns RE_ERROR_SUCCESS when the error is accepted, RE_ERROR_FAILURE
+     * when it is not permitted, or whatever check_fuzzy_partial returns when
+     * the new folded position falls outside 0..folded_len.
+     */
+    if (this_error_permitted(state, data->fuzzy_type)) {
+        switch (data->fuzzy_type) {
+        case RE_FUZZY_DEL:
+            /* Could a character at text_pos have been deleted? */
+            data->new_string_pos += data->step;
+            return RE_ERROR_SUCCESS;
+        case RE_FUZZY_INS:
+            /* Could the character at text_pos have been inserted? */
+            if (!data->permit_insertion)
+                return RE_ERROR_FAILURE;
+
+            new_pos = data->new_folded_pos + data->step;
+            if (0 <= new_pos && new_pos <= data->folded_len) {
+                data->new_folded_pos = new_pos;
+                return RE_ERROR_SUCCESS;
+            }
+
+            return check_fuzzy_partial(state, new_pos);
+        case RE_FUZZY_SUB:
+            /* Could the character at text_pos have been substituted? */
+            new_pos = data->new_folded_pos + data->step;
+            if (0 <= new_pos && new_pos <= data->folded_len) {
+                data->new_folded_pos = new_pos;
+                data->new_string_pos += data->step;
+                return RE_ERROR_SUCCESS;
+            }
+
+            return check_fuzzy_partial(state, new_pos);
+        }
+    }
+
+    return RE_ERROR_FAILURE;
+}
+
+/* Tries a fuzzy match of a string, ignoring case.
+ *
+ * If no error is permitted at all, *matched is set to FALSE and
+ * RE_ERROR_SUCCESS is returned. Otherwise each error type from 0 up to
+ * RE_FUZZY_COUNT is tried in turn with next_fuzzy_match_string_fld. For the
+ * first one that is accepted, a backtrack entry tagged with node->op is pushed
+ * that records the caller's original positions, the error type and the step
+ * (so that retry_fuzzy_match_string_fld can try the next type later), the
+ * error counts and cost are updated, and the new positions are written back
+ * through string_pos and folded_pos with *matched set to TRUE.
+ *
+ * Returns a negative status from next_fuzzy_match_string_fld unchanged, and
+ * RE_ERROR_FAILURE if the backtrack entry can't be added.
+ * NOTE(review): other add_backtrack callers in this file report
+ * RE_ERROR_BACKTRACKING on failure; confirm RE_ERROR_FAILURE is intended.
+ */
+Py_LOCAL_INLINE(int) fuzzy_match_string_fld(RE_SafeState* safe_state, BOOL
+  search, Py_ssize_t* text_pos, RE_Node* node, Py_ssize_t* string_pos, int*
+  folded_pos, int folded_len, BOOL* matched, int step) {
+    RE_State* state;
+    Py_ssize_t new_text_pos;
+    RE_FuzzyData data;
+    RE_FuzzyInfo* fuzzy_info;
+    RE_CODE* values;
+    RE_BacktrackData* bt_data;
+
+    state = safe_state->re_state;
+
+    if (!any_error_permitted(state)) {
+        *matched = FALSE;
+        return RE_ERROR_SUCCESS;
+    }
+
+    new_text_pos = *text_pos;
+    data.new_string_pos = *string_pos;
+    data.new_folded_pos = *folded_pos;
+    data.folded_len = folded_len;
+    data.step = step;
+
+    fuzzy_info = &state->fuzzy_info;
+    values = fuzzy_info->node->values;
+
+    /* Permit insertion except initially when searching (it's better just to
+     * start searching one character later).
+     *
+     * Insertion is also permitted once the folded position has moved away
+     * from its starting end (0 when stepping forwards, folded_len when
+     * stepping backwards).
+     * NOTE(review): RE_ERROR_SUCCESS is assigned below where a boolean is
+     * expected; presumably TRUE was meant, since the flag is only tested for
+     * truth -- confirm.
+     */
+    data.permit_insertion = !search || new_text_pos != state->search_anchor;
+    if (step > 0) {
+        if (data.new_folded_pos != 0)
+            data.permit_insertion = RE_ERROR_SUCCESS;
+    } else {
+        if (data.new_folded_pos != folded_len)
+            data.permit_insertion = RE_ERROR_SUCCESS;
+    }
+
+    for (data.fuzzy_type = 0; data.fuzzy_type < RE_FUZZY_COUNT;
+      data.fuzzy_type++) {
+        int status;
+
+        status = next_fuzzy_match_string_fld(state, &data);
+        if (status < 0)
+            return status;
+
+        if (status == RE_ERROR_SUCCESS)
+            goto found;
+    }
+
+    *matched = FALSE;
+    return RE_ERROR_SUCCESS;
+
+found:
+    if (!add_backtrack(safe_state, node->op))
+        return RE_ERROR_FAILURE;
+    bt_data = state->backtrack;
+    bt_data->fuzzy_string.position.text_pos = *text_pos;
+    bt_data->fuzzy_string.position.node = node;
+    bt_data->fuzzy_string.string_pos = *string_pos;
+    bt_data->fuzzy_string.folded_pos = (RE_INT8)(*folded_pos); /* Narrowed to RE_INT8. */
+    bt_data->fuzzy_string.folded_len = (RE_INT8)folded_len;
+    bt_data->fuzzy_string.fuzzy_type = (RE_INT8)data.fuzzy_type;
+    bt_data->fuzzy_string.step = (RE_INT8)step;
+    /* Account for the chosen error type. */
+    ++fuzzy_info->counts[data.fuzzy_type];
+    ++fuzzy_info->counts[RE_FUZZY_ERR];
+    fuzzy_info->total_cost += values[RE_FUZZY_VAL_COST_BASE + data.fuzzy_type];
+    ++state->total_errors;
+
+    *text_pos = new_text_pos;
+    *string_pos = data.new_string_pos;
+    *folded_pos = data.new_folded_pos;
+    *matched = TRUE;
+
+    return RE_ERROR_SUCCESS;
+}
+
+/* Retries a fuzzy match of a string, ignoring case.
*/
+Py_LOCAL_INLINE(int) retry_fuzzy_match_string_fld(RE_SafeState* safe_state,
+  BOOL search, Py_ssize_t* text_pos, RE_Node** node, Py_ssize_t* string_pos,
+  int* folded_pos, BOOL* matched) {
+    RE_State* state;
+    RE_FuzzyInfo* fuzzy_info;
+    RE_CODE* values;
+    RE_BacktrackData* bt_data;
+    Py_ssize_t new_text_pos;
+    RE_Node* new_node;
+    RE_FuzzyData data;
+    /* Resumes from the current backtrack entry (state->backtrack): reloads
+     * the saved positions, folded position/length, error type and step, takes
+     * back the accounting for the error type that was tried last, and then
+     * tries the error types that follow it.
+     *
+     * On success the chosen type is written back to the entry, the output
+     * positions are updated and *matched is set to TRUE; otherwise the entry
+     * is discarded and *matched is set to FALSE. RE_ERROR_SUCCESS is returned
+     * in both cases; a negative status from next_fuzzy_match_string_fld is
+     * returned unchanged.
+     */
+    state = safe_state->re_state;
+    fuzzy_info = &state->fuzzy_info;
+    values = fuzzy_info->node->values;
+
+    bt_data = state->backtrack;
+    new_text_pos = bt_data->fuzzy_string.position.text_pos;
+    new_node = bt_data->fuzzy_string.position.node;
+    data.new_string_pos = bt_data->fuzzy_string.string_pos;
+    data.new_folded_pos = bt_data->fuzzy_string.folded_pos;
+    data.folded_len = bt_data->fuzzy_string.folded_len;
+    data.fuzzy_type = bt_data->fuzzy_string.fuzzy_type;
+    data.step = bt_data->fuzzy_string.step;
+    /* Take back the accounting for the error type that was tried last. */
+    --fuzzy_info->counts[data.fuzzy_type];
+    --fuzzy_info->counts[RE_FUZZY_ERR];
+    fuzzy_info->total_cost -= values[RE_FUZZY_VAL_COST_BASE + data.fuzzy_type];
+    --state->total_errors;
+
+    /* Permit insertion except initially when searching (it's better just to
+     * start searching one character later).
+     *
+     * NOTE(review): RE_ERROR_SUCCESS is assigned below where a boolean is
+     * expected; presumably TRUE was meant -- confirm.
+     */
+    data.permit_insertion = !search || new_text_pos != state->search_anchor;
+    if (data.step > 0) {
+        if (data.new_folded_pos != 0)
+            data.permit_insertion = RE_ERROR_SUCCESS;
+    } else {
+        if (data.new_folded_pos != bt_data->fuzzy_string.folded_len)
+            data.permit_insertion = RE_ERROR_SUCCESS;
+    }
+    /* Try the remaining error types, starting after the previous one. */
+    for (++data.fuzzy_type; data.fuzzy_type < RE_FUZZY_COUNT;
+      data.fuzzy_type++) {
+        int status;
+
+        status = next_fuzzy_match_string_fld(state, &data);
+        if (status < 0)
+            return status;
+
+        if (status == RE_ERROR_SUCCESS)
+            goto found;
+    }
+    /* No error type is left to try: drop the backtrack entry. */
+    discard_backtrack(state);
+    *matched = FALSE;
+    return RE_ERROR_SUCCESS;
+
+found:
+    bt_data->fuzzy_string.fuzzy_type = (RE_INT8)data.fuzzy_type;
+    /* Account for the newly chosen error type. */
+    ++fuzzy_info->counts[data.fuzzy_type];
+    ++fuzzy_info->counts[RE_FUZZY_ERR];
+    fuzzy_info->total_cost += values[RE_FUZZY_VAL_COST_BASE + data.fuzzy_type];
+    ++state->total_errors;
+
+    *text_pos = new_text_pos;
+    *node = new_node;
+    *string_pos = data.new_string_pos;
+    *folded_pos = data.new_folded_pos;
+    *matched = TRUE;
+
+    return RE_ERROR_SUCCESS;
+}
+
+/* Checks a fuzzy match of a group reference, ignoring case.
+ *
+ * Tests whether the error type in data->fuzzy_type can be applied and, if so,
+ * advances the folded text position and/or the folded group position in
+ * 'data' by data->step. Returns RE_ERROR_SUCCESS when the error is accepted,
+ * RE_ERROR_FAILURE when it is not permitted, or whatever check_fuzzy_partial
+ * returns when the new folded position falls outside 0..folded_len.
+ */
+Py_LOCAL_INLINE(int) next_fuzzy_match_group_fld(RE_State* state, RE_FuzzyData*
+  data) {
+    int new_pos;
+
+    if (this_error_permitted(state, data->fuzzy_type)) {
+        switch (data->fuzzy_type) {
+        case RE_FUZZY_DEL:
+            /* Could a character at text_pos have been deleted? */
+            data->new_gfolded_pos += data->step;
+            return RE_ERROR_SUCCESS;
+        case RE_FUZZY_INS:
+            /* Could the character at text_pos have been inserted? */
+            if (!data->permit_insertion)
+                return RE_ERROR_FAILURE;
+
+            new_pos = data->new_folded_pos + data->step;
+            if (0 <= new_pos && new_pos <= data->folded_len) {
+                data->new_folded_pos = new_pos;
+                return RE_ERROR_SUCCESS;
+            }
+
+            return check_fuzzy_partial(state, new_pos);
+        case RE_FUZZY_SUB:
+            /* Could the character at text_pos have been substituted? */
+            new_pos = data->new_folded_pos + data->step;
+            if (0 <= new_pos && new_pos <= data->folded_len) {
+                data->new_folded_pos = new_pos;
+                data->new_gfolded_pos += data->step;
+                return RE_ERROR_SUCCESS;
+            }
+
+            return check_fuzzy_partial(state, new_pos);
+        }
+    }
+
+    return RE_ERROR_FAILURE;
+}
+
+/* Tries a fuzzy match of a group reference, ignoring case.
+ *
+ * If no error is permitted at all, *matched is set to FALSE and
+ * RE_ERROR_SUCCESS is returned. Otherwise each error type from 0 up to
+ * RE_FUZZY_COUNT is tried in turn with next_fuzzy_match_group_fld. For the
+ * first one that is accepted, a backtrack entry tagged with node->op is pushed
+ * that records the caller's original positions, the folded lengths, the error
+ * type and the step, the error counts and cost are updated, and the new
+ * folded positions are written back with *matched set to TRUE.
+ *
+ * Returns a negative status from next_fuzzy_match_group_fld unchanged, and
+ * RE_ERROR_FAILURE if the backtrack entry can't be added.
+ */
+Py_LOCAL_INLINE(int) fuzzy_match_group_fld(RE_SafeState* safe_state, BOOL
+  search, Py_ssize_t* text_pos, RE_Node* node, int* folded_pos, int folded_len,
+  Py_ssize_t* group_pos, int* gfolded_pos, int gfolded_len, BOOL* matched, int
+  step) {
+    RE_State* state;
+    Py_ssize_t new_text_pos;
+    RE_FuzzyData data;
+    Py_ssize_t new_group_pos;
+    RE_FuzzyInfo* fuzzy_info;
+    RE_CODE* values;
+    RE_BacktrackData* bt_data;
+
+    state = safe_state->re_state;
+
+    if (!any_error_permitted(state)) {
+        *matched = FALSE;
+        return RE_ERROR_SUCCESS;
+    }
+
+    new_text_pos = *text_pos;
+    data.new_folded_pos = *folded_pos;
+    data.folded_len = folded_len;
+    new_group_pos = *group_pos;
+    data.new_gfolded_pos = *gfolded_pos;
+    data.step = step;
+
+    fuzzy_info = &state->fuzzy_info;
+    values = fuzzy_info->node->values;
+
+    /* Permit insertion except initially when searching (it's better just to
+     * start searching one character later).
+     *
+     * NOTE(review): RE_ERROR_SUCCESS is assigned below where a boolean is
+     * expected; presumably TRUE was meant -- confirm.
+     */
+    data.permit_insertion = !search || new_text_pos != state->search_anchor;
+    if (data.step > 0) {
+        if (data.new_folded_pos != 0)
+            data.permit_insertion = RE_ERROR_SUCCESS;
+    } else {
+        if (data.new_folded_pos != folded_len)
+            data.permit_insertion = RE_ERROR_SUCCESS;
+    }
+
+    for (data.fuzzy_type = 0; data.fuzzy_type < RE_FUZZY_COUNT;
+      data.fuzzy_type++) {
+        int status;
+
+        status = next_fuzzy_match_group_fld(state, &data);
+        if (status < 0)
+            return status;
+
+        if (status == RE_ERROR_SUCCESS)
+            goto found;
+    }
+
+    *matched = FALSE;
+    return RE_ERROR_SUCCESS;
+
+found:
+    if (!add_backtrack(safe_state, node->op))
+        return RE_ERROR_FAILURE;
+    bt_data = state->backtrack;
+    bt_data->fuzzy_string.position.text_pos = *text_pos;
+    bt_data->fuzzy_string.position.node = node;
+    bt_data->fuzzy_string.string_pos = *group_pos; /* The group position is kept in the string_pos slot. */
+    bt_data->fuzzy_string.folded_pos = (RE_INT8)(*folded_pos);
+    bt_data->fuzzy_string.folded_len = (RE_INT8)folded_len;
+    bt_data->fuzzy_string.gfolded_pos = (RE_INT8)(*gfolded_pos);
+    bt_data->fuzzy_string.gfolded_len = (RE_INT8)gfolded_len;
+    bt_data->fuzzy_string.fuzzy_type = (RE_INT8)data.fuzzy_type;
+    bt_data->fuzzy_string.step = (RE_INT8)step;
+    /* Account for the chosen error type. */
+    ++fuzzy_info->counts[data.fuzzy_type];
+    ++fuzzy_info->counts[RE_FUZZY_ERR];
+    fuzzy_info->total_cost += values[RE_FUZZY_VAL_COST_BASE + data.fuzzy_type];
+    ++state->total_errors;
+
+    *text_pos = new_text_pos;
+    *group_pos = new_group_pos;
+    *folded_pos = data.new_folded_pos;
+    *gfolded_pos = data.new_gfolded_pos;
+    *matched = TRUE;
+
+    return RE_ERROR_SUCCESS;
+}
+
+/* Retries a fuzzy match of a group reference, ignoring case.
*/ +Py_LOCAL_INLINE(int) retry_fuzzy_match_group_fld(RE_SafeState* safe_state, BOOL + search, Py_ssize_t* text_pos, RE_Node** node, int* folded_pos, Py_ssize_t* + group_pos, int* gfolded_pos, BOOL* matched) { + RE_State* state; + RE_FuzzyInfo* fuzzy_info; + RE_CODE* values; + RE_BacktrackData* bt_data; + Py_ssize_t new_text_pos; + RE_Node* new_node; + Py_ssize_t new_group_pos; + RE_FuzzyData data; + + state = safe_state->re_state; + fuzzy_info = &state->fuzzy_info; + values = fuzzy_info->node->values; + + bt_data = state->backtrack; + new_text_pos = bt_data->fuzzy_string.position.text_pos; + new_node = bt_data->fuzzy_string.position.node; + new_group_pos = bt_data->fuzzy_string.string_pos; + data.new_folded_pos = bt_data->fuzzy_string.folded_pos; + data.folded_len = bt_data->fuzzy_string.folded_len; + data.new_gfolded_pos = bt_data->fuzzy_string.gfolded_pos; + data.fuzzy_type = bt_data->fuzzy_string.fuzzy_type; + data.step = bt_data->fuzzy_string.step; + + --fuzzy_info->counts[data.fuzzy_type]; + --fuzzy_info->counts[RE_FUZZY_ERR]; + fuzzy_info->total_cost -= values[RE_FUZZY_VAL_COST_BASE + data.fuzzy_type]; + --state->total_errors; + + /* Permit insertion except initially when searching (it's better just to + * start searching one character later). 
+ */ + data.permit_insertion = !search || new_text_pos != state->search_anchor || + data.new_folded_pos != bt_data->fuzzy_string.folded_len; + + for (++data.fuzzy_type; data.fuzzy_type < RE_FUZZY_COUNT; + data.fuzzy_type++) { + int status; + + status = next_fuzzy_match_group_fld(state, &data); + if (status < 0) + return status; + + if (status == RE_ERROR_SUCCESS) + goto found; + } + + discard_backtrack(state); + *matched = FALSE; + return RE_ERROR_SUCCESS; + +found: + bt_data->fuzzy_string.fuzzy_type = (RE_INT8)data.fuzzy_type; + + ++fuzzy_info->counts[data.fuzzy_type]; + ++fuzzy_info->counts[RE_FUZZY_ERR]; + fuzzy_info->total_cost += values[RE_FUZZY_VAL_COST_BASE + data.fuzzy_type]; + ++state->total_errors; + + *text_pos = new_text_pos; + *node = new_node; + *group_pos = new_group_pos; + *folded_pos = data.new_folded_pos; + *gfolded_pos = data.new_gfolded_pos; + *matched = TRUE; + + return RE_ERROR_SUCCESS; +} + +/* Locates the required string, if there's one. */ +Py_LOCAL_INLINE(Py_ssize_t) locate_required_string(RE_SafeState* safe_state, + BOOL search) { + RE_State* state; + PatternObject* pattern; + Py_ssize_t found_pos; + Py_ssize_t end_pos; + + state = safe_state->re_state; + pattern = state->pattern; + + if (!pattern->req_string) + /* There isn't a required string, so start matching from the current + * position. + */ + return state->text_pos; + + /* Search for the required string and calculate where to start matching. */ + switch (pattern->req_string->op) { + case RE_OP_STRING: + { + BOOL is_partial; + Py_ssize_t limit; + + if (search || pattern->req_offset < 0) + limit = state->slice_end; + else { + limit = state->slice_start + pattern->req_offset + + (Py_ssize_t)pattern->req_string->value_count; + if (limit > state->slice_end || limit < 0) + limit = state->slice_end; + } + + if (state->req_pos < 0 || state->text_pos > state->req_pos) + /* First time or already passed it. 
*/ + found_pos = string_search(safe_state, pattern->req_string, + state->text_pos, limit, &is_partial); + else { + found_pos = state->req_pos; + is_partial = FALSE; + } + + if (found_pos < 0) + /* The required string wasn't found. */ + return -1; + + if (!is_partial) { + /* Record where the required string matched. */ + state->req_pos = found_pos; + state->req_end = found_pos + + (Py_ssize_t)pattern->req_string->value_count; + } + + if (pattern->req_offset >= 0) { + /* Step back from the required string to where we should start + * matching. + */ + found_pos -= pattern->req_offset; + if (found_pos >= state->text_pos) + return found_pos; + } + break; + } + case RE_OP_STRING_FLD: + { + BOOL is_partial; + Py_ssize_t limit; + + if (search || pattern->req_offset < 0) + limit = state->slice_end; + else { + limit = state->slice_start + pattern->req_offset + + (Py_ssize_t)pattern->req_string->value_count; + if (limit > state->slice_end || limit < 0) + limit = state->slice_end; + } + + if (state->req_pos < 0 || state->text_pos > state->req_pos) + /* First time or already passed it. */ + found_pos = string_search_fld(safe_state, pattern->req_string, + state->text_pos, limit, &end_pos, &is_partial); + else { + found_pos = state->req_pos; + is_partial = FALSE; + } + + if (found_pos < 0) + /* The required string wasn't found. */ + return -1; + + if (!is_partial) { + /* Record where the required string matched. */ + state->req_pos = found_pos; + state->req_end = end_pos; + } + + if (pattern->req_offset >= 0) { + /* Step back from the required string to where we should start + * matching. 
+ */ + found_pos -= pattern->req_offset; + if (found_pos >= state->text_pos) + return found_pos; + } + break; + } + case RE_OP_STRING_FLD_REV: + { + BOOL is_partial; + Py_ssize_t limit; + + if (search || pattern->req_offset < 0) + limit = state->slice_start; + else { + limit = state->slice_end - pattern->req_offset - + (Py_ssize_t)pattern->req_string->value_count; + if (limit < state->slice_start) + limit = state->slice_start; + } + + if (state->req_pos < 0 || state->text_pos < state->req_pos) + /* First time or already passed it. */ + found_pos = string_search_fld_rev(safe_state, pattern->req_string, + state->text_pos, limit, &end_pos, &is_partial); + else { + found_pos = state->req_pos; + is_partial = FALSE; + } + + if (found_pos < 0) + /* The required string wasn't found. */ + return -1; + + if (!is_partial) { + /* Record where the required string matched. */ + state->req_pos = found_pos; + state->req_end = end_pos; + } + + if (pattern->req_offset >= 0) { + /* Step back from the required string to where we should start + * matching. + */ + found_pos += pattern->req_offset; + if (found_pos <= state->text_pos) + return found_pos; + } + break; + } + case RE_OP_STRING_IGN: + { + BOOL is_partial; + Py_ssize_t limit; + + if (search || pattern->req_offset < 0) + limit = state->slice_end; + else { + limit = state->slice_start + pattern->req_offset + + (Py_ssize_t)pattern->req_string->value_count; + if (limit > state->slice_end || limit < 0) + limit = state->slice_end; + } + + if (state->req_pos < 0 || state->text_pos > state->req_pos) + /* First time or already passed it. */ + found_pos = string_search_ign(safe_state, pattern->req_string, + state->text_pos, limit, &is_partial); + else { + found_pos = state->req_pos; + is_partial = FALSE; + } + + if (found_pos < 0) + /* The required string wasn't found. */ + return -1; + + if (!is_partial) { + /* Record where the required string matched. 
*/ + state->req_pos = found_pos; + state->req_end = found_pos + + (Py_ssize_t)pattern->req_string->value_count; + } + + if (pattern->req_offset >= 0) { + /* Step back from the required string to where we should start + * matching. + */ + found_pos -= pattern->req_offset; + if (found_pos >= state->text_pos) + return found_pos; + } + break; + } + case RE_OP_STRING_IGN_REV: + { + BOOL is_partial; + Py_ssize_t limit; + + if (search || pattern->req_offset < 0) + limit = state->slice_start; + else { + limit = state->slice_end - pattern->req_offset - + (Py_ssize_t)pattern->req_string->value_count; + if (limit < state->slice_start) + limit = state->slice_start; + } + + if (state->req_pos < 0 || state->text_pos < state->req_pos) + /* First time or already passed it. */ + found_pos = string_search_ign_rev(safe_state, pattern->req_string, + state->text_pos, limit, &is_partial); + else { + found_pos = state->req_pos; + is_partial = FALSE; + } + + if (found_pos < 0) + /* The required string wasn't found. */ + return -1; + + if (!is_partial) { + /* Record where the required string matched. */ + state->req_pos = found_pos; + state->req_end = found_pos - + (Py_ssize_t)pattern->req_string->value_count; + } + + if (pattern->req_offset >= 0) { + /* Step back from the required string to where we should start + * matching. + */ + found_pos += pattern->req_offset; + if (found_pos <= state->text_pos) + return found_pos; + } + break; + } + case RE_OP_STRING_REV: + { + BOOL is_partial; + Py_ssize_t limit; + + if (search || pattern->req_offset < 0) + limit = state->slice_start; + else { + limit = state->slice_end - pattern->req_offset - + (Py_ssize_t)pattern->req_string->value_count; + if (limit < state->slice_start) + limit = state->slice_start; + } + + if (state->req_pos < 0 || state->text_pos < state->req_pos) + /* First time or already passed it. 
*/ + found_pos = string_search_rev(safe_state, pattern->req_string, + state->text_pos, limit, &is_partial); + else { + found_pos = state->req_pos; + is_partial = FALSE; + } + + if (found_pos < 0) + /* The required string wasn't found. */ + return -1; + + if (!is_partial) { + /* Record where the required string matched. */ + state->req_pos = found_pos; + state->req_end = found_pos - + (Py_ssize_t)pattern->req_string->value_count; + } + + if (pattern->req_offset >= 0) { + /* Step back from the required string to where we should start + * matching. + */ + found_pos += pattern->req_offset; + if (found_pos <= state->text_pos) + return found_pos; + } + break; + } + } + + /* Start matching from the current position. */ + return state->text_pos; +} + +/* Tries to match a character pattern. */ +Py_LOCAL_INLINE(int) match_one(RE_State* state, RE_Node* node, Py_ssize_t + text_pos) { + switch (node->op) { + case RE_OP_ANY: + return try_match_ANY(state, node, text_pos); + case RE_OP_ANY_ALL: + return try_match_ANY_ALL(state, node, text_pos); + case RE_OP_ANY_ALL_REV: + return try_match_ANY_ALL_REV(state, node, text_pos); + case RE_OP_ANY_REV: + return try_match_ANY_REV(state, node, text_pos); + case RE_OP_ANY_U: + return try_match_ANY_U(state, node, text_pos); + case RE_OP_ANY_U_REV: + return try_match_ANY_U_REV(state, node, text_pos); + case RE_OP_CHARACTER: + return try_match_CHARACTER(state, node, text_pos); + case RE_OP_CHARACTER_IGN: + return try_match_CHARACTER_IGN(state, node, text_pos); + case RE_OP_CHARACTER_IGN_REV: + return try_match_CHARACTER_IGN_REV(state, node, text_pos); + case RE_OP_CHARACTER_REV: + return try_match_CHARACTER_REV(state, node, text_pos); + case RE_OP_PROPERTY: + return try_match_PROPERTY(state, node, text_pos); + case RE_OP_PROPERTY_IGN: + return try_match_PROPERTY_IGN(state, node, text_pos); + case RE_OP_PROPERTY_IGN_REV: + return try_match_PROPERTY_IGN_REV(state, node, text_pos); + case RE_OP_PROPERTY_REV: + return try_match_PROPERTY_REV(state, 
node, text_pos); + case RE_OP_RANGE: + return try_match_RANGE(state, node, text_pos); + case RE_OP_RANGE_IGN: + return try_match_RANGE_IGN(state, node, text_pos); + case RE_OP_RANGE_IGN_REV: + return try_match_RANGE_IGN_REV(state, node, text_pos); + case RE_OP_RANGE_REV: + return try_match_RANGE_REV(state, node, text_pos); + case RE_OP_SET_DIFF: + case RE_OP_SET_INTER: + case RE_OP_SET_SYM_DIFF: + case RE_OP_SET_UNION: + return try_match_SET(state, node, text_pos); + case RE_OP_SET_DIFF_IGN: + case RE_OP_SET_INTER_IGN: + case RE_OP_SET_SYM_DIFF_IGN: + case RE_OP_SET_UNION_IGN: + return try_match_SET_IGN(state, node, text_pos); + case RE_OP_SET_DIFF_IGN_REV: + case RE_OP_SET_INTER_IGN_REV: + case RE_OP_SET_SYM_DIFF_IGN_REV: + case RE_OP_SET_UNION_IGN_REV: + return try_match_SET_IGN_REV(state, node, text_pos); + case RE_OP_SET_DIFF_REV: + case RE_OP_SET_INTER_REV: + case RE_OP_SET_SYM_DIFF_REV: + case RE_OP_SET_UNION_REV: + return try_match_SET_REV(state, node, text_pos); + } + + return FALSE; +} + +/* Tests whether 2 nodes contains the same values. */ +Py_LOCAL_INLINE(BOOL) same_values(RE_Node* node_1, RE_Node* node_2) { + size_t i; + + if (node_1->value_count != node_2->value_count) + return FALSE; + + for (i = 0; i < node_1->value_count; i++) { + if (node_1->values[i] != node_2->values[i]) + return FALSE; + } + + return TRUE; +} + +/* Tests whether 2 nodes are equivalent (both string-like in the same way). 
*/ +Py_LOCAL_INLINE(BOOL) equivalent_nodes(RE_Node* node_1, RE_Node* node_2) { + switch (node_1->op) { + case RE_OP_CHARACTER: + case RE_OP_STRING: + switch (node_2->op) { + case RE_OP_CHARACTER: + case RE_OP_STRING: + return same_values(node_1, node_2); + } + break; + case RE_OP_CHARACTER_IGN: + case RE_OP_STRING_IGN: + switch (node_2->op) { + case RE_OP_CHARACTER_IGN: + case RE_OP_STRING_IGN: + return same_values(node_1, node_2); + } + break; + case RE_OP_CHARACTER_IGN_REV: + case RE_OP_STRING_IGN_REV: + switch (node_2->op) { + case RE_OP_CHARACTER_IGN_REV: + case RE_OP_STRING_IGN_REV: + return same_values(node_1, node_2); + } + break; + case RE_OP_CHARACTER_REV: + case RE_OP_STRING_REV: + switch (node_2->op) { + case RE_OP_CHARACTER_REV: + case RE_OP_STRING_REV: + return same_values(node_1, node_2); + } + break; + } + + return FALSE; +} + +/* Prunes the backtracking. */ +Py_LOCAL_INLINE(void) prune_backtracking(RE_State* state) { + RE_AtomicBlock* current; + + current = state->current_atomic_block; + if (current && current->count > 0) { + /* In an atomic group or a lookaround. */ + RE_AtomicData* atomic; + + /* Discard any backtracking info from inside the atomic group or + * lookaround. + */ + atomic = ¤t->items[current->count - 1]; + state->current_backtrack_block = atomic->current_backtrack_block; + state->current_backtrack_block->count = atomic->backtrack_count; + } else { + /* In the outermost pattern. */ + while (state->current_backtrack_block->previous) + state->current_backtrack_block = + state->current_backtrack_block->previous; + + /* Keep the bottom FAILURE on the backtracking stack. */ + state->current_backtrack_block->count = 1; + } +} + +/* Saves the match as the best POSIX match (leftmost longest) found so far. 
*/
+Py_LOCAL_INLINE(BOOL) save_best_match(RE_SafeState* safe_state) {
+    RE_State* state;
+    size_t group_count;
+    size_t g;
+
+    /* Copies the current match position, text position, fuzzy counts and
+     * (if the pattern has groups) every group's span and captures into the
+     * 'best' fields of the state, allocating the storage on first use.
+     *
+     * Returns TRUE on success, or FALSE if an allocation fails. The GIL is
+     * held only while the group storage is being allocated and copied.
+     */
+    state = safe_state->re_state;
+
+    state->best_match_pos = state->match_pos;
+    state->best_text_pos = state->text_pos;
+    state->found_match = TRUE;
+
+    memmove(state->best_fuzzy_counts, state->total_fuzzy_counts,
+      sizeof(state->total_fuzzy_counts));
+
+    group_count = state->pattern->true_group_count;
+    if (group_count == 0)
+        return TRUE;
+
+    acquire_GIL(safe_state);
+
+    if (!state->best_match_groups) {
+        /* Allocate storage for the groups of the best match. */
+        state->best_match_groups = (RE_GroupData*)re_alloc(group_count *
+          sizeof(RE_GroupData));
+        if (!state->best_match_groups)
+            goto error;
+
+        memset(state->best_match_groups, 0, group_count *
+          sizeof(RE_GroupData));
+
+        for (g = 0; g < group_count; g++) {
+            RE_GroupData* best;
+            RE_GroupData* group;
+
+            best = &state->best_match_groups[g];
+            group = &state->groups[g];
+
+            best->captures = (RE_GroupSpan*)re_alloc(group->capture_capacity *
+              sizeof(RE_GroupSpan));
+            if (!best->captures)
+                goto error;
+
+            /* Record the capacity only once the storage really exists, so
+             * that a failed allocation can't leave a non-zero capacity paired
+             * with a NULL buffer.
+             */
+            best->capture_capacity = group->capture_capacity;
+        }
+    }
+
+    /* Copy the group spans and captures. */
+    for (g = 0; g < group_count; g++) {
+        RE_GroupData* best;
+        RE_GroupData* group;
+
+        best = &state->best_match_groups[g];
+        group = &state->groups[g];
+
+        if (best->capture_capacity < group->capture_count) {
+            RE_GroupSpan* new_captures;
+
+            /* We need more space for the captures. Allocate the new buffer
+             * before releasing the old one so that a failure leaves the saved
+             * data consistent.
+             * NOTE(review): assumes re_dealloc accepts NULL, as the original
+             * code path already relied on -- confirm.
+             */
+            new_captures = (RE_GroupSpan*)re_alloc(group->capture_count *
+              sizeof(RE_GroupSpan));
+            if (!new_captures)
+                goto error;
+
+            re_dealloc(best->captures);
+            best->captures = new_captures;
+            best->capture_capacity = group->capture_count;
+        }
+
+        best->span = group->span;
+        best->capture_count = group->capture_count;
+
+        /* Copy the captures for this group. */
+        memmove(best->captures, group->captures, group->capture_count *
+          sizeof(RE_GroupSpan));
+    }
+
+    release_GIL(safe_state);
+
+    return TRUE;
+
+error:
+    release_GIL(safe_state);
+
+    return FALSE;
+}
+
+/* Restores the best match for a POSIX match (leftmost longest).
+ *
+ * Does nothing if no match has been saved. Otherwise copies the saved match
+ * position, text position, fuzzy counts and every group's span and captures
+ * back into the live state.
+ */
+Py_LOCAL_INLINE(void) restore_best_match(RE_SafeState* safe_state) {
+    RE_State* state;
+    size_t group_count;
+    size_t g;
+
+    state = safe_state->re_state;
+
+    if (!state->found_match)
+        return;
+
+    state->match_pos = state->best_match_pos;
+    state->text_pos = state->best_text_pos;
+
+    memmove(state->total_fuzzy_counts, state->best_fuzzy_counts,
+      sizeof(state->total_fuzzy_counts));
+
+    group_count = state->pattern->true_group_count;
+    if (group_count == 0)
+        return;
+
+    /* Copy the group spans and captures. */
+    for (g = 0; g < group_count; g++) {
+        RE_GroupData* group;
+        RE_GroupData* best;
+
+        group = &state->groups[g];
+        best = &state->best_match_groups[g];
+
+        group->span = best->span;
+        group->capture_count = best->capture_count;
+
+        /* Copy the captures for this group. */
+        memmove(group->captures, best->captures, best->capture_count *
+          sizeof(RE_GroupSpan));
+    }
+}
+
+/* Checks whether the new match is better than the current match for a POSIX
+ * match (leftmost longest) and saves it if it is.
+ */
+Py_LOCAL_INLINE(BOOL) check_posix_match(RE_SafeState* safe_state) {
+    RE_State* state;
+    Py_ssize_t best_length;
+    Py_ssize_t new_length;
+
+    state = safe_state->re_state;
+
+    if (!state->found_match)
+        return save_best_match(safe_state);
+
+    /* Check the overall match. */
+    if (state->reverse) {
+        /* We're searching backwards. */
+        best_length = state->match_pos - state->best_text_pos;
+        new_length = state->match_pos - state->text_pos;
+    } else {
+        /* We're searching forwards. */
+        best_length = state->best_text_pos - state->match_pos;
+        new_length = state->text_pos - state->match_pos;
+    }
+
+    if (new_length > best_length)
+        /* It's a longer match.
*/ + return save_best_match(safe_state); + + return TRUE; +} + +/* Performs a depth-first match or search from the context. */ +Py_LOCAL_INLINE(int) basic_match(RE_SafeState* safe_state, BOOL search) { + RE_State* state; + RE_EncodingTable* encoding; + RE_LocaleInfo* locale_info; + PatternObject* pattern; + RE_Node* start_node; + RE_NextNode start_pair; + Py_UCS4 (*char_at)(void* text, Py_ssize_t pos); + Py_ssize_t pattern_step; /* The overall step of the pattern (forwards or backwards). */ + Py_ssize_t string_pos; + BOOL do_search_start; + Py_ssize_t found_pos; + int status; + RE_Node* node; + int folded_pos; + int gfolded_pos; + TRACE(("<<basic_match>>\n")) + + state = safe_state->re_state; + encoding = state->encoding; + locale_info = state->locale_info; + pattern = state->pattern; + start_node = pattern->start_node; + + /* Look beyond any initial group node. */ + start_pair.node = start_node; + start_pair.test = pattern->start_test; + + /* Is the pattern anchored to the start or end of the string? */ + switch (start_pair.test->op) { + case RE_OP_END_OF_STRING: + if (state->reverse) { + /* Searching backwards. */ + if (state->text_pos != state->text_length) + return RE_ERROR_FAILURE; + + /* Don't bother to search further because it's anchored. */ + search = FALSE; + } + break; + case RE_OP_START_OF_STRING: + if (!state->reverse) { + /* Searching forwards. */ + if (state->text_pos != 0) + return RE_ERROR_FAILURE; + + /* Don't bother to search further because it's anchored. */ + search = FALSE; + } + break; + } + + char_at = state->char_at; + pattern_step = state->reverse ? -1 : 1; + string_pos = -1; + do_search_start = pattern->do_search_start; + state->fewest_errors = state->max_errors; + + if (do_search_start && pattern->req_string && + equivalent_nodes(start_pair.test, pattern->req_string)) + do_search_start = FALSE; + + /* Add a backtrack entry for failure. 
*/ + if (!add_backtrack(safe_state, RE_OP_FAILURE)) + return RE_ERROR_BACKTRACKING; + +start_match: + /* If we're searching, advance along the string until there could be a + * match. + */ + if (pattern->pattern_call_ref >= 0) { + RE_GuardList* guard_list; + + guard_list = &state->group_call_guard_list[pattern->pattern_call_ref]; + guard_list->count = 0; + guard_list->last_text_pos = -1; + } + + /* Locate the required string, if there's one, unless this is a recursive + * call of 'basic_match'. + */ + if (!pattern->req_string) + found_pos = state->text_pos; + else { + found_pos = locate_required_string(safe_state, search); + if (found_pos < 0) + return RE_ERROR_FAILURE; + } + + if (search) { + state->text_pos = found_pos; + + if (do_search_start) { + RE_Position new_position; + +next_match_1: + /* 'search_start' will clear 'do_search_start' if it can't perform + * a fast search for the next possible match. This enables us to + * avoid the overhead of the call subsequently. + */ + status = search_start(safe_state, &start_pair, &new_position, 0); + if (status == RE_ERROR_PARTIAL) { + state->match_pos = state->text_pos; + return status; + } else if (status != RE_ERROR_SUCCESS) + return status; + + node = new_position.node; + state->text_pos = new_position.text_pos; + + if (node->op == RE_OP_SUCCESS) { + /* Must the match advance past its start? */ + if (state->text_pos != state->search_anchor || + !state->must_advance) + return RE_ERROR_SUCCESS; + + state->text_pos = state->match_pos + pattern_step; + goto next_match_1; + } + + /* 'do_search_start' may have been cleared. */ + do_search_start = pattern->do_search_start; + } else { + /* Avoiding 'search_start', which we've found can't perform a fast + * search for the next possible match. 
+ */ + node = start_node; + +next_match_2: + if (state->reverse) { + if (state->text_pos < state->slice_start) { + if (state->partial_side == RE_PARTIAL_LEFT) + return RE_ERROR_PARTIAL; + + return RE_ERROR_FAILURE; + } + } else { + if (state->text_pos > state->slice_end) { + if (state-> partial_side == RE_PARTIAL_RIGHT) + return RE_ERROR_PARTIAL; + + return RE_ERROR_FAILURE; + } + } + + state->match_pos = state->text_pos; + + if (node->op == RE_OP_SUCCESS) { + /* Must the match advance past its start? */ + if (state->text_pos != state->search_anchor || + !state->must_advance) { + BOOL success; + + if (state->match_all) { + /* We want to match all of the slice. */ + if (state->reverse) + success = state->text_pos == state->slice_start; + else + success = state->text_pos == state->slice_end; + } else + success = TRUE; + + if (success) + return RE_ERROR_SUCCESS; + } + + state->text_pos = state->match_pos + pattern_step; + goto next_match_2; + } + } + } else { + /* The start position is anchored to the current position. */ + if (found_pos != state->text_pos) + return RE_ERROR_FAILURE; + + node = start_node; + } + +advance: + /* The main matching loop. */ + for (;;) { + TRACE(("%d|", state->text_pos)) + + /* Should we abort the matching? */ + ++state->iterations; + + if (state->iterations == 0 && safe_check_signals(safe_state)) + return RE_ERROR_INTERRUPTED; + + switch (node->op) { + case RE_OP_ANY: /* Any character except a newline. */ + TRACE(("%s\n", re_op_text[node->op])) + + status = try_match_ANY(state, node, state->text_pos); + if (status < 0) + return status; + + if (status == RE_ERROR_SUCCESS) { + ++state->text_pos; + node = node->next_1.node; + } else if (node->status & RE_STATUS_FUZZY) { + status = fuzzy_match_item(safe_state, search, &state->text_pos, + &node, 1); + if (status < 0) + return status; + + if (!node) + goto backtrack; + } else + goto backtrack; + break; + case RE_OP_ANY_ALL: /* Any character at all. 
*/ + TRACE(("%s\n", re_op_text[node->op])) + + status = try_match_ANY_ALL(state, node, state->text_pos); + if (status < 0) + return status; + + if (status == RE_ERROR_SUCCESS) { + ++state->text_pos; + node = node->next_1.node; + } else if (node->status & RE_STATUS_FUZZY) { + status = fuzzy_match_item(safe_state, search, &state->text_pos, + &node, 1); + if (status < 0) + return status; + + if (!node) + goto backtrack; + } else + goto backtrack; + break; + case RE_OP_ANY_ALL_REV: /* Any character at all, backwards. */ + TRACE(("%s\n", re_op_text[node->op])) + + status = try_match_ANY_ALL_REV(state, node, state->text_pos); + if (status < 0) + return status; + + if (status == RE_ERROR_SUCCESS) { + --state->text_pos; + node = node->next_1.node; + } else if (node->status & RE_STATUS_FUZZY) { + status = fuzzy_match_item(safe_state, search, &state->text_pos, + &node, -1); + if (status < 0) + return status; + + if (!node) + goto backtrack; + } else + goto backtrack; + break; + case RE_OP_ANY_REV: /* Any character except a newline, backwards. */ + TRACE(("%s\n", re_op_text[node->op])) + + status = try_match_ANY_REV(state, node, state->text_pos); + if (status < 0) + return status; + + if (status == RE_ERROR_SUCCESS) { + --state->text_pos; + node = node->next_1.node; + } else if (node->status & RE_STATUS_FUZZY) { + status = fuzzy_match_item(safe_state, search, &state->text_pos, + &node, -1); + if (status < 0) + return status; + + if (!node) + goto backtrack; + } else + goto backtrack; + break; + case RE_OP_ANY_U: /* Any character except a line separator. 
*/ + TRACE(("%s\n", re_op_text[node->op])) + + status = try_match_ANY_U(state, node, state->text_pos); + if (status < 0) + return status; + + if (status == RE_ERROR_SUCCESS) { + ++state->text_pos; + node = node->next_1.node; + } else if (node->status & RE_STATUS_FUZZY) { + status = fuzzy_match_item(safe_state, search, &state->text_pos, + &node, 1); + if (status < 0) + return status; + + if (!node) + goto backtrack; + } else + goto backtrack; + break; + case RE_OP_ANY_U_REV: /* Any character except a line separator, backwards. */ + TRACE(("%s\n", re_op_text[node->op])) + + status = try_match_ANY_U_REV(state, node, state->text_pos); + if (status < 0) + return status; + + if (status == RE_ERROR_SUCCESS) { + --state->text_pos; + node = node->next_1.node; + } else if (node->status & RE_STATUS_FUZZY) { + status = fuzzy_match_item(safe_state, search, &state->text_pos, + &node, -1); + if (status < 0) + return status; + + if (!node) + goto backtrack; + } else + goto backtrack; + break; + case RE_OP_ATOMIC: /* Start of an atomic group. */ + { + RE_AtomicData* atomic; + TRACE(("%s\n", re_op_text[node->op])) + + if (!add_backtrack(safe_state, RE_OP_ATOMIC)) + return RE_ERROR_BACKTRACKING; + state->backtrack->atomic.too_few_errors = state->too_few_errors; + state->backtrack->atomic.capture_change = state->capture_change; + + atomic = push_atomic(safe_state); + if (!atomic) + return RE_ERROR_MEMORY; + atomic->backtrack_count = state->current_backtrack_block->count; + atomic->current_backtrack_block = state->current_backtrack_block; + atomic->is_lookaround = FALSE; + atomic->has_groups = (node->status & RE_STATUS_HAS_GROUPS) != 0; + atomic->has_repeats = (node->status & RE_STATUS_HAS_REPEATS) != 0; + + /* Save the groups and repeats. 
*/ + if (atomic->has_groups && !push_groups(safe_state)) + return RE_ERROR_MEMORY; + + if (atomic->has_repeats && !push_repeats(safe_state)) + return RE_ERROR_MEMORY; + + node = node->next_1.node; + break; + } + case RE_OP_BOUNDARY: /* On a word boundary. */ + TRACE(("%s %d\n", re_op_text[node->op], node->match)) + + status = try_match_BOUNDARY(state, node, state->text_pos); + if (status < 0) + return status; + + if (status == RE_ERROR_SUCCESS) + node = node->next_1.node; + else if (node->status & RE_STATUS_FUZZY) { + status = fuzzy_match_item(safe_state, search, &state->text_pos, + &node, 0); + if (status < 0) + return status; + + if (!node) + goto backtrack; + } else + goto backtrack; + break; + case RE_OP_BRANCH: /* 2-way branch. */ + { + RE_Position next_position; + TRACE(("%s\n", re_op_text[node->op])) + + status = try_match(state, &node->next_1, state->text_pos, + &next_position); + if (status < 0) + return status; + + if (status == RE_ERROR_SUCCESS) { + if (!add_backtrack(safe_state, RE_OP_BRANCH)) + return RE_ERROR_BACKTRACKING; + state->backtrack->branch.position.node = + node->nonstring.next_2.node; + state->backtrack->branch.position.text_pos = state->text_pos; + + node = next_position.node; + state->text_pos = next_position.text_pos; + } else + node = node->nonstring.next_2.node; + break; + } + case RE_OP_CALL_REF: /* A group call reference. */ + { + TRACE(("%s %d\n", re_op_text[node->op], node->values[0])) + + if (!push_group_return(safe_state, NULL)) + return RE_ERROR_MEMORY; + + if (!add_backtrack(safe_state, RE_OP_CALL_REF)) + return RE_ERROR_BACKTRACKING; + + node = node->next_1.node; + break; + } + case RE_OP_CHARACTER: /* A character. 
*/ + TRACE(("%s %d %d\n", re_op_text[node->op], node->match, + node->values[0])) + + if (state->text_pos >= state->text_length && state->partial_side == + RE_PARTIAL_RIGHT) + return RE_ERROR_PARTIAL; + + if (state->text_pos < state->slice_end && + matches_CHARACTER(encoding, locale_info, node, + char_at(state->text, state->text_pos)) == node->match) { + state->text_pos += node->step; + node = node->next_1.node; + } else if (node->status & RE_STATUS_FUZZY) { + status = fuzzy_match_item(safe_state, search, &state->text_pos, + &node, 1); + if (status < 0) + return RE_ERROR_PARTIAL; + + if (!node) + goto backtrack; + } else + goto backtrack; + break; + case RE_OP_CHARACTER_IGN: /* A character, ignoring case. */ + TRACE(("%s %d %d\n", re_op_text[node->op], node->match, + node->values[0])) + + if (state->text_pos >= state->text_length && state->partial_side == + RE_PARTIAL_RIGHT) + return RE_ERROR_PARTIAL; + + if (state->text_pos < state->slice_end && + matches_CHARACTER_IGN(encoding, locale_info, node, + char_at(state->text, state->text_pos)) == node->match) { + state->text_pos += node->step; + node = node->next_1.node; + } else if (node->status & RE_STATUS_FUZZY) { + status = fuzzy_match_item(safe_state, search, &state->text_pos, + &node, 1); + if (status < 0) + return RE_ERROR_PARTIAL; + + if (!node) + goto backtrack; + } else + goto backtrack; + break; + case RE_OP_CHARACTER_IGN_REV: /* A character, backwards, ignoring case. 
*/ + TRACE(("%s %d %d\n", re_op_text[node->op], node->match, + node->values[0])) + + if (state->text_pos <= 0 && state->partial_side == RE_PARTIAL_LEFT) + return RE_ERROR_PARTIAL; + + if (state->text_pos > state->slice_start && + matches_CHARACTER_IGN(encoding, locale_info, node, + char_at(state->text, state->text_pos - 1)) == node->match) { + state->text_pos += node->step; + node = node->next_1.node; + } else if (node->status & RE_STATUS_FUZZY) { + status = fuzzy_match_item(safe_state, search, &state->text_pos, + &node, -1); + if (status < 0) + return RE_ERROR_PARTIAL; + + if (!node) + goto backtrack; + } else + goto backtrack; + break; + case RE_OP_CHARACTER_REV: /* A character, backwards. */ + TRACE(("%s %d %d\n", re_op_text[node->op], node->match, + node->values[0])) + + if (state->text_pos <= 0 && state->partial_side == RE_PARTIAL_LEFT) + return RE_ERROR_PARTIAL; + + if (state->text_pos > state->slice_start && + matches_CHARACTER(encoding, locale_info, node, + char_at(state->text, state->text_pos - 1)) == node->match) { + state->text_pos += node->step; + node = node->next_1.node; + } else if (node->status & RE_STATUS_FUZZY) { + status = fuzzy_match_item(safe_state, search, &state->text_pos, + &node, -1); + if (status < 0) + return RE_ERROR_PARTIAL; + + if (!node) + goto backtrack; + } else + goto backtrack; + break; + case RE_OP_CONDITIONAL: /* Start of a conditional subpattern. 
*/ + { + RE_AtomicData* conditional; + TRACE(("%s %d\n", re_op_text[node->op], node->match)) + + if (!add_backtrack(safe_state, RE_OP_CONDITIONAL)) + return RE_ERROR_BACKTRACKING; + state->backtrack->lookaround.too_few_errors = + state->too_few_errors; + state->backtrack->lookaround.capture_change = + state->capture_change; + state->backtrack->lookaround.inside = TRUE; + state->backtrack->lookaround.node = node; + + conditional = push_atomic(safe_state); + if (!conditional) + return RE_ERROR_MEMORY; + conditional->backtrack_count = + state->current_backtrack_block->count; + conditional->current_backtrack_block = + state->current_backtrack_block; + conditional->slice_start = state->slice_start; + conditional->slice_end = state->slice_end; + conditional->text_pos = state->text_pos; + conditional->node = node; + conditional->backtrack = state->backtrack; + conditional->is_lookaround = TRUE; + conditional->has_groups = (node->status & RE_STATUS_HAS_GROUPS) != + 0; + conditional->has_repeats = (node->status & RE_STATUS_HAS_REPEATS) + != 0; + + /* Save the groups and repeats. */ + if (conditional->has_groups && !push_groups(safe_state)) + return RE_ERROR_MEMORY; + + if (conditional->has_repeats && !push_repeats(safe_state)) + return RE_ERROR_MEMORY; + + conditional->saved_groups = state->current_saved_groups; + conditional->saved_repeats = state->current_saved_repeats; + + state->slice_start = 0; + state->slice_end = state->text_length; + + node = node->next_1.node; + break; + } + case RE_OP_DEFAULT_BOUNDARY: /* On a default word boundary. 
*/ + TRACE(("%s %d\n", re_op_text[node->op], node->match)) + + status = try_match_DEFAULT_BOUNDARY(state, node, state->text_pos); + if (status < 0) + return status; + + if (status == RE_ERROR_SUCCESS) + node = node->next_1.node; + else if (node->status & RE_STATUS_FUZZY) { + status = fuzzy_match_item(safe_state, search, &state->text_pos, + &node, 0); + if (status < 0) + return status; + + if (!node) + goto backtrack; + } else + goto backtrack; + break; + case RE_OP_DEFAULT_END_OF_WORD: /* At the default end of a word. */ + TRACE(("%s\n", re_op_text[node->op])) + + status = try_match_DEFAULT_END_OF_WORD(state, node, + state->text_pos); + if (status < 0) + return status; + + if (status == RE_ERROR_SUCCESS) + node = node->next_1.node; + else if (node->status & RE_STATUS_FUZZY) { + status = fuzzy_match_item(safe_state, search, &state->text_pos, + &node, 0); + if (status < 0) + return status; + + if (!node) + goto backtrack; + } else + goto backtrack; + break; + case RE_OP_DEFAULT_START_OF_WORD: /* At the default start of a word. */ + TRACE(("%s\n", re_op_text[node->op])) + + status = try_match_DEFAULT_START_OF_WORD(state, node, + state->text_pos); + if (status < 0) + return status; + + if (status == RE_ERROR_SUCCESS) + node = node->next_1.node; + else if (node->status & RE_STATUS_FUZZY) { + status = fuzzy_match_item(safe_state, search, &state->text_pos, + &node, 0); + if (status < 0) + return status; + + if (!node) + goto backtrack; + } else + goto backtrack; + break; + case RE_OP_END_ATOMIC: /* End of an atomic group. */ + { + RE_AtomicData* atomic; + + /* Discard any backtracking info from inside the atomic group. */ + atomic = top_atomic(safe_state); + state->current_backtrack_block = atomic->current_backtrack_block; + state->current_backtrack_block->count = atomic->backtrack_count; + + node = node->next_1.node; + break; + } + case RE_OP_END_CONDITIONAL: /* End of a conditional subpattern. 
*/ + { + RE_AtomicData* conditional; + + conditional = pop_atomic(safe_state); + while (!conditional->is_lookaround) { + if (conditional->has_repeats) + drop_repeats(state); + + if (conditional->has_groups) + drop_groups(state); + + conditional = pop_atomic(safe_state); + } + state->text_pos = conditional->text_pos; + state->slice_end = conditional->slice_end; + state->slice_start = conditional->slice_start; + + /* Discard any backtracking info from inside the lookaround. */ + state->current_backtrack_block = + conditional->current_backtrack_block; + state->current_backtrack_block->count = + conditional->backtrack_count; + state->current_saved_groups = conditional->saved_groups; + state->current_saved_repeats = conditional->saved_repeats; + + /* It's a positive lookaround that's succeeded. We're now going to + * leave the lookaround. + */ + conditional->backtrack->lookaround.inside = FALSE; + + if (conditional->node->match) { + /* It's a positive lookaround that's succeeded. + * + * Go to the 'true' branch. + */ + node = node->next_1.node; + } else { + /* It's a negative lookaround that's succeeded. + * + * Go to the 'false' branch. + */ + node = node->nonstring.next_2.node; + } + break; + } + case RE_OP_END_FUZZY: /* End of fuzzy matching. */ + TRACE(("%s\n", re_op_text[node->op])) + + if (!fuzzy_insert(safe_state, state->text_pos, node)) + return RE_ERROR_BACKTRACKING; + + /* If there were too few errors, in the fuzzy section, try again. + */ + if (state->too_few_errors) { + state->too_few_errors = FALSE; + goto backtrack; + } + + state->total_fuzzy_counts[RE_FUZZY_SUB] += + state->fuzzy_info.counts[RE_FUZZY_SUB]; + state->total_fuzzy_counts[RE_FUZZY_INS] += + state->fuzzy_info.counts[RE_FUZZY_INS]; + state->total_fuzzy_counts[RE_FUZZY_DEL] += + state->fuzzy_info.counts[RE_FUZZY_DEL]; + + node = node->next_1.node; + break; + case RE_OP_END_GREEDY_REPEAT: /* End of a greedy repeat. 
*/ + { + RE_CODE index; + RE_RepeatData* rp_data; + BOOL changed; + BOOL try_body; + int body_status; + RE_Position next_body_position; + BOOL try_tail; + int tail_status; + RE_Position next_tail_position; + RE_BacktrackData* bt_data; + TRACE(("%s %d\n", re_op_text[node->op], node->values[0])) + + /* Repeat indexes are 0-based. */ + index = node->values[0]; + rp_data = &state->repeats[index]; + + /* The body has matched successfully at this position. */ + if (!guard_repeat(safe_state, index, rp_data->start, + RE_STATUS_BODY, FALSE)) + return RE_ERROR_MEMORY; + + ++rp_data->count; + + /* Have we advanced through the text or has a capture group change? + */ + changed = rp_data->capture_change != state->capture_change || + state->text_pos != rp_data->start; + + /* The counts are of type size_t, so the format needs to specify + * that. + */ + TRACE(("min is %" PY_FORMAT_SIZE_T "u, max is %" PY_FORMAT_SIZE_T + "u, count is %" PY_FORMAT_SIZE_T "u\n", node->values[1], + node->values[2], rp_data->count)) + + /* Could the body or tail match? */ + try_body = changed && (rp_data->count < node->values[2] || + ~node->values[2] == 0) && !is_repeat_guarded(safe_state, index, + state->text_pos, RE_STATUS_BODY); + if (try_body) { + body_status = try_match(state, &node->next_1, state->text_pos, + &next_body_position); + if (body_status < 0) + return body_status; + + if (body_status == RE_ERROR_FAILURE) + try_body = FALSE; + } else + body_status = RE_ERROR_FAILURE; + + try_tail = (!changed || rp_data->count >= node->values[1]) && + !is_repeat_guarded(safe_state, index, state->text_pos, + RE_STATUS_TAIL); + if (try_tail) { + tail_status = try_match(state, &node->nonstring.next_2, + state->text_pos, &next_tail_position); + if (tail_status < 0) + return tail_status; + + if (tail_status == RE_ERROR_FAILURE) + try_tail = FALSE; + } else + tail_status = RE_ERROR_FAILURE; + + if (!try_body && !try_tail) { + /* Neither the body nor the tail could match. 
*/ + --rp_data->count; + goto backtrack; + } + + if (body_status < 0 || (body_status == 0 && tail_status < 0)) + return RE_ERROR_PARTIAL; + + /* Record info in case we backtrack into the body. */ + if (!add_backtrack(safe_state, RE_OP_BODY_END)) + return RE_ERROR_BACKTRACKING; + bt_data = state->backtrack; + bt_data->repeat.index = index; + bt_data->repeat.count = rp_data->count - 1; + bt_data->repeat.start = rp_data->start; + bt_data->repeat.capture_change = rp_data->capture_change; + + if (try_body) { + /* Both the body and the tail could match. */ + if (try_tail) { + /* The body takes precedence. If the body fails to match + * then we want to try the tail before backtracking + * further. + */ + + /* Record backtracking info for matching the tail. */ + if (!add_backtrack(safe_state, RE_OP_MATCH_TAIL)) + return RE_ERROR_BACKTRACKING; + bt_data = state->backtrack; + bt_data->repeat.position = next_tail_position; + bt_data->repeat.index = index; + bt_data->repeat.count = rp_data->count; + bt_data->repeat.start = rp_data->start; + bt_data->repeat.capture_change = rp_data->capture_change; + bt_data->repeat.text_pos = state->text_pos; + } + + /* Record backtracking info in case the body fails to match. */ + if (!add_backtrack(safe_state, RE_OP_BODY_START)) + return RE_ERROR_BACKTRACKING; + bt_data = state->backtrack; + bt_data->repeat.index = index; + bt_data->repeat.text_pos = state->text_pos; + + rp_data->capture_change = state->capture_change; + rp_data->start = state->text_pos; + + /* Advance into the body. */ + node = next_body_position.node; + state->text_pos = next_body_position.text_pos; + } else { + /* Only the tail could match. */ + + /* Advance into the tail. */ + node = next_tail_position.node; + state->text_pos = next_tail_position.text_pos; + } + break; + } + case RE_OP_END_GROUP: /* End of a capture group. 
*/ + { + RE_CODE private_index; + RE_CODE public_index; + RE_GroupData* group; + RE_BacktrackData* bt_data; + TRACE(("%s %d\n", re_op_text[node->op], node->values[1])) + + /* Capture group indexes are 1-based (excluding group 0, which is + * the entire matched string). + */ + private_index = node->values[0]; + public_index = node->values[1]; + group = &state->groups[private_index - 1]; + + if (!add_backtrack(safe_state, RE_OP_END_GROUP)) + return RE_ERROR_BACKTRACKING; + bt_data = state->backtrack; + bt_data->group.private_index = private_index; + bt_data->group.public_index = public_index; + bt_data->group.text_pos = group->span.end; + bt_data->group.capture = (BOOL)node->values[2]; + bt_data->group.current_capture = group->current_capture; + + if (pattern->group_info[private_index - 1].referenced && + group->span.end != state->text_pos) + ++state->capture_change; + group->span.end = state->text_pos; + + /* Save the capture? */ + if (node->values[2]) { + group->current_capture = (Py_ssize_t)group->capture_count; + if (!save_capture(safe_state, private_index, public_index)) + return RE_ERROR_MEMORY; + } + + node = node->next_1.node; + break; + } + case RE_OP_END_LAZY_REPEAT: /* End of a lazy repeat. */ + { + RE_CODE index; + RE_RepeatData* rp_data; + BOOL changed; + BOOL try_body; + int body_status; + RE_Position next_body_position; + BOOL try_tail; + int tail_status; + RE_Position next_tail_position; + RE_BacktrackData* bt_data; + TRACE(("%s %d\n", re_op_text[node->op], node->values[0])) + + /* Repeat indexes are 0-based. */ + index = node->values[0]; + rp_data = &state->repeats[index]; + + /* The body has matched successfully at this position. */ + if (!guard_repeat(safe_state, index, rp_data->start, + RE_STATUS_BODY, FALSE)) + return RE_ERROR_MEMORY; + + ++rp_data->count; + + /* Have we advanced through the text or has a capture group change? 
+ */ + changed = rp_data->capture_change != state->capture_change || + state->text_pos != rp_data->start; + + /* The counts are of type size_t, so the format needs to specify + * that. + */ + TRACE(("min is %" PY_FORMAT_SIZE_T "u, max is %" PY_FORMAT_SIZE_T + "u, count is %" PY_FORMAT_SIZE_T "u\n", node->values[1], + node->values[2], rp_data->count)) + + /* Could the body or tail match? */ + try_body = changed && (rp_data->count < node->values[2] || + ~node->values[2] == 0) && !is_repeat_guarded(safe_state, index, + state->text_pos, RE_STATUS_BODY); + if (try_body) { + body_status = try_match(state, &node->next_1, state->text_pos, + &next_body_position); + if (body_status < 0) + return body_status; + + if (body_status == RE_ERROR_FAILURE) + try_body = FALSE; + } else + body_status = RE_ERROR_FAILURE; + + try_tail = (!changed || rp_data->count >= node->values[1]); + if (try_tail) { + tail_status = try_match(state, &node->nonstring.next_2, + state->text_pos, &next_tail_position); + if (tail_status < 0) + return tail_status; + + if (tail_status == RE_ERROR_FAILURE) + try_tail = FALSE; + } else + tail_status = RE_ERROR_FAILURE; + + if (!try_body && !try_tail) { + /* Neither the body nor the tail could match. */ + --rp_data->count; + goto backtrack; + } + + if (body_status < 0 || (body_status == 0 && tail_status < 0)) + return RE_ERROR_PARTIAL; + + /* Record info in case we backtrack into the body. */ + if (!add_backtrack(safe_state, RE_OP_BODY_END)) + return RE_ERROR_BACKTRACKING; + bt_data = state->backtrack; + bt_data->repeat.index = index; + bt_data->repeat.count = rp_data->count - 1; + bt_data->repeat.start = rp_data->start; + bt_data->repeat.capture_change = rp_data->capture_change; + + if (try_body) { + /* Both the body and the tail could match. */ + if (try_tail) { + /* The tail takes precedence. If the tail fails to match + * then we want to try the body before backtracking + * further. + */ + + /* Record backtracking info for matching the body. 
*/ + if (!add_backtrack(safe_state, RE_OP_MATCH_BODY)) + return RE_ERROR_BACKTRACKING; + bt_data = state->backtrack; + bt_data->repeat.position = next_body_position; + bt_data->repeat.index = index; + bt_data->repeat.count = rp_data->count; + bt_data->repeat.start = rp_data->start; + bt_data->repeat.capture_change = rp_data->capture_change; + bt_data->repeat.text_pos = state->text_pos; + + /* Advance into the tail. */ + node = next_tail_position.node; + state->text_pos = next_tail_position.text_pos; + } else { + /* Only the body could match. */ + + /* Record backtracking info in case the body fails to + * match. + */ + if (!add_backtrack(safe_state, RE_OP_BODY_START)) + return RE_ERROR_BACKTRACKING; + bt_data = state->backtrack; + bt_data->repeat.index = index; + bt_data->repeat.text_pos = state->text_pos; + + rp_data->capture_change = state->capture_change; + rp_data->start = state->text_pos; + + /* Advance into the body. */ + node = next_body_position.node; + state->text_pos = next_body_position.text_pos; + } + } else { + /* Only the tail could match. */ + + /* Advance into the tail. */ + node = next_tail_position.node; + state->text_pos = next_tail_position.text_pos; + } + break; + } + case RE_OP_END_LOOKAROUND: /* End of a lookaround subpattern. */ + { + RE_AtomicData* lookaround; + + lookaround = pop_atomic(safe_state); + while (!lookaround->is_lookaround) { + if (lookaround->has_repeats) + drop_repeats(state); + + if (lookaround->has_groups) + drop_groups(state); + + lookaround = pop_atomic(safe_state); + } + state->text_pos = lookaround->text_pos; + state->slice_end = lookaround->slice_end; + state->slice_start = lookaround->slice_start; + + /* Discard any backtracking info from inside the lookaround. 
*/ + state->current_backtrack_block = + lookaround->current_backtrack_block; + state->current_backtrack_block->count = + lookaround->backtrack_count; + state->current_saved_groups = lookaround->saved_groups; + state->current_saved_repeats = lookaround->saved_repeats; + + if (lookaround->node->match) { + /* It's a positive lookaround that's succeeded. We're now going + * to leave the lookaround. + */ + lookaround->backtrack->lookaround.inside = FALSE; + + node = node->next_1.node; + } else { + /* It's a negative lookaround that's succeeded. The groups and + * certain flags may have changed. We need to restore them and + * then backtrack. + */ + if (lookaround->has_repeats) + pop_repeats(state); + + if (lookaround->has_groups) + pop_groups(state); + + state->too_few_errors = + lookaround->backtrack->lookaround.too_few_errors; + state->capture_change = + lookaround->backtrack->lookaround.capture_change; + + discard_backtrack(state); + goto backtrack; + } + break; + } + case RE_OP_END_OF_LINE: /* At the end of a line. */ + TRACE(("%s\n", re_op_text[node->op])) + + status = try_match_END_OF_LINE(state, node, state->text_pos); + if (status < 0) + return status; + + if (status == RE_ERROR_SUCCESS) + node = node->next_1.node; + else if (node->status & RE_STATUS_FUZZY) { + status = fuzzy_match_item(safe_state, search, &state->text_pos, + &node, 0); + if (status < 0) + return status; + + if (!node) + goto backtrack; + } else + goto backtrack; + break; + case RE_OP_END_OF_LINE_U: /* At the end of a line. 
*/ + TRACE(("%s\n", re_op_text[node->op])) + + status = try_match_END_OF_LINE_U(state, node, state->text_pos); + if (status < 0) + return status; + + if (status == RE_ERROR_SUCCESS) + node = node->next_1.node; + else if (node->status & RE_STATUS_FUZZY) { + status = fuzzy_match_item(safe_state, search, &state->text_pos, + &node, 0); + if (status < 0) + return status; + + if (!node) + goto backtrack; + } else + goto backtrack; + break; + case RE_OP_END_OF_STRING: /* At the end of the string. */ + TRACE(("%s\n", re_op_text[node->op])) + + status = try_match_END_OF_STRING(state, node, state->text_pos); + if (status < 0) + return status; + + if (status == RE_ERROR_SUCCESS) + node = node->next_1.node; + else if (node->status & RE_STATUS_FUZZY) { + status = fuzzy_match_item(safe_state, search, &state->text_pos, + &node, 0); + if (status < 0) + return status; + + if (!node) + goto backtrack; + } else + goto backtrack; + break; + case RE_OP_END_OF_STRING_LINE: /* At end of string or final newline. */ + TRACE(("%s\n", re_op_text[node->op])) + + status = try_match_END_OF_STRING_LINE(state, node, + state->text_pos); + if (status < 0) + return status; + + if (status == RE_ERROR_SUCCESS) + node = node->next_1.node; + else if (node->status & RE_STATUS_FUZZY) { + status = fuzzy_match_item(safe_state, search, &state->text_pos, + &node, 0); + if (status < 0) + return status; + + if (!node) + goto backtrack; + } else + goto backtrack; + break; + case RE_OP_END_OF_STRING_LINE_U: /* At end of string or final newline. 
*/ + TRACE(("%s\n", re_op_text[node->op])) + + status = try_match_END_OF_STRING_LINE_U(state, node, + state->text_pos); + if (status < 0) + return status; + + if (status == RE_ERROR_SUCCESS) + node = node->next_1.node; + else if (node->status & RE_STATUS_FUZZY) { + status = fuzzy_match_item(safe_state, search, &state->text_pos, + &node, 0); + if (status < 0) + return status; + + if (!node) + goto backtrack; + } else + goto backtrack; + break; + case RE_OP_END_OF_WORD: /* At the end of a word. */ + TRACE(("%s\n", re_op_text[node->op])) + + status = try_match_END_OF_WORD(state, node, state->text_pos); + if (status < 0) + return status; + + if (status == RE_ERROR_SUCCESS) + node = node->next_1.node; + else if (node->status & RE_STATUS_FUZZY) { + status = fuzzy_match_item(safe_state, search, &state->text_pos, + &node, 0); + if (status < 0) + return status; + + if (!node) + goto backtrack; + } else + goto backtrack; + break; + case RE_OP_FAILURE: /* Failure. */ + goto backtrack; + case RE_OP_FUZZY: /* Fuzzy matching. */ + { + RE_FuzzyInfo* fuzzy_info; + RE_BacktrackData* bt_data; + TRACE(("%s\n", re_op_text[node->op])) + + fuzzy_info = &state->fuzzy_info; + + /* Save the current fuzzy info. */ + if (!add_backtrack(safe_state, RE_OP_FUZZY)) + return RE_ERROR_BACKTRACKING; + bt_data = state->backtrack; + memmove(&bt_data->fuzzy.fuzzy_info, fuzzy_info, + sizeof(RE_FuzzyInfo)); + bt_data->fuzzy.index = node->values[0]; + bt_data->fuzzy.text_pos = state->text_pos; + + /* Initialise the new fuzzy info. */ + memset(fuzzy_info->counts, 0, 4 * sizeof(fuzzy_info->counts[0])); + fuzzy_info->total_cost = 0; + fuzzy_info->node = node; + + node = node->next_1.node; + break; + } + case RE_OP_GRAPHEME_BOUNDARY: /* On a grapheme boundary. 
*/ + TRACE(("%s\n", re_op_text[node->op])) + + status = try_match_GRAPHEME_BOUNDARY(state, node, state->text_pos); + if (status < 0) + return status; + + if (status == RE_ERROR_SUCCESS) + node = node->next_1.node; + else if (node->status & RE_STATUS_FUZZY) { + status = fuzzy_match_item(safe_state, search, &state->text_pos, + &node, 0); + if (status < 0) + return status; + + if (!node) + goto backtrack; + } else + goto backtrack; + break; + case RE_OP_GREEDY_REPEAT: /* Greedy repeat. */ + { + RE_CODE index; + RE_RepeatData* rp_data; + RE_BacktrackData* bt_data; + BOOL try_body; + int body_status; + RE_Position next_body_position; + BOOL try_tail; + int tail_status; + RE_Position next_tail_position; + TRACE(("%s %d\n", re_op_text[node->op], node->values[0])) + + /* Repeat indexes are 0-based. */ + index = node->values[0]; + rp_data = &state->repeats[index]; + + /* We might need to backtrack into the head, so save the current + * repeat. + */ + if (!add_backtrack(safe_state, RE_OP_GREEDY_REPEAT)) + return RE_ERROR_BACKTRACKING; + bt_data = state->backtrack; + bt_data->repeat.index = index; + bt_data->repeat.count = rp_data->count; + bt_data->repeat.start = rp_data->start; + bt_data->repeat.capture_change = rp_data->capture_change; + bt_data->repeat.text_pos = state->text_pos; + + /* Initialise the new repeat. */ + rp_data->count = 0; + rp_data->start = state->text_pos; + rp_data->capture_change = state->capture_change; + + /* Could the body or tail match? 
*/ + try_body = node->values[2] > 0 && !is_repeat_guarded(safe_state, + index, state->text_pos, RE_STATUS_BODY); + if (try_body) { + body_status = try_match(state, &node->next_1, state->text_pos, + &next_body_position); + if (body_status < 0) + return body_status; + + if (body_status == RE_ERROR_FAILURE) + try_body = FALSE; + } else + body_status = RE_ERROR_FAILURE; + + try_tail = node->values[1] == 0; + if (try_tail) { + tail_status = try_match(state, &node->nonstring.next_2, + state->text_pos, &next_tail_position); + if (tail_status < 0) + return tail_status; + + if (tail_status == RE_ERROR_FAILURE) + try_tail = FALSE; + } else + tail_status = RE_ERROR_FAILURE; + if (!try_body && !try_tail) + /* Neither the body nor the tail could match. */ + goto backtrack; + + if (body_status < 0 || (body_status == 0 && tail_status < 0)) + return RE_ERROR_PARTIAL; + + if (try_body) { + if (try_tail) { + /* Both the body and the tail could match, but the body + * takes precedence. If the body fails to match then we + * want to try the tail before backtracking further. + */ + + /* Record backtracking info for matching the tail. */ + if (!add_backtrack(safe_state, RE_OP_MATCH_TAIL)) + return RE_ERROR_BACKTRACKING; + bt_data = state->backtrack; + bt_data->repeat.position = next_tail_position; + bt_data->repeat.index = index; + bt_data->repeat.count = rp_data->count; + bt_data->repeat.start = rp_data->start; + bt_data->repeat.capture_change = rp_data->capture_change; + bt_data->repeat.text_pos = state->text_pos; + } + + /* Advance into the body. */ + node = next_body_position.node; + state->text_pos = next_body_position.text_pos; + } else { + /* Only the tail could match. */ + + /* Advance into the tail. */ + node = next_tail_position.node; + state->text_pos = next_tail_position.text_pos; + } + break; + } + case RE_OP_GREEDY_REPEAT_ONE: /* Greedy repeat for one character. 
*/ + { + RE_CODE index; + RE_RepeatData* rp_data; + size_t count; + BOOL is_partial; + BOOL match; + RE_BacktrackData* bt_data; + TRACE(("%s %d\n", re_op_text[node->op], node->values[0])) + + /* Repeat indexes are 0-based. */ + index = node->values[0]; + rp_data = &state->repeats[index]; + + if (is_repeat_guarded(safe_state, index, state->text_pos, + RE_STATUS_BODY)) + goto backtrack; + + /* Count how many times the character repeats, up to the maximum. + */ + count = count_one(state, node->nonstring.next_2.node, + state->text_pos, node->values[2], &is_partial); + if (is_partial) { + state->text_pos += (Py_ssize_t)count * node->step; + return RE_ERROR_PARTIAL; + } + + /* Unmatch until it's not guarded. */ + match = FALSE; + for (;;) { + if (count < node->values[1]) + /* The number of repeats is below the minimum. */ + break; + + if (!is_repeat_guarded(safe_state, index, state->text_pos + + (Py_ssize_t)count * node->step, RE_STATUS_TAIL)) { + /* It's not guarded at this position. */ + match = TRUE; + break; + } + + if (count == 0) + break; + + --count; + } + + if (!match) { + /* The repeat has failed to match at this position. */ + if (!guard_repeat(safe_state, index, state->text_pos, + RE_STATUS_BODY, TRUE)) + return RE_ERROR_MEMORY; + goto backtrack; + } + + if (count > node->values[1]) { + /* Record the backtracking info. */ + if (!add_backtrack(safe_state, RE_OP_GREEDY_REPEAT_ONE)) + return RE_ERROR_BACKTRACKING; + bt_data = state->backtrack; + bt_data->repeat.position.node = node; + bt_data->repeat.index = index; + bt_data->repeat.text_pos = rp_data->start; + bt_data->repeat.count = rp_data->count; + + rp_data->start = state->text_pos; + rp_data->count = count; + } + + /* Advance into the tail. */ + state->text_pos += (Py_ssize_t)count * node->step; + node = node->next_1.node; + break; + } + case RE_OP_GROUP_CALL: /* Group call. 
*/ + { + size_t index; + size_t g; + size_t r; + TRACE(("%s %d\n", re_op_text[node->op], node->values[0])) + + index = node->values[0]; + + /* Save the capture groups and repeat guards. */ + if (!push_group_return(safe_state, node->next_1.node)) + return RE_ERROR_MEMORY; + + /* Clear the capture groups for the group call. They'll be restored + * on return. + */ + for (g = 0; g < state->pattern->true_group_count; g++) { + RE_GroupData* group; + + group = &state->groups[g]; + group->span.start = -1; + group->span.end = -1; + group->current_capture = -1; + } + + /* Clear the repeat guards for the group call. They'll be restored + * on return. + */ + for (r = 0; r < state->pattern->repeat_count; r++) { + RE_RepeatData* repeat; + + repeat = &state->repeats[r]; + repeat->body_guard_list.count = 0; + repeat->body_guard_list.last_text_pos = -1; + repeat->tail_guard_list.count = 0; + repeat->tail_guard_list.last_text_pos = -1; + } + + /* Call a group, skipping its CALL_REF node. */ + node = pattern->call_ref_info[index].node->next_1.node; + + if (!add_backtrack(safe_state, RE_OP_GROUP_CALL)) + return RE_ERROR_BACKTRACKING; + break; + } + case RE_OP_GROUP_EXISTS: /* Capture group exists. */ + { + TRACE(("%s %d\n", re_op_text[node->op], node->values[0])) + + /* Capture group indexes are 1-based (excluding group 0, which is + * the entire matched string). + * + * Check whether the captured text, if any, exists at this position + * in the string. + * + * A group index of 0, however, means that it's a DEFINE, which we + * should skip. + */ + if (node->values[0] == 0) + /* Skip past the body. */ + node = node->nonstring.next_2.node; + else { + RE_GroupData* group; + + group = &state->groups[node->values[0] - 1]; + if (group->current_capture >= 0) + /* The 'true' branch. */ + node = node->next_1.node; + else + /* The 'false' branch. */ + node = node->nonstring.next_2.node; + } + break; + } + case RE_OP_GROUP_RETURN: /* Group return. 
*/ + { + RE_Node* return_node; + RE_BacktrackData* bt_data; + TRACE(("%s\n", re_op_text[node->op])) + + return_node = top_group_return(state); + + if (!add_backtrack(safe_state, RE_OP_GROUP_RETURN)) + return RE_ERROR_BACKTRACKING; + bt_data = state->backtrack; + bt_data->group_call.node = return_node; + bt_data->group_call.capture_change = state->capture_change; + + if (return_node) { + /* The group was called. */ + node = return_node; + + /* Save the groups. */ + if (!push_groups(safe_state)) + return RE_ERROR_MEMORY; + + /* Save the repeats. */ + if (!push_repeats(safe_state)) + return RE_ERROR_MEMORY; + } else + /* The group was not called. */ + node = node->next_1.node; + + pop_group_return(state); + break; + } + case RE_OP_KEEP: /* Keep. */ + { + RE_BacktrackData* bt_data; + TRACE(("%s\n", re_op_text[node->op])) + + if (!add_backtrack(safe_state, RE_OP_KEEP)) + return RE_ERROR_BACKTRACKING; + bt_data = state->backtrack; + bt_data->keep.match_pos = state->match_pos; + state->match_pos = state->text_pos; + node = node->next_1.node; + break; + } + case RE_OP_LAZY_REPEAT: /* Lazy repeat. */ + { + RE_CODE index; + RE_RepeatData* rp_data; + RE_BacktrackData* bt_data; + BOOL try_body; + int body_status; + RE_Position next_body_position; + BOOL try_tail; + int tail_status; + RE_Position next_tail_position; + TRACE(("%s %d\n", re_op_text[node->op], node->values[0])) + + /* Repeat indexes are 0-based. */ + index = node->values[0]; + rp_data = &state->repeats[index]; + + /* We might need to backtrack into the head, so save the current + * repeat. + */ + if (!add_backtrack(safe_state, RE_OP_LAZY_REPEAT)) + return RE_ERROR_BACKTRACKING; + bt_data = state->backtrack; + bt_data->repeat.index = index; + bt_data->repeat.count = rp_data->count; + bt_data->repeat.start = rp_data->start; + bt_data->repeat.capture_change = rp_data->capture_change; + bt_data->repeat.text_pos = state->text_pos; + + /* Initialise the new repeat. 
*/ + rp_data->count = 0; + rp_data->start = state->text_pos; + rp_data->capture_change = state->capture_change; + + /* Could the body or tail match? */ + try_body = node->values[2] > 0 && !is_repeat_guarded(safe_state, + index, state->text_pos, RE_STATUS_BODY); + if (try_body) { + body_status = try_match(state, &node->next_1, state->text_pos, + &next_body_position); + if (body_status < 0) + return body_status; + + if (body_status == RE_ERROR_FAILURE) + try_body = FALSE; + } else + body_status = RE_ERROR_FAILURE; + + try_tail = node->values[1] == 0; + if (try_tail) { + tail_status = try_match(state, &node->nonstring.next_2, + state->text_pos, &next_tail_position); + if (tail_status < 0) + return tail_status; + + if (tail_status == RE_ERROR_FAILURE) + try_tail = FALSE; + } else + tail_status = RE_ERROR_FAILURE; + + if (!try_body && !try_tail) + /* Neither the body nor the tail could match. */ + goto backtrack; + + if (body_status < 0 || (body_status == 0 && tail_status < 0)) + return RE_ERROR_PARTIAL; + + if (try_body) { + if (try_tail) { + /* Both the body and the tail could match, but the tail + * takes precedence. If the tail fails to match then we + * want to try the body before backtracking further. + */ + + /* Record backtracking info for matching the tail. */ + if (!add_backtrack(safe_state, RE_OP_MATCH_BODY)) + return RE_ERROR_BACKTRACKING; + bt_data = state->backtrack; + bt_data->repeat.position = next_body_position; + bt_data->repeat.index = index; + bt_data->repeat.count = rp_data->count; + bt_data->repeat.start = rp_data->start; + bt_data->repeat.capture_change = rp_data->capture_change; + bt_data->repeat.text_pos = state->text_pos; + + /* Advance into the tail. */ + node = next_tail_position.node; + state->text_pos = next_tail_position.text_pos; + } else { + /* Advance into the body. */ + node = next_body_position.node; + state->text_pos = next_body_position.text_pos; + } + } else { + /* Only the tail could match. */ + + /* Advance into the tail. 
*/ + node = next_tail_position.node; + state->text_pos = next_tail_position.text_pos; + } + break; + } + case RE_OP_LAZY_REPEAT_ONE: /* Lazy repeat for one character. */ + { + RE_CODE index; + RE_RepeatData* rp_data; + size_t count; + BOOL is_partial; + TRACE(("%s %d\n", re_op_text[node->op], node->values[0])) + + /* Repeat indexes are 0-based. */ + index = node->values[0]; + rp_data = &state->repeats[index]; + + if (is_repeat_guarded(safe_state, index, state->text_pos, + RE_STATUS_BODY)) + goto backtrack; + + /* Count how many times the character repeats, up to the minimum. + */ + count = count_one(state, node->nonstring.next_2.node, + state->text_pos, node->values[1], &is_partial); + if (is_partial) { + state->text_pos += (Py_ssize_t)count * node->step; + return RE_ERROR_PARTIAL; + } + + /* Have we matched at least the minimum? */ + if (count < node->values[1]) { + /* The repeat has failed to match at this position. */ + if (!guard_repeat(safe_state, index, state->text_pos, + RE_STATUS_BODY, TRUE)) + return RE_ERROR_MEMORY; + goto backtrack; + } + + if (count < node->values[2]) { + /* The match is shorter than the maximum, so we might need to + * backtrack the repeat to consume more. + */ + RE_BacktrackData* bt_data; + + /* Get the offset to the repeat values in the context. */ + rp_data = &state->repeats[index]; + if (!add_backtrack(safe_state, RE_OP_LAZY_REPEAT_ONE)) + return RE_ERROR_BACKTRACKING; + bt_data = state->backtrack; + bt_data->repeat.position.node = node; + bt_data->repeat.index = index; + bt_data->repeat.text_pos = rp_data->start; + bt_data->repeat.count = rp_data->count; + + rp_data->start = state->text_pos; + rp_data->count = count; + } + + /* Advance into the tail. */ + state->text_pos += (Py_ssize_t)count * node->step; + node = node->next_1.node; + break; + } + case RE_OP_LOOKAROUND: /* Start of a lookaround subpattern. 
*/ + { + RE_AtomicData* lookaround; + TRACE(("%s %d\n", re_op_text[node->op], node->match)) + + if (!add_backtrack(safe_state, RE_OP_LOOKAROUND)) + return RE_ERROR_BACKTRACKING; + state->backtrack->lookaround.too_few_errors = + state->too_few_errors; + state->backtrack->lookaround.capture_change = + state->capture_change; + state->backtrack->lookaround.inside = TRUE; + state->backtrack->lookaround.node = node; + + lookaround = push_atomic(safe_state); + if (!lookaround) + return RE_ERROR_MEMORY; + lookaround->backtrack_count = + state->current_backtrack_block->count; + lookaround->current_backtrack_block = + state->current_backtrack_block; + lookaround->slice_start = state->slice_start; + lookaround->slice_end = state->slice_end; + lookaround->text_pos = state->text_pos; + lookaround->node = node; + lookaround->backtrack = state->backtrack; + lookaround->is_lookaround = TRUE; + lookaround->has_groups = (node->status & RE_STATUS_HAS_GROUPS) != + 0; + lookaround->has_repeats = (node->status & RE_STATUS_HAS_REPEATS) != + 0; + + /* Save the groups and repeats. */ + if (lookaround->has_groups && !push_groups(safe_state)) + return RE_ERROR_MEMORY; + + if (lookaround->has_repeats && !push_repeats(safe_state)) + return RE_ERROR_MEMORY; + + lookaround->saved_groups = state->current_saved_groups; + lookaround->saved_repeats = state->current_saved_repeats; + + state->slice_start = 0; + state->slice_end = state->text_length; + + node = node->next_1.node; + break; + } + case RE_OP_PROPERTY: /* A property. 
*/ + TRACE(("%s %d %d\n", re_op_text[node->op], node->match, + node->values[0])) + + if (state->text_pos >= state->text_length && state->partial_side == + RE_PARTIAL_RIGHT) + return RE_ERROR_PARTIAL; + + if (state->text_pos < state->slice_end && + matches_PROPERTY(encoding, locale_info, node, + char_at(state->text, state->text_pos)) == node->match) { + state->text_pos += node->step; + node = node->next_1.node; + } else if (node->status & RE_STATUS_FUZZY) { + status = fuzzy_match_item(safe_state, search, &state->text_pos, + &node, 1); + if (status < 0) + return RE_ERROR_PARTIAL; + + if (!node) + goto backtrack; + } else + goto backtrack; + break; + case RE_OP_PROPERTY_IGN: /* A property, ignoring case. */ + TRACE(("%s %d %d\n", re_op_text[node->op], node->match, + node->values[0])) + + if (state->text_pos >= state->text_length && state->partial_side == + RE_PARTIAL_RIGHT) + return RE_ERROR_PARTIAL; + + if (state->text_pos < state->slice_end && + matches_PROPERTY_IGN(encoding, locale_info, node, + char_at(state->text, state->text_pos)) == node->match) { + state->text_pos += node->step; + node = node->next_1.node; + } else if (node->status & RE_STATUS_FUZZY) { + status = fuzzy_match_item(safe_state, search, &state->text_pos, + &node, 1); + if (status < 0) + return RE_ERROR_PARTIAL; + + if (!node) + goto backtrack; + } else + goto backtrack; + break; + case RE_OP_PROPERTY_IGN_REV: /* A property, backwards, ignoring case. 
*/ + TRACE(("%s %d %d\n", re_op_text[node->op], node->match, + node->values[0])) + + if (state->text_pos <= 0 && state->partial_side == RE_PARTIAL_LEFT) + return RE_ERROR_PARTIAL; + + if (state->text_pos > state->slice_start && + matches_PROPERTY_IGN(encoding, locale_info, node, + char_at(state->text, state->text_pos - 1)) == node->match) { + state->text_pos += node->step; + node = node->next_1.node; + } else if (node->status & RE_STATUS_FUZZY) { + status = fuzzy_match_item(safe_state, search, &state->text_pos, + &node, -1); + if (status < 0) + return RE_ERROR_PARTIAL; + + if (!node) + goto backtrack; + } else + goto backtrack; + break; + case RE_OP_PROPERTY_REV: /* A property, backwards. */ + TRACE(("%s %d %d\n", re_op_text[node->op], node->match, + node->values[0])) + + if (state->text_pos <= 0 && state->partial_side == RE_PARTIAL_LEFT) + return RE_ERROR_PARTIAL; + + if (state->text_pos > state->slice_start && + matches_PROPERTY(encoding, locale_info, node, + char_at(state->text, state->text_pos - 1)) == node->match) { + state->text_pos += node->step; + node = node->next_1.node; + } else if (node->status & RE_STATUS_FUZZY) { + status = fuzzy_match_item(safe_state, search, &state->text_pos, + &node, -1); + if (status < 0) + return RE_ERROR_PARTIAL; + + if (!node) + goto backtrack; + } else + goto backtrack; + break; + case RE_OP_PRUNE: /* Prune the backtracking. */ + TRACE(("%s\n", re_op_text[node->op])) + + prune_backtracking(state); + + node = node->next_1.node; + break; + case RE_OP_RANGE: /* A range. 
*/ + TRACE(("%s %d %d %d\n", re_op_text[node->op], node->match, + node->values[0], node->values[1])) + + if (state->text_pos >= state->text_length && state->partial_side == + RE_PARTIAL_RIGHT) + return RE_ERROR_PARTIAL; + + if (state->text_pos < state->slice_end && matches_RANGE(encoding, + locale_info, node, char_at(state->text, state->text_pos)) == + node->match) { + state->text_pos += node->step; + node = node->next_1.node; + } else if (node->status & RE_STATUS_FUZZY) { + status = fuzzy_match_item(safe_state, search, &state->text_pos, + &node, 1); + if (status < 0) + return RE_ERROR_PARTIAL; + + if (!node) + goto backtrack; + } else + goto backtrack; + break; + case RE_OP_RANGE_IGN: /* A range, ignoring case. */ + TRACE(("%s %d %d %d\n", re_op_text[node->op], node->match, + node->values[0], node->values[1])) + + if (state->text_pos >= state->text_length && state->partial_side == + RE_PARTIAL_RIGHT) + return RE_ERROR_PARTIAL; + + if (state->text_pos < state->slice_end && + matches_RANGE_IGN(encoding, locale_info, node, + char_at(state->text, state->text_pos)) == node->match) { + state->text_pos += node->step; + node = node->next_1.node; + } else if (node->status & RE_STATUS_FUZZY) { + status = fuzzy_match_item(safe_state, search, &state->text_pos, + &node, 1); + if (status < 0) + return RE_ERROR_PARTIAL; + + if (!node) + goto backtrack; + } else + goto backtrack; + break; + case RE_OP_RANGE_IGN_REV: /* A range, backwards, ignoring case. 
*/ + TRACE(("%s %d %d %d\n", re_op_text[node->op], node->match, + node->values[0], node->values[1])) + + if (state->text_pos <= 0 && state->partial_side == RE_PARTIAL_LEFT) + return RE_ERROR_PARTIAL; + + if (state->text_pos > state->slice_start && + matches_RANGE_IGN(encoding, locale_info, node, + char_at(state->text, state->text_pos - 1)) == node->match) { + state->text_pos += node->step; + node = node->next_1.node; + } else if (node->status & RE_STATUS_FUZZY) { + status = fuzzy_match_item(safe_state, search, &state->text_pos, + &node, -1); + if (status < 0) + return RE_ERROR_PARTIAL; + + if (!node) + goto backtrack; + } else + goto backtrack; + break; + case RE_OP_RANGE_REV: /* A range, backwards. */ + TRACE(("%s %d %d %d\n", re_op_text[node->op], node->match, + node->values[0], node->values[1])) + + if (state->text_pos <= 0 && state->partial_side == RE_PARTIAL_LEFT) + return RE_ERROR_PARTIAL; + + if (state->text_pos > state->slice_start && matches_RANGE(encoding, + locale_info, node, char_at(state->text, state->text_pos - 1)) == + node->match) { + state->text_pos += node->step; + node = node->next_1.node; + } else if (node->status & RE_STATUS_FUZZY) { + status = fuzzy_match_item(safe_state, search, &state->text_pos, + &node, -1); + if (status < 0) + return RE_ERROR_PARTIAL; + + if (!node) + goto backtrack; + } else + goto backtrack; + break; + case RE_OP_REF_GROUP: /* Reference to a capture group. */ + { + RE_GroupData* group; + RE_GroupSpan* span; + TRACE(("%s %d\n", re_op_text[node->op], node->values[0])) + + /* Capture group indexes are 1-based (excluding group 0, which is + * the entire matched string). + * + * Check whether the captured text, if any, exists at this position + * in the string. + */ + + /* Did the group capture anything? 
*/ + group = &state->groups[node->values[0] - 1]; + if (group->current_capture < 0) + goto backtrack; + + span = &group->captures[group->current_capture]; + + if (string_pos < 0) + string_pos = span->start; + + /* Try comparing. */ + while (string_pos < span->end) { + if (state->text_pos >= state->text_length && + state->partial_side == RE_PARTIAL_RIGHT) + return RE_ERROR_PARTIAL; + + if (state->text_pos < state->slice_end && + same_char(char_at(state->text, state->text_pos), + char_at(state->text, string_pos))) { + ++string_pos; + ++state->text_pos; + } else if (node->status & RE_STATUS_FUZZY) { + BOOL matched; + + status = fuzzy_match_string(safe_state, search, + &state->text_pos, node, &string_pos, &matched, 1); + if (status < 0) + return RE_ERROR_PARTIAL; + + if (!matched) { + string_pos = -1; + goto backtrack; + } + } else { + string_pos = -1; + goto backtrack; + } + } + + string_pos = -1; + + /* Successful match. */ + node = node->next_1.node; + break; + } + case RE_OP_REF_GROUP_FLD: /* Reference to a capture group, ignoring case. */ + { + RE_GroupData* group; + RE_GroupSpan* span; + int (*full_case_fold)(RE_LocaleInfo* locale_info, Py_UCS4 ch, + Py_UCS4* folded); + int folded_len; + int gfolded_len; + Py_UCS4 folded[RE_MAX_FOLDED]; + Py_UCS4 gfolded[RE_MAX_FOLDED]; + TRACE(("%s %d\n", re_op_text[node->op], node->values[0])) + + /* Capture group indexes are 1-based (excluding group 0, which is + * the entire matched string). + * + * Check whether the captured text, if any, exists at this position + * in the string. + */ + + /* Did the group capture anything? 
*/ + group = &state->groups[node->values[0] - 1]; + if (group->current_capture < 0) + goto backtrack; + + span = &group->captures[group->current_capture]; + + full_case_fold = encoding->full_case_fold; + + if (string_pos < 0) { + string_pos = span->start; + folded_pos = 0; + folded_len = 0; + gfolded_pos = 0; + gfolded_len = 0; + } else { + folded_len = full_case_fold(locale_info, char_at(state->text, + state->text_pos), folded); + gfolded_len = full_case_fold(locale_info, char_at(state->text, + string_pos), gfolded); + } + + /* Try comparing. */ + while (string_pos < span->end) { + /* Case-fold at current position in text. */ + if (folded_pos >= folded_len) { + if (state->text_pos >= state->text_length && + state->partial_side == RE_PARTIAL_RIGHT) + return RE_ERROR_PARTIAL; + + if (state->text_pos < state->slice_end) + folded_len = full_case_fold(locale_info, + char_at(state->text, state->text_pos), folded); + else + folded_len = 0; + + folded_pos = 0; + } + + /* Case-fold at current position in group. */ + if (gfolded_pos >= gfolded_len) { + gfolded_len = full_case_fold(locale_info, + char_at(state->text, string_pos), gfolded); + gfolded_pos = 0; + } + + if (folded_pos < folded_len && folded[folded_pos] == + gfolded[gfolded_pos]) { + ++folded_pos; + ++gfolded_pos; + } else if (node->status & RE_STATUS_FUZZY) { + BOOL matched; + + status = fuzzy_match_group_fld(safe_state, search, + &state->text_pos, node, &folded_pos, folded_len, + &string_pos, &gfolded_pos, gfolded_len, &matched, 1); + if (status < 0) + return RE_ERROR_PARTIAL; + + if (!matched) { + string_pos = -1; + goto backtrack; + } + } else { + string_pos = -1; + goto backtrack; + } + + if (folded_pos >= folded_len && folded_len > 0) + ++state->text_pos; + + if (gfolded_pos >= gfolded_len) + ++string_pos; + } + + string_pos = -1; + + if (folded_pos < folded_len || gfolded_pos < gfolded_len) + goto backtrack; + + /* Successful match. 
*/ + node = node->next_1.node; + break; + } + case RE_OP_REF_GROUP_FLD_REV: /* Reference to a capture group, ignoring case. */ + { + RE_GroupData* group; + RE_GroupSpan* span; + int (*full_case_fold)(RE_LocaleInfo* locale_info, Py_UCS4 ch, + Py_UCS4* folded); + int folded_len; + int gfolded_len; + Py_UCS4 folded[RE_MAX_FOLDED]; + Py_UCS4 gfolded[RE_MAX_FOLDED]; + TRACE(("%s %d\n", re_op_text[node->op], node->values[0])) + + /* Capture group indexes are 1-based (excluding group 0, which is + * the entire matched string). + * + * Check whether the captured text, if any, exists at this position + * in the string. + */ + + /* Did the group capture anything? */ + group = &state->groups[node->values[0] - 1]; + if (group->current_capture < 0) + goto backtrack; + + span = &group->captures[group->current_capture]; + + full_case_fold = encoding->full_case_fold; + + if (string_pos < 0) { + string_pos = span->end; + folded_pos = 0; + folded_len = 0; + gfolded_pos = 0; + gfolded_len = 0; + } else { + folded_len = full_case_fold(locale_info, char_at(state->text, + state->text_pos - 1), folded); + gfolded_len = full_case_fold(locale_info, char_at(state->text, + string_pos - 1), gfolded); + } + + /* Try comparing. */ + while (string_pos > span->start) { + /* Case-fold at current position in text. */ + if (folded_pos <= 0) { + if (state->text_pos <= 0 && state->partial_side == + RE_PARTIAL_LEFT) + return RE_ERROR_PARTIAL; + + if (state->text_pos > state->slice_start) + folded_len = full_case_fold(locale_info, + char_at(state->text, state->text_pos - 1), folded); + else + folded_len = 0; + + folded_pos = folded_len; + } + + /* Case-fold at current position in group. 
*/ + if (gfolded_pos <= 0) { + gfolded_len = full_case_fold(locale_info, + char_at(state->text, string_pos - 1), gfolded); + gfolded_pos = gfolded_len; + } + + if (folded_pos > 0 && folded[folded_pos - 1] == + gfolded[gfolded_pos - 1]) { + --folded_pos; + --gfolded_pos; + } else if (node->status & RE_STATUS_FUZZY) { + BOOL matched; + + status = fuzzy_match_group_fld(safe_state, search, + &state->text_pos, node, &folded_pos, folded_len, + &string_pos, &gfolded_pos, gfolded_len, &matched, -1); + if (status < 0) + return RE_ERROR_PARTIAL; + + if (!matched) { + string_pos = -1; + goto backtrack; + } + } else { + string_pos = -1; + goto backtrack; + } + + if (folded_pos <= 0 && folded_len > 0) + --state->text_pos; + + if (gfolded_pos <= 0) + --string_pos; + } + + string_pos = -1; + + if (folded_pos > 0 || gfolded_pos > 0) + goto backtrack; + + /* Successful match. */ + node = node->next_1.node; + break; + } + case RE_OP_REF_GROUP_IGN: /* Reference to a capture group, ignoring case. */ + { + RE_GroupData* group; + RE_GroupSpan* span; + TRACE(("%s %d\n", re_op_text[node->op], node->values[0])) + + /* Capture group indexes are 1-based (excluding group 0, which is + * the entire matched string). + * + * Check whether the captured text, if any, exists at this position + * in the string. + */ + + /* Did the group capture anything? */ + group = &state->groups[node->values[0] - 1]; + if (group->current_capture < 0) + goto backtrack; + + span = &group->captures[group->current_capture]; + + if (string_pos < 0) + string_pos = span->start; + + /* Try comparing. 
*/ + while (string_pos < span->end) { + if (state->text_pos >= state->text_length && + state->partial_side == RE_PARTIAL_RIGHT) + return RE_ERROR_PARTIAL; + + if (state->text_pos < state->slice_end && + same_char_ign(encoding, locale_info, char_at(state->text, + state->text_pos), char_at(state->text, string_pos))) { + ++string_pos; + ++state->text_pos; + } else if (node->status & RE_STATUS_FUZZY) { + BOOL matched; + + status = fuzzy_match_string(safe_state, search, + &state->text_pos, node, &string_pos, &matched, 1); + if (status < 0) + return RE_ERROR_PARTIAL; + + if (!matched) { + string_pos = -1; + goto backtrack; + } + } else { + string_pos = -1; + goto backtrack; + } + } + + string_pos = -1; + + /* Successful match. */ + node = node->next_1.node; + break; + } + case RE_OP_REF_GROUP_IGN_REV: /* Reference to a capture group, ignoring case. */ + { + RE_GroupData* group; + RE_GroupSpan* span; + TRACE(("%s %d\n", re_op_text[node->op], node->values[0])) + + /* Capture group indexes are 1-based (excluding group 0, which is + * the entire matched string). + * + * Check whether the captured text, if any, exists at this position + * in the string. + */ + + /* Did the group capture anything? */ + group = &state->groups[node->values[0] - 1]; + if (group->current_capture < 0) + goto backtrack; + + span = &group->captures[group->current_capture]; + + if (string_pos < 0) + string_pos = span->end; + + /* Try comparing. 
*/ + while (string_pos > span->start) { + if (state->text_pos <= 0 && state->partial_side == + RE_PARTIAL_LEFT) + return RE_ERROR_PARTIAL; + + if (state->text_pos > state->slice_start && + same_char_ign(encoding, locale_info, char_at(state->text, + state->text_pos - 1), char_at(state->text, string_pos - 1))) + { + --string_pos; + --state->text_pos; + } else if (node->status & RE_STATUS_FUZZY) { + BOOL matched; + + status = fuzzy_match_string(safe_state, search, + &state->text_pos, node, &string_pos, &matched, -1); + if (status < 0) + return RE_ERROR_PARTIAL; + + if (!matched) { + string_pos = -1; + goto backtrack; + } + } else { + string_pos = -1; + goto backtrack; + } + } + + string_pos = -1; + + /* Successful match. */ + node = node->next_1.node; + break; + } + case RE_OP_REF_GROUP_REV: /* Reference to a capture group. */ + { + RE_GroupData* group; + RE_GroupSpan* span; + TRACE(("%s %d\n", re_op_text[node->op], node->values[0])) + + /* Capture group indexes are 1-based (excluding group 0, which is + * the entire matched string). + * + * Check whether the captured text, if any, exists at this position + * in the string. + */ + + /* Did the group capture anything? */ + group = &state->groups[node->values[0] - 1]; + if (group->current_capture < 0) + goto backtrack; + + span = &group->captures[group->current_capture]; + + if (string_pos < 0) + string_pos = span->end; + + /* Try comparing. 
*/ + while (string_pos > span->start) { + if (state->text_pos <= 0 && state->partial_side == + RE_PARTIAL_LEFT) + return RE_ERROR_PARTIAL; + + if (state->text_pos > state->slice_start && + same_char(char_at(state->text, state->text_pos - 1), + char_at(state->text, string_pos - 1))) { + --string_pos; + --state->text_pos; + } else if (node->status & RE_STATUS_FUZZY) { + BOOL matched; + + status = fuzzy_match_string(safe_state, search, + &state->text_pos, node, &string_pos, &matched, -1); + if (status < 0) + return RE_ERROR_PARTIAL; + + if (!matched) { + string_pos = -1; + goto backtrack; + } + } else { + string_pos = -1; + goto backtrack; + } + } + + string_pos = -1; + + /* Successful match. */ + node = node->next_1.node; + break; + } + case RE_OP_SEARCH_ANCHOR: /* At the start of the search. */ + TRACE(("%s %d\n", re_op_text[node->op], node->values[0])) + + if (state->text_pos == state->search_anchor) + node = node->next_1.node; + else if (node->status & RE_STATUS_FUZZY) { + status = fuzzy_match_item(safe_state, search, &state->text_pos, + &node, 0); + if (status < 0) + return status; + + if (!node) + goto backtrack; + } else + goto backtrack; + break; + case RE_OP_SET_DIFF: /* Character set. */ + case RE_OP_SET_INTER: + case RE_OP_SET_SYM_DIFF: + case RE_OP_SET_UNION: + TRACE(("%s %d\n", re_op_text[node->op], node->match)) + + if (state->text_pos >= state->text_length && state->partial_side == + RE_PARTIAL_RIGHT) + return RE_ERROR_PARTIAL; + + if (state->text_pos < state->slice_end && matches_SET(encoding, + locale_info, node, char_at(state->text, state->text_pos)) == + node->match) { + state->text_pos += node->step; + node = node->next_1.node; + } else if (node->status & RE_STATUS_FUZZY) { + status = fuzzy_match_item(safe_state, search, &state->text_pos, + &node, 1); + if (status < 0) + return RE_ERROR_PARTIAL; + + if (!node) + goto backtrack; + } else + goto backtrack; + break; + case RE_OP_SET_DIFF_IGN: /* Character set, ignoring case. 
*/ + case RE_OP_SET_INTER_IGN: + case RE_OP_SET_SYM_DIFF_IGN: + case RE_OP_SET_UNION_IGN: + TRACE(("%s %d\n", re_op_text[node->op], node->match)) + + if (state->text_pos >= state->text_length && state->partial_side == + RE_PARTIAL_RIGHT) + return RE_ERROR_PARTIAL; + + if (state->text_pos < state->slice_end && matches_SET_IGN(encoding, + locale_info, node, char_at(state->text, state->text_pos)) == + node->match) { + state->text_pos += node->step; + node = node->next_1.node; + } else if (node->status & RE_STATUS_FUZZY) { + status = fuzzy_match_item(safe_state, search, &state->text_pos, + &node, 1); + if (status < 0) + return RE_ERROR_PARTIAL; + + if (!node) + goto backtrack; + } else + goto backtrack; + break; + case RE_OP_SET_DIFF_IGN_REV: /* Character set, ignoring case. */ + case RE_OP_SET_INTER_IGN_REV: + case RE_OP_SET_SYM_DIFF_IGN_REV: + case RE_OP_SET_UNION_IGN_REV: + TRACE(("%s %d\n", re_op_text[node->op], node->match)) + + if (state->text_pos <= 0 && state->partial_side == RE_PARTIAL_LEFT) + return RE_ERROR_PARTIAL; + + if (state->text_pos > state->slice_start && + matches_SET_IGN(encoding, locale_info, node, char_at(state->text, + state->text_pos - 1)) == node->match) { + state->text_pos += node->step; + node = node->next_1.node; + } else if (node->status & RE_STATUS_FUZZY) { + status = fuzzy_match_item(safe_state, search, &state->text_pos, + &node, -1); + if (status < 0) + return RE_ERROR_PARTIAL; + + if (!node) + goto backtrack; + } else + goto backtrack; + break; + case RE_OP_SET_DIFF_REV: /* Character set. 
*/ + case RE_OP_SET_INTER_REV: + case RE_OP_SET_SYM_DIFF_REV: + case RE_OP_SET_UNION_REV: + TRACE(("%s %d\n", re_op_text[node->op], node->match)) + + if (state->text_pos <= 0 && state->partial_side == RE_PARTIAL_LEFT) + return RE_ERROR_PARTIAL; + + if (state->text_pos > state->slice_start && matches_SET(encoding, + locale_info, node, char_at(state->text, state->text_pos - 1)) == + node->match) { + state->text_pos += node->step; + node = node->next_1.node; + } else if (node->status & RE_STATUS_FUZZY) { + status = fuzzy_match_item(safe_state, search, &state->text_pos, + &node, -1); + if (status < 0) + return RE_ERROR_PARTIAL; + + if (!node) + goto backtrack; + } else + goto backtrack; + break; + case RE_OP_SKIP: /* Skip the part of the text already matched. */ + TRACE(("%s\n", re_op_text[node->op])) + + if (node->status & RE_STATUS_REVERSE) + state->slice_end = state->text_pos; + else + state->slice_start = state->text_pos; + + prune_backtracking(state); + node = node->next_1.node; + break; + case RE_OP_START_GROUP: /* Start of a capture group. */ + { + RE_CODE private_index; + RE_CODE public_index; + RE_GroupData* group; + RE_BacktrackData* bt_data; + TRACE(("%s %d\n", re_op_text[node->op], node->values[1])) + + /* Capture group indexes are 1-based (excluding group 0, which is + * the entire matched string). 
+ */ + private_index = node->values[0]; + public_index = node->values[1]; + group = &state->groups[private_index - 1]; + + if (!add_backtrack(safe_state, RE_OP_START_GROUP)) + return RE_ERROR_BACKTRACKING; + bt_data = state->backtrack; + bt_data->group.private_index = private_index; + bt_data->group.public_index = public_index; + bt_data->group.text_pos = group->span.start; + bt_data->group.capture = (BOOL)node->values[2]; + bt_data->group.current_capture = group->current_capture; + + if (pattern->group_info[private_index - 1].referenced && + group->span.start != state->text_pos) + ++state->capture_change; + group->span.start = state->text_pos; + + /* Save the capture? */ + if (node->values[2]) { + group->current_capture = (Py_ssize_t)group->capture_count; + if (!save_capture(safe_state, private_index, public_index)) + return RE_ERROR_MEMORY; + } + + node = node->next_1.node; + break; + } + case RE_OP_START_OF_LINE: /* At the start of a line. */ + TRACE(("%s\n", re_op_text[node->op])) + + status = try_match_START_OF_LINE(state, node, state->text_pos); + if (status < 0) + return status; + + if (status == RE_ERROR_SUCCESS) + node = node->next_1.node; + else if (node->status & RE_STATUS_FUZZY) { + status = fuzzy_match_item(safe_state, search, &state->text_pos, + &node, 0); + if (status < 0) + return status; + + if (!node) + goto backtrack; + } else + goto backtrack; + break; + case RE_OP_START_OF_LINE_U: /* At the start of a line. */ + TRACE(("%s\n", re_op_text[node->op])) + + status = try_match_START_OF_LINE_U(state, node, state->text_pos); + if (status < 0) + return status; + + if (status == RE_ERROR_SUCCESS) + node = node->next_1.node; + else if (node->status & RE_STATUS_FUZZY) { + status = fuzzy_match_item(safe_state, search, &state->text_pos, + &node, 0); + if (status < 0) + return status; + + if (!node) + goto backtrack; + } else + goto backtrack; + break; + case RE_OP_START_OF_STRING: /* At the start of the string. 
*/ + TRACE(("%s\n", re_op_text[node->op])) + + status = try_match_START_OF_STRING(state, node, state->text_pos); + if (status < 0) + return status; + + if (status == RE_ERROR_SUCCESS) + node = node->next_1.node; + else if (node->status & RE_STATUS_FUZZY) { + status = fuzzy_match_item(safe_state, search, &state->text_pos, + &node, 0); + if (status < 0) + return status; + + if (!node) + goto backtrack; + } else + goto backtrack; + break; + case RE_OP_START_OF_WORD: /* At the start of a word. */ + TRACE(("%s\n", re_op_text[node->op])) + + status = try_match_START_OF_WORD(state, node, state->text_pos); + if (status < 0) + return status; + + if (status == RE_ERROR_SUCCESS) + node = node->next_1.node; + else if (node->status & RE_STATUS_FUZZY) { + status = fuzzy_match_item(safe_state, search, &state->text_pos, + &node, 0); + if (status < 0) + return status; + + if (!node) + goto backtrack; + } else + goto backtrack; + break; + case RE_OP_STRING: /* A string. */ + { + Py_ssize_t length; + RE_CODE* values; + TRACE(("%s %d\n", re_op_text[node->op], node->value_count)) + + if ((node->status & RE_STATUS_REQUIRED) && state->text_pos == + state->req_pos && string_pos < 0) + state->text_pos = state->req_end; + else { + length = (Py_ssize_t)node->value_count; + + if (string_pos < 0) + string_pos = 0; + + values = node->values; + + /* Try comparing. 
*/ + while (string_pos < length) { + if (state->text_pos >= state->text_length && + state->partial_side == RE_PARTIAL_RIGHT) + return RE_ERROR_PARTIAL; + + if (state->text_pos < state->slice_end && + same_char(char_at(state->text, state->text_pos), + values[string_pos])) { + ++string_pos; + ++state->text_pos; + } else if (node->status & RE_STATUS_FUZZY) { + BOOL matched; + + status = fuzzy_match_string(safe_state, search, + &state->text_pos, node, &string_pos, &matched, 1); + if (status < 0) + return RE_ERROR_PARTIAL; + + if (!matched) { + string_pos = -1; + goto backtrack; + } + } else { + string_pos = -1; + goto backtrack; + } + } + } + + string_pos = -1; + + /* Successful match. */ + node = node->next_1.node; + break; + } + case RE_OP_STRING_FLD: /* A string, ignoring case. */ + { + Py_ssize_t length; + int (*full_case_fold)(RE_LocaleInfo* locale_info, Py_UCS4 ch, + Py_UCS4* folded); + RE_CODE* values; + int folded_len; + Py_UCS4 folded[RE_MAX_FOLDED]; + TRACE(("%s %d\n", re_op_text[node->op], node->value_count)) + + if ((node->status & RE_STATUS_REQUIRED) && state->text_pos == + state->req_pos && string_pos < 0) + state->text_pos = state->req_end; + else { + length = (Py_ssize_t)node->value_count; + + full_case_fold = encoding->full_case_fold; + + if (string_pos < 0) { + string_pos = 0; + folded_pos = 0; + folded_len = 0; + } else { + folded_len = full_case_fold(locale_info, + char_at(state->text, state->text_pos), folded); + if (folded_pos >= folded_len) { + if (state->text_pos >= state->slice_end) + goto backtrack; + + ++state->text_pos; + folded_pos = 0; + folded_len = 0; + } + } + + values = node->values; + + /* Try comparing. 
*/ + while (string_pos < length) { + if (folded_pos >= folded_len) { + if (state->text_pos >= state->text_length && + state->partial_side == RE_PARTIAL_RIGHT) + return RE_ERROR_PARTIAL; + + if (state->text_pos < state->slice_end) + folded_len = full_case_fold(locale_info, + char_at(state->text, state->text_pos), folded); + else + folded_len = 0; + + folded_pos = 0; + } + + if (folded_pos < folded_len && same_char_ign(encoding, + locale_info, folded[folded_pos], values[string_pos])) { + ++string_pos; + ++folded_pos; + + if (folded_pos >= folded_len) + ++state->text_pos; + } else if (node->status & RE_STATUS_FUZZY) { + BOOL matched; + + status = fuzzy_match_string_fld(safe_state, search, + &state->text_pos, node, &string_pos, &folded_pos, + folded_len, &matched, 1); + if (status < 0) + return RE_ERROR_PARTIAL; + + if (!matched) { + string_pos = -1; + goto backtrack; + } + + if (folded_pos >= folded_len && folded_len > 0) + ++state->text_pos; + } else { + string_pos = -1; + goto backtrack; + } + } + + if (node->status & RE_STATUS_FUZZY) { + while (folded_pos < folded_len) { + BOOL matched; + + if (!fuzzy_match_string_fld(safe_state, search, + &state->text_pos, node, &string_pos, &folded_pos, + folded_len, &matched, 1)) + return RE_ERROR_BACKTRACKING; + + if (!matched) { + string_pos = -1; + goto backtrack; + } + + if (folded_pos >= folded_len && folded_len > 0) + ++state->text_pos; + } + } + + string_pos = -1; + + if (folded_pos < folded_len) + goto backtrack; + } + + /* Successful match. */ + node = node->next_1.node; + break; + } + case RE_OP_STRING_FLD_REV: /* A string, ignoring case. 
*/ + { + Py_ssize_t length; + int (*full_case_fold)(RE_LocaleInfo* locale_info, Py_UCS4 ch, + Py_UCS4* folded); + RE_CODE* values; + int folded_len; + Py_UCS4 folded[RE_MAX_FOLDED]; + TRACE(("%s %d\n", re_op_text[node->op], node->value_count)) + + if ((node->status & RE_STATUS_REQUIRED) && state->text_pos == + state->req_pos && string_pos < 0) + state->text_pos = state->req_end; + else { + length = (Py_ssize_t)node->value_count; + + full_case_fold = encoding->full_case_fold; + + if (string_pos < 0) { + string_pos = length; + folded_pos = 0; + folded_len = 0; + } else { + folded_len = full_case_fold(locale_info, + char_at(state->text, state->text_pos - 1), folded); + if (folded_pos <= 0) { + if (state->text_pos <= state->slice_start) + goto backtrack; + + --state->text_pos; + folded_pos = 0; + folded_len = 0; + } + } + + values = node->values; + + /* Try comparing. */ + while (string_pos > 0) { + if (folded_pos <= 0) { + if (state->text_pos <= 0 && state->partial_side == + RE_PARTIAL_LEFT) + return RE_ERROR_PARTIAL; + + if (state->text_pos > state->slice_start) + folded_len = full_case_fold(locale_info, + char_at(state->text, state->text_pos - 1), + folded); + else + folded_len = 0; + + folded_pos = folded_len; + } + + if (folded_pos > 0 && same_char_ign(encoding, locale_info, + folded[folded_pos - 1], values[string_pos - 1])) { + --string_pos; + --folded_pos; + + if (folded_pos <= 0) + --state->text_pos; + } else if (node->status & RE_STATUS_FUZZY) { + BOOL matched; + + status = fuzzy_match_string_fld(safe_state, search, + &state->text_pos, node, &string_pos, &folded_pos, + folded_len, &matched, -1); + if (status < 0) + return RE_ERROR_PARTIAL; + + if (!matched) { + string_pos = -1; + goto backtrack; + } + + if (folded_pos <= 0 && folded_len > 0) + --state->text_pos; + } else { + string_pos = -1; + goto backtrack; + } + } + + if (node->status & RE_STATUS_FUZZY) { + while (folded_pos > 0) { + BOOL matched; + + if (!fuzzy_match_string_fld(safe_state, search, + 
&state->text_pos, node, &string_pos, &folded_pos, + folded_len, &matched, -1)) + return RE_ERROR_BACKTRACKING; + + if (!matched) { + string_pos = -1; + goto backtrack; + } + + if (folded_pos <= 0 && folded_len > 0) + --state->text_pos; + } + } + + string_pos = -1; + + if (folded_pos > 0) + goto backtrack; + } + + /* Successful match. */ + node = node->next_1.node; + break; + } + case RE_OP_STRING_IGN: /* A string, ignoring case. */ + { + Py_ssize_t length; + RE_CODE* values; + TRACE(("%s %d\n", re_op_text[node->op], node->value_count)) + + if ((node->status & RE_STATUS_REQUIRED) && state->text_pos == + state->req_pos && string_pos < 0) + state->text_pos = state->req_end; + else { + length = (Py_ssize_t)node->value_count; + + if (string_pos < 0) + string_pos = 0; + + values = node->values; + + /* Try comparing. */ + while (string_pos < length) { + if (state->text_pos >= state->text_length && + state->partial_side == RE_PARTIAL_RIGHT) + return RE_ERROR_PARTIAL; + + if (state->text_pos < state->slice_end && + same_char_ign(encoding, locale_info, char_at(state->text, + state->text_pos), values[string_pos])) { + ++string_pos; + ++state->text_pos; + } else if (node->status & RE_STATUS_FUZZY) { + BOOL matched; + + status = fuzzy_match_string(safe_state, search, + &state->text_pos, node, &string_pos, &matched, 1); + if (status < 0) + return RE_ERROR_PARTIAL; + + if (!matched) { + string_pos = -1; + goto backtrack; + } + } else { + string_pos = -1; + goto backtrack; + } + } + } + + string_pos = -1; + + /* Successful match. */ + node = node->next_1.node; + break; + } + case RE_OP_STRING_IGN_REV: /* A string, ignoring case. 
*/ + { + Py_ssize_t length; + RE_CODE* values; + TRACE(("%s %d\n", re_op_text[node->op], node->value_count)) + + if ((node->status & RE_STATUS_REQUIRED) && state->text_pos == + state->req_pos && string_pos < 0) + state->text_pos = state->req_end; + else { + length = (Py_ssize_t)node->value_count; + + if (string_pos < 0) + string_pos = length; + + values = node->values; + + /* Try comparing. */ + while (string_pos > 0) { + if (state->text_pos <= 0 && state->partial_side == + RE_PARTIAL_LEFT) + return RE_ERROR_PARTIAL; + + if (state->text_pos > state->slice_start && + same_char_ign(encoding, locale_info, char_at(state->text, + state->text_pos - 1), values[string_pos - 1])) { + --string_pos; + --state->text_pos; + } else if (node->status & RE_STATUS_FUZZY) { + BOOL matched; + + status = fuzzy_match_string(safe_state, search, + &state->text_pos, node, &string_pos, &matched, -1); + if (status < 0) + return RE_ERROR_PARTIAL; + + if (!matched) { + string_pos = -1; + goto backtrack; + } + } else { + string_pos = -1; + goto backtrack; + } + } + } + + string_pos = -1; + + /* Successful match. */ + node = node->next_1.node; + break; + } + case RE_OP_STRING_REV: /* A string. */ + { + Py_ssize_t length; + RE_CODE* values; + TRACE(("%s %d\n", re_op_text[node->op], node->value_count)) + + if ((node->status & RE_STATUS_REQUIRED) && state->text_pos == + state->req_pos && string_pos < 0) + state->text_pos = state->req_end; + else { + length = (Py_ssize_t)node->value_count; + + if (string_pos < 0) + string_pos = length; + + values = node->values; + + /* Try comparing. 
*/ + while (string_pos > 0) { + if (state->text_pos <= 0 && state->partial_side == + RE_PARTIAL_LEFT) + return RE_ERROR_PARTIAL; + + if (state->text_pos > state->slice_start && + same_char(char_at(state->text, state->text_pos - 1), + values[string_pos - 1])) { + --string_pos; + --state->text_pos; + } else if (node->status & RE_STATUS_FUZZY) { + BOOL matched; + + status = fuzzy_match_string(safe_state, search, + &state->text_pos, node, &string_pos, &matched, -1); + if (status < 0) + return RE_ERROR_PARTIAL; + + if (!matched) { + string_pos = -1; + goto backtrack; + } + } else { + string_pos = -1; + goto backtrack; + } + } + } + + string_pos = -1; + + /* Successful match. */ + node = node->next_1.node; + break; + } + case RE_OP_STRING_SET: /* Member of a string set. */ + { + int status; + TRACE(("%s\n", re_op_text[node->op])) + + status = string_set_match_fwdrev(safe_state, node, FALSE); + if (status < 0) + return status; + if (status == 0) + goto backtrack; + node = node->next_1.node; + break; + } + case RE_OP_STRING_SET_FLD: /* Member of a string set, ignoring case. */ + { + int status; + TRACE(("%s\n", re_op_text[node->op])) + + status = string_set_match_fld_fwdrev(safe_state, node, FALSE); + if (status < 0) + return status; + if (status == 0) + goto backtrack; + node = node->next_1.node; + break; + } + case RE_OP_STRING_SET_FLD_REV: /* Member of a string set, ignoring case. */ + { + int status; + TRACE(("%s\n", re_op_text[node->op])) + + status = string_set_match_fld_fwdrev(safe_state, node, TRUE); + if (status < 0) + return status; + if (status == 0) + goto backtrack; + node = node->next_1.node; + break; + } + case RE_OP_STRING_SET_IGN: /* Member of a string set, ignoring case. 
*/ + { + int status; + TRACE(("%s\n", re_op_text[node->op])) + + status = string_set_match_ign_fwdrev(safe_state, node, FALSE); + if (status < 0) + return status; + if (status == 0) + goto backtrack; + node = node->next_1.node; + break; + } + case RE_OP_STRING_SET_IGN_REV: /* Member of a string set, ignoring case. */ + { + int status; + TRACE(("%s\n", re_op_text[node->op])) + + status = string_set_match_ign_fwdrev(safe_state, node, TRUE); + if (status < 0) + return status; + if (status == 0) + goto backtrack; + node = node->next_1.node; + break; + } + case RE_OP_STRING_SET_REV: /* Member of a string set. */ + { + int status; + TRACE(("%s\n", re_op_text[node->op])) + + status = string_set_match_fwdrev(safe_state, node, TRUE); + if (status < 0) + return status; + if (status == 0) + goto backtrack; + node = node->next_1.node; + break; + } + case RE_OP_SUCCESS: /* Success. */ + /* Must the match advance past its start? */ + TRACE(("%s\n", re_op_text[node->op])) + + if (state->text_pos == state->search_anchor && state->must_advance) + goto backtrack; + + if (state->match_all) { + /* We want to match all of the slice. */ + if (state->reverse) { + if (state->text_pos != state->slice_start) + goto backtrack; + } else { + if (state->text_pos != state->slice_end) + goto backtrack; + } + } + + if (state->pattern->flags & RE_FLAG_POSIX) { + /* If we're looking for a POSIX match, check whether this one + * is better and then keep looking. + */ + if (!check_posix_match(safe_state)) + return RE_ERROR_MEMORY; + + goto backtrack; + } + + return RE_ERROR_SUCCESS; + default: /* Illegal opcode! */ + TRACE(("UNKNOWN OP %d\n", node->op)) + return RE_ERROR_ILLEGAL; + } + } + +backtrack: + for (;;) { + RE_BacktrackData* bt_data; + TRACE(("BACKTRACK ")) + + /* Should we abort the matching? 
*/ + ++state->iterations; + + if (state->iterations == 0 && safe_check_signals(safe_state)) + return RE_ERROR_INTERRUPTED; + + bt_data = last_backtrack(state); + + switch (bt_data->op) { + case RE_OP_ANY: /* Any character except a newline. */ + case RE_OP_ANY_ALL: /* Any character at all. */ + case RE_OP_ANY_ALL_REV: /* Any character at all, backwards. */ + case RE_OP_ANY_REV: /* Any character except a newline, backwards. */ + case RE_OP_ANY_U: /* Any character except a line separator. */ + case RE_OP_ANY_U_REV: /* Any character except a line separator, backwards. */ + case RE_OP_CHARACTER: /* A character. */ + case RE_OP_CHARACTER_IGN: /* A character, ignoring case. */ + case RE_OP_CHARACTER_IGN_REV: /* A character, ignoring case, backwards. */ + case RE_OP_CHARACTER_REV: /* A character, backwards. */ + case RE_OP_PROPERTY: /* A property. */ + case RE_OP_PROPERTY_IGN: /* A property, ignoring case. */ + case RE_OP_PROPERTY_IGN_REV: /* A property, ignoring case, backwards. */ + case RE_OP_PROPERTY_REV: /* A property, backwards. */ + case RE_OP_RANGE: /* A range. */ + case RE_OP_RANGE_IGN: /* A range, ignoring case. */ + case RE_OP_RANGE_IGN_REV: /* A range, ignoring case, backwards. */ + case RE_OP_RANGE_REV: /* A range, backwards. */ + case RE_OP_SET_DIFF: /* Set difference. */ + case RE_OP_SET_DIFF_IGN: /* Set difference, ignoring case. */ + case RE_OP_SET_DIFF_IGN_REV: /* Set difference, ignoring case, backwards. */ + case RE_OP_SET_DIFF_REV: /* Set difference, backwards. */ + case RE_OP_SET_INTER: /* Set intersection. */ + case RE_OP_SET_INTER_IGN: /* Set intersection, ignoring case. */ + case RE_OP_SET_INTER_IGN_REV: /* Set intersection, ignoring case, backwards. */ + case RE_OP_SET_INTER_REV: /* Set intersection, backwards. */ + case RE_OP_SET_SYM_DIFF: /* Set symmetric difference. */ + case RE_OP_SET_SYM_DIFF_IGN: /* Set symmetric difference, ignoring case. */ + case RE_OP_SET_SYM_DIFF_IGN_REV: /* Set symmetric difference, ignoring case, backwards. 
*/ + case RE_OP_SET_SYM_DIFF_REV: /* Set symmetric difference, backwards. */ + case RE_OP_SET_UNION: /* Set union. */ + case RE_OP_SET_UNION_IGN: /* Set union, ignoring case. */ + case RE_OP_SET_UNION_IGN_REV: /* Set union, ignoring case, backwards. */ + case RE_OP_SET_UNION_REV: /* Set union, backwards. */ + TRACE(("%s\n", re_op_text[bt_data->op])) + + status = retry_fuzzy_match_item(safe_state, search, + &state->text_pos, &node, TRUE); + if (status < 0) + return RE_ERROR_PARTIAL; + + if (node) + goto advance; + break; + case RE_OP_ATOMIC: /* Start of an atomic group. */ + { + RE_AtomicData* atomic; + /* backtrack to the start of an atomic group. */ + atomic = pop_atomic(safe_state); + + if (atomic->has_repeats) + pop_repeats(state); + + if (atomic->has_groups) + pop_groups(state); + + state->too_few_errors = bt_data->atomic.too_few_errors; + state->capture_change = bt_data->atomic.capture_change; + + discard_backtrack(state); + break; + } + case RE_OP_BODY_END: + { + RE_RepeatData* rp_data; + TRACE(("%s %d\n", re_op_text[bt_data->op], bt_data->repeat.index)) + + /* We're backtracking into the body. */ + rp_data = &state->repeats[bt_data->repeat.index]; + + /* Restore the repeat info. */ + rp_data->count = bt_data->repeat.count; + rp_data->start = bt_data->repeat.start; + rp_data->capture_change = bt_data->repeat.capture_change; + + discard_backtrack(state); + break; + } + case RE_OP_BODY_START: + { + TRACE(("%s %d\n", re_op_text[bt_data->op], bt_data->repeat.index)) + + /* The body may have failed to match at this position. */ + if (!guard_repeat(safe_state, bt_data->repeat.index, + bt_data->repeat.text_pos, RE_STATUS_BODY, TRUE)) + return RE_ERROR_MEMORY; + + discard_backtrack(state); + break; + } + case RE_OP_BOUNDARY: /* On a word boundary. */ + case RE_OP_DEFAULT_BOUNDARY: /* On a default word boundary. */ + case RE_OP_DEFAULT_END_OF_WORD: /* At a default end of a word. */ + case RE_OP_DEFAULT_START_OF_WORD: /* At a default start of a word. 
*/ + case RE_OP_END_OF_LINE: /* At the end of a line. */ + case RE_OP_END_OF_LINE_U: /* At the end of a line. */ + case RE_OP_END_OF_STRING: /* At the end of the string. */ + case RE_OP_END_OF_STRING_LINE: /* At end of string or final newline. */ + case RE_OP_END_OF_STRING_LINE_U: /* At end of string or final newline. */ + case RE_OP_END_OF_WORD: /* At end of a word. */ + case RE_OP_GRAPHEME_BOUNDARY: /* On a grapheme boundary. */ + case RE_OP_SEARCH_ANCHOR: /* At the start of the search. */ + case RE_OP_START_OF_LINE: /* At the start of a line. */ + case RE_OP_START_OF_LINE_U: /* At the start of a line. */ + case RE_OP_START_OF_STRING: /* At the start of the string. */ + case RE_OP_START_OF_WORD: /* At start of a word. */ + TRACE(("%s\n", re_op_text[bt_data->op])) + + status = retry_fuzzy_match_item(safe_state, search, + &state->text_pos, &node, FALSE); + if (status < 0) + return RE_ERROR_PARTIAL; + + if (node) + goto advance; + break; + case RE_OP_BRANCH: /* 2-way branch. */ + TRACE(("%s\n", re_op_text[bt_data->op])) + + node = bt_data->branch.position.node; + state->text_pos = bt_data->branch.position.text_pos; + discard_backtrack(state); + goto advance; + case RE_OP_CALL_REF: /* A group call ref. */ + case RE_OP_GROUP_CALL: /* Group call. */ + TRACE(("%s\n", re_op_text[bt_data->op])) + + pop_group_return(state); + discard_backtrack(state); + break; + case RE_OP_CONDITIONAL: /* Conditional subpattern. */ + { + TRACE(("%s\n", re_op_text[bt_data->op])) + + if (bt_data->lookaround.inside) { + /* Backtracked to the start of a lookaround. */ + RE_AtomicData* conditional; + + conditional = pop_atomic(safe_state); + state->text_pos = conditional->text_pos; + state->slice_end = conditional->slice_end; + state->slice_start = conditional->slice_start; + state->current_backtrack_block = + conditional->current_backtrack_block; + state->current_backtrack_block->count = + conditional->backtrack_count; + + /* Restore the groups and repeats and certain flags. 
*/ + if (conditional->has_repeats) + pop_repeats(state); + + if (conditional->has_groups) + pop_groups(state); + + state->too_few_errors = bt_data->lookaround.too_few_errors; + state->capture_change = bt_data->lookaround.capture_change; + + if (bt_data->lookaround.node->match) { + /* It's a positive lookaround that's failed. + * + * Go to the 'false' branch. + */ + node = bt_data->lookaround.node->nonstring.next_2.node; + } else { + /* It's a negative lookaround that's failed. + * + * Go to the 'true' branch. + */ + node = bt_data->lookaround.node->nonstring.next_2.node; + } + + discard_backtrack(state); + + goto advance; + } else { + /* Backtracked to a lookaround. If it's a positive lookaround + * that succeeded, we need to restore the groups; if it's a + * negative lookaround that failed, it would have completely + * backtracked inside and already restored the groups. We also + * need to restore certain flags. + */ + if (bt_data->lookaround.node->match) + pop_groups(state); + + state->too_few_errors = bt_data->lookaround.too_few_errors; + state->capture_change = bt_data->lookaround.capture_change; + + discard_backtrack(state); + } + break; + } + case RE_OP_END_FUZZY: /* End of fuzzy matching. */ + TRACE(("%s\n", re_op_text[bt_data->op])) + + state->total_fuzzy_counts[RE_FUZZY_SUB] -= + state->fuzzy_info.counts[RE_FUZZY_SUB]; + state->total_fuzzy_counts[RE_FUZZY_INS] -= + state->fuzzy_info.counts[RE_FUZZY_INS]; + state->total_fuzzy_counts[RE_FUZZY_DEL] -= + state->fuzzy_info.counts[RE_FUZZY_DEL]; + + /* We need to retry the fuzzy match. */ + status = retry_fuzzy_insert(safe_state, &state->text_pos, &node); + if (status < 0) + return RE_ERROR_PARTIAL; + + /* If there were too few errors, in the fuzzy section, try again. 
+ */ + if (state->too_few_errors) { + state->too_few_errors = FALSE; + goto backtrack; + } + + if (node) { + state->total_fuzzy_counts[RE_FUZZY_SUB] += + state->fuzzy_info.counts[RE_FUZZY_SUB]; + state->total_fuzzy_counts[RE_FUZZY_INS] += + state->fuzzy_info.counts[RE_FUZZY_INS]; + state->total_fuzzy_counts[RE_FUZZY_DEL] += + state->fuzzy_info.counts[RE_FUZZY_DEL]; + + node = node->next_1.node; + goto advance; + } + break; + case RE_OP_END_GROUP: /* End of a capture group. */ + { + RE_CODE private_index; + RE_GroupData* group; + TRACE(("%s %d\n", re_op_text[bt_data->op], + bt_data->group.public_index)) + + private_index = bt_data->group.private_index; + group = &state->groups[private_index - 1]; + + /* Unsave the capture? */ + if (bt_data->group.capture) + unsave_capture(state, bt_data->group.private_index, + bt_data->group.public_index); + + if (pattern->group_info[private_index - 1].referenced && + group->span.end != bt_data->group.text_pos) + --state->capture_change; + group->span.end = bt_data->group.text_pos; + group->current_capture = bt_data->group.current_capture; + + discard_backtrack(state); + break; + } + case RE_OP_FAILURE: + { + TRACE(("%s\n", re_op_text[bt_data->op])) + + /* Have we been looking for a POSIX match? */ + if (state->found_match) { + restore_best_match(safe_state); + return RE_OP_SUCCESS; + } + + /* Do we have to advance? */ + if (!search) + return RE_ERROR_FAILURE; + + /* Can we advance? */ + state->text_pos = state->match_pos; + + if (state->reverse) { + if (state->text_pos <= state->slice_start) + return RE_ERROR_FAILURE; + } else { + if (state->text_pos >= state->slice_end) + return RE_ERROR_FAILURE; + } + + /* Skip over any repeated leading characters. */ + switch (start_node->op) { + case RE_OP_GREEDY_REPEAT_ONE: + case RE_OP_LAZY_REPEAT_ONE: + { + size_t count; + BOOL is_partial; + + /* How many characters did the repeat actually match? 
*/ + count = count_one(state, start_node->nonstring.next_2.node, + state->text_pos, start_node->values[2], &is_partial); + + /* If it's fewer than the maximum then skip over those + * characters. + */ + if (count < start_node->values[2]) + state->text_pos += (Py_ssize_t)count * pattern_step; + break; + } + } + + /* Advance and try to match again. e also need to check whether we + * need to skip. + */ + if (state->reverse) { + if (state->text_pos > state->slice_end) + state->text_pos = state->slice_end; + else + --state->text_pos; + } else { + if (state->text_pos < state->slice_start) + state->text_pos = state->slice_start; + else + ++state->text_pos; + } + + /* Clear the groups. */ + clear_groups(state); + + goto start_match; + } + case RE_OP_FUZZY: /* Fuzzy matching. */ + { + RE_FuzzyInfo* fuzzy_info; + TRACE(("%s\n", re_op_text[bt_data->op])) + + /* Restore the previous fuzzy info. */ + fuzzy_info = &state->fuzzy_info; + memmove(fuzzy_info, &bt_data->fuzzy.fuzzy_info, + sizeof(RE_FuzzyInfo)); + + discard_backtrack(state); + break; + } + case RE_OP_GREEDY_REPEAT: /* Greedy repeat. */ + case RE_OP_LAZY_REPEAT: /* Lazy repeat. */ + { + RE_RepeatData* rp_data; + TRACE(("%s\n", re_op_text[bt_data->op])) + + /* The repeat failed to match. */ + rp_data = &state->repeats[bt_data->repeat.index]; + + /* The body may have failed to match at this position. */ + if (!guard_repeat(safe_state, bt_data->repeat.index, + bt_data->repeat.text_pos, RE_STATUS_BODY, TRUE)) + return RE_ERROR_MEMORY; + + /* Restore the previous repeat. */ + rp_data->count = bt_data->repeat.count; + rp_data->start = bt_data->repeat.start; + rp_data->capture_change = bt_data->repeat.capture_change; + + discard_backtrack(state); + break; + } + case RE_OP_GREEDY_REPEAT_ONE: /* Greedy repeat for one character. 
*/ + { + RE_RepeatData* rp_data; + size_t count; + Py_ssize_t step; + Py_ssize_t pos; + Py_ssize_t limit; + RE_Node* test; + BOOL match; + BOOL m; + size_t index; + TRACE(("%s\n", re_op_text[bt_data->op])) + + node = bt_data->repeat.position.node; + + rp_data = &state->repeats[bt_data->repeat.index]; + + /* Unmatch one character at a time until the tail could match or we + * have reached the minimum. + */ + state->text_pos = rp_data->start; + + count = rp_data->count; + step = node->step; + pos = state->text_pos + (Py_ssize_t)count * step; + limit = state->text_pos + (Py_ssize_t)node->values[1] * step; + + /* The tail failed to match at this position. */ + if (!guard_repeat(safe_state, bt_data->repeat.index, pos, + RE_STATUS_TAIL, TRUE)) + return RE_ERROR_MEMORY; + + /* A (*SKIP) might have change the size of the slice. */ + if (step > 0) { + if (limit < state->slice_start) + limit = state->slice_start; + } else { + if (limit > state->slice_end) + limit = state->slice_end; + } + + if (pos == limit) { + /* We've backtracked the repeat as far as we can. */ + rp_data->start = bt_data->repeat.text_pos; + rp_data->count = bt_data->repeat.count; + discard_backtrack(state); + break; + } + + test = node->next_1.test; + + m = test->match; + index = node->values[0]; + + match = FALSE; + + if (test->status & RE_STATUS_FUZZY) { + for (;;) { + int status; + RE_Position next_position; + + pos -= step; + + status = try_match(state, &node->next_1, pos, + &next_position); + if (status < 0) + return status; + + if (status != RE_ERROR_FAILURE && + !is_repeat_guarded(safe_state, index, pos, + RE_STATUS_TAIL)) { + match = TRUE; + break; + } + + if (pos == limit) + break; + } + } else { + /* A repeated single-character match is often followed by a + * literal, so checking specially for it can be a good + * optimisation when working with long strings. 
+ */ + switch (test->op) { + case RE_OP_CHARACTER: + { + Py_UCS4 ch; + + ch = test->values[0]; + + for (;;) { + --pos; + + if (same_char(char_at(state->text, pos), ch) == m && + !is_repeat_guarded(safe_state, index, pos, + RE_STATUS_TAIL)) { + match = TRUE; + break; + } + + if (pos == limit) + break; + + } + break; + } + case RE_OP_CHARACTER_IGN: + { + Py_UCS4 ch; + + ch = test->values[0]; + + for (;;) { + --pos; + + if (same_char_ign(encoding, locale_info, + char_at(state->text, pos), ch) == m && + !is_repeat_guarded(safe_state, index, pos, + RE_STATUS_TAIL)) { + match = TRUE; + break; + } + + if (pos == limit) + break; + + } + break; + } + case RE_OP_CHARACTER_IGN_REV: + { + Py_UCS4 ch; + + ch = test->values[0]; + + for (;;) { + ++pos; + + if (same_char_ign(encoding, locale_info, + char_at(state->text, pos - 1), ch) == m && + !is_repeat_guarded(safe_state, index, pos, + RE_STATUS_TAIL)) { + match = TRUE; + break; + } + + if (pos == limit) + break; + + } + break; + } + case RE_OP_CHARACTER_REV: + { + Py_UCS4 ch; + + ch = test->values[0]; + + for (;;) { + ++pos; + + if (same_char(char_at(state->text, pos - 1), ch) == m + && !is_repeat_guarded(safe_state, index, pos, + RE_STATUS_TAIL)) { + match = TRUE; + break; + } + + if (pos == limit) + break; + + } + break; + } + case RE_OP_STRING: + { + Py_ssize_t length; + + length = (Py_ssize_t)test->value_count; + + /* The tail is a string. We don't want to go off the end of + * the slice. 
+ */ + pos = min_ssize_t(pos - 1, state->slice_end - length); + + for (;;) { + Py_ssize_t found; + BOOL is_partial; + + if (pos < limit) + break; + + found = string_search_rev(safe_state, test, pos + + length, limit, &is_partial); + if (is_partial) + return RE_ERROR_PARTIAL; + + if (found < 0) + break; + + pos = found - length; + + if (!is_repeat_guarded(safe_state, index, pos, + RE_STATUS_TAIL)) { + match = TRUE; + break; + } + + --pos; + } + break; + } + case RE_OP_STRING_FLD: + { + int (*full_case_fold)(RE_LocaleInfo* locale_info, Py_UCS4 + ch, Py_UCS4* folded); + Py_ssize_t folded_length; + size_t i; + Py_UCS4 folded[RE_MAX_FOLDED]; + + full_case_fold = encoding->full_case_fold; + + folded_length = 0; + for (i = 0; i < test->value_count; i++) + folded_length += full_case_fold(locale_info, + test->values[i], folded); + + /* The tail is a string. We don't want to go off the end of + * the slice. + */ + pos = min_ssize_t(pos - 1, state->slice_end - + folded_length); + + for (;;) { + Py_ssize_t found; + Py_ssize_t new_pos; + BOOL is_partial; + + if (pos < limit) + break; + + found = string_search_fld_rev(safe_state, test, pos + + folded_length, limit, &new_pos, &is_partial); + if (is_partial) + return RE_ERROR_PARTIAL; + + if (found < 0) + break; + + pos = found - folded_length; + + if (!is_repeat_guarded(safe_state, index, pos, + RE_STATUS_TAIL)) { + match = TRUE; + break; + } + + --pos; + } + break; + } + case RE_OP_STRING_FLD_REV: + { + int (*full_case_fold)(RE_LocaleInfo* locale_info, Py_UCS4 + ch, Py_UCS4* folded); + Py_ssize_t folded_length; + size_t i; + Py_UCS4 folded[RE_MAX_FOLDED]; + + full_case_fold = encoding->full_case_fold; + + folded_length = 0; + for (i = 0; i < test->value_count; i++) + folded_length += full_case_fold(locale_info, + test->values[i], folded); + + /* The tail is a string. We don't want to go off the end of + * the slice. 
+ */ + pos = max_ssize_t(pos + 1, state->slice_start + + folded_length); + + for (;;) { + Py_ssize_t found; + Py_ssize_t new_pos; + BOOL is_partial; + + if (pos > limit) + break; + + found = string_search_fld(safe_state, test, pos - + folded_length, limit, &new_pos, &is_partial); + if (is_partial) + return RE_ERROR_PARTIAL; + + if (found < 0) + break; + + pos = found + folded_length; + + if (!is_repeat_guarded(safe_state, index, pos, + RE_STATUS_TAIL)) { + match = TRUE; + break; + } + + ++pos; + } + break; + } + case RE_OP_STRING_IGN: + { + Py_ssize_t length; + + length = (Py_ssize_t)test->value_count; + + /* The tail is a string. We don't want to go off the end of + * the slice. + */ + pos = min_ssize_t(pos - 1, state->slice_end - length); + + for (;;) { + Py_ssize_t found; + BOOL is_partial; + + if (pos < limit) + break; + + found = string_search_ign_rev(safe_state, test, pos + + length, limit, &is_partial); + if (is_partial) + return RE_ERROR_PARTIAL; + + if (found < 0) + break; + + pos = found - length; + + if (!is_repeat_guarded(safe_state, index, pos, + RE_STATUS_TAIL)) { + match = TRUE; + break; + } + + --pos; + } + break; + } + case RE_OP_STRING_IGN_REV: + { + Py_ssize_t length; + + length = (Py_ssize_t)test->value_count; + + /* The tail is a string. We don't want to go off the end of + * the slice. + */ + pos = max_ssize_t(pos + 1, state->slice_start + length); + + for (;;) { + Py_ssize_t found; + BOOL is_partial; + + if (pos > limit) + break; + + found = string_search_ign(safe_state, test, pos - + length, limit, &is_partial); + if (is_partial) + return RE_ERROR_PARTIAL; + + if (found < 0) + break; + + pos = found + length; + + if (!is_repeat_guarded(safe_state, index, pos, + RE_STATUS_TAIL)) { + match = TRUE; + break; + } + + ++pos; + } + break; + } + case RE_OP_STRING_REV: + { + Py_ssize_t length; + + length = (Py_ssize_t)test->value_count; + + /* The tail is a string. We don't want to go off the end of + * the slice. 
+ */ + pos = max_ssize_t(pos + 1, state->slice_start + length); + + for (;;) { + Py_ssize_t found; + BOOL is_partial; + + if (pos > limit) + break; + + found = string_search(safe_state, test, pos - length, + limit, &is_partial); + if (is_partial) + return RE_ERROR_PARTIAL; + + if (found < 0) + break; + + pos = found + length; + + if (!is_repeat_guarded(safe_state, index, pos, + RE_STATUS_TAIL)) { + match = TRUE; + break; + } + + ++pos; + } + break; + } + default: + for (;;) { + RE_Position next_position; + + pos -= step; + + status = try_match(state, &node->next_1, pos, + &next_position); + if (status < 0) + return status; + + if (status == RE_ERROR_SUCCESS && + !is_repeat_guarded(safe_state, index, pos, + RE_STATUS_TAIL)) { + match = TRUE; + break; + } + + if (pos == limit) + break; + } + break; + } + } + + if (match) { + count = (size_t)abs_ssize_t(pos - state->text_pos); + + /* The tail could match. */ + if (count > node->values[1]) + /* The match is longer than the minimum, so we might need + * to backtrack the repeat again to consume less. + */ + rp_data->count = count; + else { + /* We've reached or passed the minimum, so we won't need to + * backtrack the repeat again. + */ + rp_data->start = bt_data->repeat.text_pos; + rp_data->count = bt_data->repeat.count; + discard_backtrack(state); + + /* Have we passed the minimum? */ + if (count < node->values[1]) + goto backtrack; + } + + node = node->next_1.node; + state->text_pos = pos; + goto advance; + } else { + /* Don't try this repeated match again. */ + if (step > 0) { + if (!guard_repeat_range(safe_state, bt_data->repeat.index, + limit, pos, RE_STATUS_BODY, TRUE)) + return RE_ERROR_MEMORY; + } else if (step < 0) { + if (!guard_repeat_range(safe_state, bt_data->repeat.index, + pos, limit, RE_STATUS_BODY, TRUE)) + return RE_ERROR_MEMORY; + } + + /* We've backtracked the repeat as far as we can. 
*/ + rp_data->start = bt_data->repeat.text_pos; + rp_data->count = bt_data->repeat.count; + discard_backtrack(state); + } + break; + } + case RE_OP_GROUP_RETURN: /* Group return. */ + { + RE_Node* return_node; + TRACE(("%s\n", re_op_text[bt_data->op])) + + return_node = bt_data->group_call.node; + + push_group_return(safe_state, return_node); + + if (return_node) { + /* Restore the groups. */ + pop_groups(state); + state->capture_change = bt_data->group_call.capture_change; + + /* Restore the repeats. */ + pop_repeats(state); + } + + discard_backtrack(state); + break; + } + case RE_OP_KEEP: /* Keep. */ + { + state->match_pos = bt_data->keep.match_pos; + discard_backtrack(state); + break; + } + case RE_OP_LAZY_REPEAT_ONE: /* Lazy repeat for one character. */ + { + RE_RepeatData* rp_data; + size_t count; + Py_ssize_t step; + Py_ssize_t pos; + Py_ssize_t available; + size_t max_count; + Py_ssize_t limit; + RE_Node* repeated; + RE_Node* test; + BOOL match; + BOOL m; + size_t index; + TRACE(("%s\n", re_op_text[bt_data->op])) + + node = bt_data->repeat.position.node; + + rp_data = &state->repeats[bt_data->repeat.index]; + + /* Match one character at a time until the tail could match or we + * have reached the maximum. + */ + state->text_pos = rp_data->start; + count = rp_data->count; + + step = node->step; + pos = state->text_pos + (Py_ssize_t)count * step; + available = step > 0 ? 
state->slice_end - state->text_pos : + state->text_pos - state->slice_start; + max_count = min_size_t((size_t)available, node->values[2]); + limit = state->text_pos + (Py_ssize_t)max_count * step; + + repeated = node->nonstring.next_2.node; + + test = node->next_1.test; + + m = test->match; + index = node->values[0]; + + match = FALSE; + + if (test->status & RE_STATUS_FUZZY) { + for (;;) { + RE_Position next_position; + + status = match_one(state, repeated, pos); + if (status < 0) + return status; + + if (status == RE_ERROR_FAILURE) + break; + + pos += step; + + status = try_match(state, &node->next_1, pos, + &next_position); + if (status < 0) + return status; + + if (status == RE_ERROR_SUCCESS && + !is_repeat_guarded(safe_state, index, pos, + RE_STATUS_TAIL)) { + match = TRUE; + break; + } + + if (pos == limit) + break; + } + } else { + /* A repeated single-character match is often followed by a + * literal, so checking specially for it can be a good + * optimisation when working with long strings. + */ + switch (test->op) { + case RE_OP_CHARACTER: + { + Py_UCS4 ch; + + ch = test->values[0]; + + /* The tail is a character. We don't want to go off the end + * of the slice. + */ + limit = min_ssize_t(limit, state->slice_end - 1); + + for (;;) { + if (pos >= state->text_length && state->partial_side == + RE_PARTIAL_RIGHT) + return RE_ERROR_PARTIAL; + + if (pos >= limit) + break; + + status = match_one(state, repeated, pos); + if (status < 0) + return status; + + if (status == RE_ERROR_FAILURE) + break; + + ++pos; + + if (same_char(char_at(state->text, pos), ch) == m && + !is_repeat_guarded(safe_state, index, pos, + RE_STATUS_TAIL)) { + match = TRUE; + break; + } + } + break; + } + case RE_OP_CHARACTER_IGN: + { + Py_UCS4 ch; + + ch = test->values[0]; + + /* The tail is a character. We don't want to go off the end + * of the slice. 
+ */ + limit = min_ssize_t(limit, state->slice_end - 1); + + for (;;) { + if (pos >= state->text_length && state->partial_side == + RE_PARTIAL_RIGHT) + return RE_ERROR_PARTIAL; + + if (pos >= limit) + break; + + status = match_one(state, repeated, pos); + if (status < 0) + return status; + + if (status == RE_ERROR_FAILURE) + break; + + ++pos; + + if (same_char_ign(encoding, locale_info, + char_at(state->text, pos), ch) == m && + !is_repeat_guarded(safe_state, index, pos, + RE_STATUS_TAIL)) { + match = TRUE; + break; + } + } + break; + } + case RE_OP_CHARACTER_IGN_REV: + { + Py_UCS4 ch; + + ch = test->values[0]; + + /* The tail is a character. We don't want to go off the end + * of the slice. + */ + limit = max_ssize_t(limit, state->slice_start + 1); + + for (;;) { + if (pos <= 0 && state->partial_side == RE_PARTIAL_LEFT) + return RE_ERROR_PARTIAL; + + if (pos <= limit) + break; + + status = match_one(state, repeated, pos); + if (status < 0) + return status; + + if (status == RE_ERROR_FAILURE) + break; + + --pos; + + if (same_char_ign(encoding, locale_info, + char_at(state->text, pos - 1), ch) == m && + !is_repeat_guarded(safe_state, index, pos, + RE_STATUS_TAIL)) { + match = TRUE; + break; + } + } + break; + } + case RE_OP_CHARACTER_REV: + { + Py_UCS4 ch; + + ch = test->values[0]; + + /* The tail is a character. We don't want to go off the end + * of the slice. 
+ */ + limit = max_ssize_t(limit, state->slice_start + 1); + + for (;;) { + if (pos <= 0 && state->partial_side == RE_PARTIAL_LEFT) + return RE_ERROR_PARTIAL; + + if (pos <= limit) + break; + + status = match_one(state, repeated, pos); + if (status < 0) + return status; + + if (status == RE_ERROR_FAILURE) + break; + + --pos; + + if (same_char(char_at(state->text, pos - 1), ch) == m + && !is_repeat_guarded(safe_state, index, pos, + RE_STATUS_TAIL)) { + match = TRUE; + break; + } + } + break; + } + case RE_OP_STRING: + { + Py_ssize_t length; + + length = (Py_ssize_t)test->value_count; + + /* The tail is a string. We don't want to go off the end of + * the slice. + */ + limit = min_ssize_t(limit, state->slice_end - length); + + for (;;) { + Py_ssize_t found; + BOOL is_partial; + + if (pos >= state->text_length && state->partial_side == + RE_PARTIAL_RIGHT) + return RE_ERROR_PARTIAL; + + if (pos >= limit) + break; + + /* Look for the tail string. */ + found = string_search(safe_state, test, pos + 1, limit + + length, &is_partial); + if (is_partial) + return RE_ERROR_PARTIAL; + + if (found < 0) + break; + + if (repeated->op == RE_OP_ANY_ALL) + /* Anything can precede the tail. */ + pos = found; + else { + /* Check that what precedes the tail will match. */ + while (pos != found) { + status = match_one(state, repeated, pos); + if (status < 0) + return status; + + if (status == RE_ERROR_FAILURE) + break; + + ++pos; + } + + if (pos != found) + /* Something preceding the tail didn't match. + */ + break; + } + + if (!is_repeat_guarded(safe_state, index, pos, + RE_STATUS_TAIL)) { + match = TRUE; + break; + } + } + break; + } + case RE_OP_STRING_FLD: + { + /* The tail is a string. We don't want to go off the end of + * the slice. 
+ */ + limit = min_ssize_t(limit, state->slice_end); + + for (;;) { + Py_ssize_t found; + Py_ssize_t new_pos; + BOOL is_partial; + + if (pos >= state->text_length && state->partial_side == + RE_PARTIAL_RIGHT) + return RE_ERROR_PARTIAL; + + if (pos >= limit) + break; + + /* Look for the tail string. */ + found = string_search_fld(safe_state, test, pos + 1, + limit, &new_pos, &is_partial); + if (is_partial) + return RE_ERROR_PARTIAL; + + if (found < 0) + break; + + if (repeated->op == RE_OP_ANY_ALL) + /* Anything can precede the tail. */ + pos = found; + else { + /* Check that what precedes the tail will match. */ + while (pos != found) { + status = match_one(state, repeated, pos); + if (status < 0) + return status; + + if (status == RE_ERROR_FAILURE) + break; + + ++pos; + } + + if (pos != found) + /* Something preceding the tail didn't match. + */ + break; + } + + if (!is_repeat_guarded(safe_state, index, pos, + RE_STATUS_TAIL)) { + match = TRUE; + break; + } + } + break; + } + case RE_OP_STRING_FLD_REV: + { + /* The tail is a string. We don't want to go off the end of + * the slice. + */ + limit = max_ssize_t(limit, state->slice_start); + + for (;;) { + Py_ssize_t found; + Py_ssize_t new_pos; + BOOL is_partial; + + if (pos <= 0 && state->partial_side == RE_PARTIAL_LEFT) + return RE_ERROR_PARTIAL; + + if (pos <= limit) + break; + + /* Look for the tail string. */ + found = string_search_fld_rev(safe_state, test, pos - + 1, limit, &new_pos, &is_partial); + if (is_partial) + return RE_ERROR_PARTIAL; + + if (found < 0) + break; + + if (repeated->op == RE_OP_ANY_ALL) + /* Anything can precede the tail. */ + pos = found; + else { + /* Check that what precedes the tail will match. */ + while (pos != found) { + status = match_one(state, repeated, pos); + if (status < 0) + return status; + + if (status == RE_ERROR_FAILURE) + break; + + --pos; + } + + if (pos != found) + /* Something preceding the tail didn't match. 
+ */ + break; + } + + if (!is_repeat_guarded(safe_state, index, pos, + RE_STATUS_TAIL)) { + match = TRUE; + break; + } + } + break; + } + case RE_OP_STRING_IGN: + { + Py_ssize_t length; + + length = (Py_ssize_t)test->value_count; + + /* The tail is a string. We don't want to go off the end of + * the slice. + */ + limit = min_ssize_t(limit, state->slice_end - length); + + for (;;) { + Py_ssize_t found; + BOOL is_partial; + + if (pos >= state->text_length && state->partial_side == + RE_PARTIAL_RIGHT) + return RE_ERROR_PARTIAL; + + if (pos >= limit) + break; + + /* Look for the tail string. */ + found = string_search_ign(safe_state, test, pos + 1, + limit + length, &is_partial); + if (is_partial) + return RE_ERROR_PARTIAL; + + if (found < 0) + break; + + if (repeated->op == RE_OP_ANY_ALL) + /* Anything can precede the tail. */ + pos = found; + else { + /* Check that what precedes the tail will match. */ + while (pos != found) { + status = match_one(state, repeated, pos); + if (status < 0) + return status; + + if (status == RE_ERROR_FAILURE) + break; + + ++pos; + } + + if (pos != found) + /* Something preceding the tail didn't match. + */ + break; + } + + if (!is_repeat_guarded(safe_state, index, pos, + RE_STATUS_TAIL)) { + match = TRUE; + break; + } + } + break; + } + case RE_OP_STRING_IGN_REV: + { + Py_ssize_t length; + + length = (Py_ssize_t)test->value_count; + + /* The tail is a string. We don't want to go off the end of + * the slice. + */ + limit = max_ssize_t(limit, state->slice_start + length); + + for (;;) { + Py_ssize_t found; + BOOL is_partial; + + if (pos <= 0 && state->partial_side == RE_PARTIAL_LEFT) + return RE_ERROR_PARTIAL; + + if (pos <= limit) + break; + + /* Look for the tail string. */ + found = string_search_ign_rev(safe_state, test, pos - + 1, limit - length, &is_partial); + if (is_partial) + return RE_ERROR_PARTIAL; + + if (found < 0) + break; + + if (repeated->op == RE_OP_ANY_ALL) + /* Anything can precede the tail. 
*/ + pos = found; + else { + /* Check that what precedes the tail will match. */ + while (pos != found) { + status = match_one(state, repeated, pos); + if (status < 0) + return status; + + if (status == RE_ERROR_FAILURE) + break; + + --pos; + } + + if (pos != found) + /* Something preceding the tail didn't match. + */ + break; + } + + if (!is_repeat_guarded(safe_state, index, pos, + RE_STATUS_TAIL)) { + match = TRUE; + break; + } + } + break; + } + case RE_OP_STRING_REV: + { + Py_ssize_t length; + + length = (Py_ssize_t)test->value_count; + + /* The tail is a string. We don't want to go off the end of + * the slice. + */ + limit = max_ssize_t(limit, state->slice_start + length); + + for (;;) { + Py_ssize_t found; + BOOL is_partial; + + if (pos <= 0 && state->partial_side == RE_PARTIAL_LEFT) + return RE_ERROR_PARTIAL; + + if (pos <= limit) + break; + + /* Look for the tail string. */ + found = string_search_rev(safe_state, test, pos - 1, + limit - length, &is_partial); + if (is_partial) + return RE_ERROR_PARTIAL; + + if (found < 0) + break; + + if (repeated->op == RE_OP_ANY_ALL) + /* Anything can precede the tail. */ + pos = found; + else { + /* Check that what precedes the tail will match. */ + while (pos != found) { + status = match_one(state, repeated, pos); + if (status < 0) + return status; + + if (status == RE_ERROR_FAILURE) + break; + + --pos; + } + + if (pos != found) + /* Something preceding the tail didn't match. 
+ */ + break; + } + + if (!is_repeat_guarded(safe_state, index, pos, + RE_STATUS_TAIL)) { + match = TRUE; + break; + } + } + break; + } + default: + for (;;) { + RE_Position next_position; + + status = match_one(state, repeated, pos); + if (status < 0) + return status; + + if (status == RE_ERROR_FAILURE) + break; + + pos += step; + + status = try_match(state, &node->next_1, pos, + &next_position); + if (status < 0) + return RE_ERROR_PARTIAL; + + if (status == RE_ERROR_SUCCESS && + !is_repeat_guarded(safe_state, index, pos, + RE_STATUS_TAIL)) { + match = TRUE; + break; + } + + if (pos == limit) + break; + } + break; + } + } + + if (match) { + /* The tail could match. */ + count = (size_t)abs_ssize_t(pos - state->text_pos); + state->text_pos = pos; + + if (count < max_count) { + /* The match is shorter than the maximum, so we might need + * to backtrack the repeat again to consume more. + */ + rp_data->count = count; + } else { + /* We've reached or passed the maximum, so we won't need to + * backtrack the repeat again. + */ + rp_data->start = bt_data->repeat.text_pos; + rp_data->count = bt_data->repeat.count; + discard_backtrack(state); + + /* Have we passed the maximum? */ + if (count > max_count) + goto backtrack; + } + + node = node->next_1.node; + goto advance; + } else { + /* The tail couldn't match. */ + rp_data->start = bt_data->repeat.text_pos; + rp_data->count = bt_data->repeat.count; + discard_backtrack(state); + } + break; + } + case RE_OP_LOOKAROUND: /* Lookaround subpattern. */ + { + TRACE(("%s\n", re_op_text[bt_data->op])) + + if (bt_data->lookaround.inside) { + /* Backtracked to the start of a lookaround. 
*/ + RE_AtomicData* lookaround; + + lookaround = pop_atomic(safe_state); + state->text_pos = lookaround->text_pos; + state->slice_end = lookaround->slice_end; + state->slice_start = lookaround->slice_start; + state->current_backtrack_block = + lookaround->current_backtrack_block; + state->current_backtrack_block->count = + lookaround->backtrack_count; + + /* Restore the groups and repeats and certain flags. */ + if (lookaround->has_repeats) + pop_repeats(state); + + if (lookaround->has_groups) + pop_groups(state); + + state->too_few_errors = bt_data->lookaround.too_few_errors; + state->capture_change = bt_data->lookaround.capture_change; + + if (bt_data->lookaround.node->match) { + /* It's a positive lookaround that's failed. */ + discard_backtrack(state); + } else { + /* It's a negative lookaround that's failed. Record that + * we've now left the lookaround and continue to the + * following node. + */ + bt_data->lookaround.inside = FALSE; + node = bt_data->lookaround.node->nonstring.next_2.node; + goto advance; + } + } else { + /* Backtracked to a lookaround. If it's a positive lookaround + * that succeeded, we need to restore the groups; if it's a + * negative lookaround that failed, it would have completely + * backtracked inside and already restored the groups. We also + * need to restore certain flags. + */ + if (bt_data->lookaround.node->match && + (bt_data->lookaround.node->status & RE_STATUS_HAS_GROUPS)) + pop_groups(state); + + state->too_few_errors = bt_data->lookaround.too_few_errors; + state->capture_change = bt_data->lookaround.capture_change; + + discard_backtrack(state); + } + break; + } + case RE_OP_MATCH_BODY: + { + RE_RepeatData* rp_data; + TRACE(("%s %d\n", re_op_text[bt_data->op], bt_data->repeat.index)) + + /* We want to match the body. */ + rp_data = &state->repeats[bt_data->repeat.index]; + + /* Restore the repeat info. 
*/ + rp_data->count = bt_data->repeat.count; + rp_data->start = bt_data->repeat.start; + rp_data->capture_change = bt_data->repeat.capture_change; + + /* Record backtracking info in case the body fails to match. */ + bt_data->op = RE_OP_BODY_START; + + /* Advance into the body. */ + node = bt_data->repeat.position.node; + state->text_pos = bt_data->repeat.position.text_pos; + goto advance; + } + case RE_OP_MATCH_TAIL: + { + RE_RepeatData* rp_data; + TRACE(("%s %d\n", re_op_text[bt_data->op], bt_data->repeat.index)) + + /* We want to match the tail. */ + rp_data = &state->repeats[bt_data->repeat.index]; + + /* Restore the repeat info. */ + rp_data->count = bt_data->repeat.count; + rp_data->start = bt_data->repeat.start; + rp_data->capture_change = bt_data->repeat.capture_change; + + /* Advance into the tail. */ + node = bt_data->repeat.position.node; + state->text_pos = bt_data->repeat.position.text_pos; + + discard_backtrack(state); + goto advance; + } + case RE_OP_REF_GROUP: /* Reference to a capture group. */ + case RE_OP_REF_GROUP_IGN: /* Reference to a capture group, ignoring case. */ + case RE_OP_REF_GROUP_IGN_REV: /* Reference to a capture group, backwards, ignoring case. */ + case RE_OP_REF_GROUP_REV: /* Reference to a capture group, backwards. */ + case RE_OP_STRING: /* A string. */ + case RE_OP_STRING_IGN: /* A string, ignoring case. */ + case RE_OP_STRING_IGN_REV: /* A string, backwards, ignoring case. */ + case RE_OP_STRING_REV: /* A string, backwards. */ + { + BOOL matched; + TRACE(("%s\n", re_op_text[bt_data->op])) + + status = retry_fuzzy_match_string(safe_state, search, + &state->text_pos, &node, &string_pos, &matched); + if (status < 0) + return RE_ERROR_PARTIAL; + + if (matched) + goto advance; + + string_pos = -1; + break; + } + case RE_OP_REF_GROUP_FLD: /* Reference to a capture group, ignoring case. */ + case RE_OP_REF_GROUP_FLD_REV: /* Reference to a capture group, backwards, ignoring case. 
*/
+        {
+            BOOL matched;
+            TRACE(("%s\n", re_op_text[bt_data->op]))
+
+            status = retry_fuzzy_match_group_fld(safe_state, search,
+              &state->text_pos, &node, &folded_pos, &string_pos, &gfolded_pos,
+              &matched);
+            if (status < 0)
+                return RE_ERROR_PARTIAL;
+
+            if (matched)
+                goto advance;
+
+            string_pos = -1;
+            break;
+        }
+        case RE_OP_START_GROUP: /* Start of a capture group. */
+        {
+            RE_CODE private_index;
+            RE_GroupData* group;
+            TRACE(("%s %d\n", re_op_text[bt_data->op],
+              bt_data->group.public_index))
+
+            private_index = bt_data->group.private_index;
+            group = &state->groups[private_index - 1];
+
+            /* Unsave the capture? */
+            if (bt_data->group.capture)
+                unsave_capture(state, bt_data->group.private_index,
+                  bt_data->group.public_index);
+
+            if (pattern->group_info[private_index - 1].referenced &&
+              group->span.start != bt_data->group.text_pos)
+                --state->capture_change;
+            group->span.start = bt_data->group.text_pos;
+            group->current_capture = bt_data->group.current_capture;
+
+            discard_backtrack(state);
+            break;
+        }
+        case RE_OP_STRING_FLD: /* A string, ignoring case. */
+        case RE_OP_STRING_FLD_REV: /* A string, backwards, ignoring case. */
+        {
+            BOOL matched;
+            TRACE(("%s\n", re_op_text[bt_data->op]))
+
+            status = retry_fuzzy_match_string_fld(safe_state, search,
+              &state->text_pos, &node, &string_pos, &folded_pos, &matched);
+            if (status < 0)
+                return RE_ERROR_PARTIAL;
+
+            if (matched)
+                goto advance;
+
+            string_pos = -1;
+            break;
+        }
+        default:
+            TRACE(("UNKNOWN OP %d\n", bt_data->op))
+            return RE_ERROR_ILLEGAL;
+        }
+    }
+}
+
+/* Saves group data for fuzzy matching.
+ *
+ * Copies the span and the capture list of each of the pattern's
+ * true_group_count groups from state->groups into saved_groups. If
+ * saved_groups is NULL a zero-filled array is allocated first; otherwise the
+ * array passed in is reused and a capture buffer is only reallocated when it
+ * is too small for the group's current capture_count.
+ *
+ * Returns the array holding the copy, or NULL if an allocation fails. On
+ * failure every capture buffer in the array and the array itself are freed,
+ * including an array that the caller passed in, so the caller must not use
+ * its old pointer afterwards.
+ *
+ * The GIL is acquired on entry and released again on both exit paths.
+ */
+Py_LOCAL_INLINE(RE_GroupData*) save_groups(RE_SafeState* safe_state,
+  RE_GroupData* saved_groups) {
+    RE_State* state;
+    PatternObject* pattern;
+    size_t g;
+
+    /* Re-acquire the GIL. */
+    acquire_GIL(safe_state);
+
+    state = safe_state->re_state;
+    pattern = state->pattern;
+
+    if (!saved_groups) {
+        /* First save: create the array. */
+        saved_groups = (RE_GroupData*)re_alloc(pattern->true_group_count *
+          sizeof(RE_GroupData));
+        if (!saved_groups)
+            goto error;
+
+        /* Zero-fill so that every captures pointer starts out NULL and every
+         * capture_capacity starts out 0; the loop below and the error path
+         * both rely on that.
+         */
+        memset(saved_groups, 0, pattern->true_group_count *
+          sizeof(RE_GroupData));
+    }
+
+    for (g = 0; g < pattern->true_group_count; g++) {
+        RE_GroupData* orig;
+        RE_GroupData* copy;
+
+        orig = &state->groups[g];
+        copy = &saved_groups[g];
+
+        copy->span = orig->span;
+
+        /* Grow the copy's capture buffer only if it is too small. */
+        if (orig->capture_count > copy->capture_capacity) {
+            RE_GroupSpan* cap_copy;
+
+            cap_copy = (RE_GroupSpan*)re_realloc(copy->captures,
+              orig->capture_count * sizeof(RE_GroupSpan));
+            if (!cap_copy)
+                goto error;
+
+            copy->capture_capacity = orig->capture_count;
+            copy->captures = cap_copy;
+        }
+
+        copy->capture_count = orig->capture_count;
+        Py_MEMCPY(copy->captures, orig->captures, orig->capture_count *
+          sizeof(RE_GroupSpan));
+    }
+
+    /* Release the GIL. */
+    release_GIL(safe_state);
+
+    return saved_groups;
+
+error:
+    /* Free whatever has been built so far. saved_groups is still NULL here
+     * if it was the array allocation itself that failed.
+     */
+    if (saved_groups) {
+        for (g = 0; g < pattern->true_group_count; g++)
+            re_dealloc(saved_groups[g].captures);
+
+        re_dealloc(saved_groups);
+    }
+
+    /* Release the GIL. */
+    release_GIL(safe_state);
+
+    return NULL;
+}
+
+/* Restores group data for fuzzy matching.
+ *
+ * Frees the capture buffers currently referenced by state->groups, then
+ * copies the saved RE_GroupData structs over state->groups, so the state now
+ * refers to the saved capture buffers. Only the saved_groups array itself is
+ * freed; saved_groups must not be used after this call.
+ *
+ * The GIL is acquired on entry and released before returning.
+ */
+Py_LOCAL_INLINE(void) restore_groups(RE_SafeState* safe_state, RE_GroupData*
+  saved_groups) {
+    RE_State* state;
+    PatternObject* pattern;
+    size_t g;
+
+    /* Re-acquire the GIL. */
+    acquire_GIL(safe_state);
+
+    state = safe_state->re_state;
+    pattern = state->pattern;
+
+    /* Drop the buffers that are about to be overwritten. */
+    for (g = 0; g < pattern->true_group_count; g++)
+        re_dealloc(state->groups[g].captures);
+
+    /* A struct copy: the captures pointers are carried over, not cloned. */
+    Py_MEMCPY(state->groups, saved_groups, pattern->true_group_count *
+      sizeof(RE_GroupData));
+
+    re_dealloc(saved_groups);
+
+    /* Release the GIL. */
+    release_GIL(safe_state);
+}
+
+/* Discards group data for fuzzy matching. 
*/ +Py_LOCAL_INLINE(void) discard_groups(RE_SafeState* safe_state, RE_GroupData* + saved_groups) { + RE_State* state; + PatternObject* pattern; + size_t g; + + /* Re-acquire the GIL. */ + acquire_GIL(safe_state); + + state = safe_state->re_state; + pattern = state->pattern; + + for (g = 0; g < pattern->true_group_count; g++) + re_dealloc(saved_groups[g].captures); + + re_dealloc(saved_groups); + + /* Release the GIL. */ + release_GIL(safe_state); +} + +/* Saves the fuzzy info. */ +Py_LOCAL_INLINE(void) save_fuzzy_counts(RE_State* state, size_t* fuzzy_counts) + { + Py_MEMCPY(fuzzy_counts, state->total_fuzzy_counts, + sizeof(state->total_fuzzy_counts)); +} + +/* Restores the fuzzy info. */ +Py_LOCAL_INLINE(void) restore_fuzzy_counts(RE_State* state, size_t* + fuzzy_counts) { + Py_MEMCPY(state->total_fuzzy_counts, fuzzy_counts, + sizeof(state->total_fuzzy_counts)); +} + +/* Makes the list of best matches found so far. */ +Py_LOCAL_INLINE(void) make_best_list(RE_BestList* best_list) { + best_list->capacity = 0; + best_list->count = 0; + best_list->entries = NULL; +} + +/* Clears the list of best matches found so far. */ +Py_LOCAL_INLINE(void) clear_best_list(RE_BestList* best_list) { + best_list->count = 0; +} + +/* Adds a new entry to the list of best matches found so far. */ +Py_LOCAL_INLINE(BOOL) add_to_best_list(RE_SafeState* safe_state, RE_BestList* + best_list, Py_ssize_t match_pos, Py_ssize_t text_pos) { + RE_BestEntry* entry; + + if (best_list->count >= best_list->capacity) { + RE_BestEntry* new_entries; + + best_list->capacity = best_list->capacity == 0 ? 16 : + best_list->capacity * 2; + new_entries = safe_realloc(safe_state, best_list->entries, + best_list->capacity * sizeof(RE_BestEntry)); + if (!new_entries) + return FALSE; + + best_list->entries = new_entries; + } + + entry = &best_list->entries[best_list->count++]; + entry->match_pos = match_pos; + entry->text_pos = text_pos; + + return TRUE; +} + +/* Destroy the list of best matches found so far. 
*/ +Py_LOCAL_INLINE(void) destroy_best_list(RE_SafeState* safe_state, RE_BestList* + best_list) { + if (best_list->entries) + safe_dealloc(safe_state, best_list->entries); +} + +/* Performs a match or search from the current text position for a best fuzzy + * match. + */ +Py_LOCAL_INLINE(int) do_best_fuzzy_match(RE_SafeState* safe_state, BOOL search) + { + RE_State* state; + Py_ssize_t available; + int step; + size_t fewest_errors; + BOOL must_advance; + BOOL found_match; + RE_BestList best_list; + Py_ssize_t start_pos; + int status; + TRACE(("<<do_best_fuzzy_match>>\n")) + + state = safe_state->re_state; + + if (state->reverse) { + available = state->text_pos - state->slice_start; + step = -1; + } else { + available = state->slice_end - state->text_pos; + step = 1; + } + + /* The maximum permitted cost. */ + state->max_errors = PY_SSIZE_T_MAX; + fewest_errors = PY_SSIZE_T_MAX; + + state->best_text_pos = state->reverse ? state->slice_start : + state->slice_end; + + must_advance = state->must_advance; + found_match = FALSE; + + make_best_list(&best_list); + + /* Search the text for the best match. */ + start_pos = state->text_pos; + while (state->slice_start <= start_pos && start_pos <= state->slice_end) { + state->text_pos = start_pos; + state->must_advance = must_advance; + + /* Initialise the state. */ + init_match(state); + + status = RE_ERROR_SUCCESS; + if (state->max_errors == 0 && state->partial_side == RE_PARTIAL_NONE) { + /* An exact match, and partial matches not permitted. */ + if (available < state->min_width || (available == 0 && + state->must_advance)) + status = RE_ERROR_FAILURE; + } + + if (status == RE_ERROR_SUCCESS) + status = basic_match(safe_state, search); + + /* Has an error occurred, or is it a partial match? */ + if (status < 0) + break; + + if (status == RE_ERROR_SUCCESS) { + /* It was a successful match. */ + found_match = TRUE; + + if (state->total_errors < fewest_errors) { + /* This match was better than any of the previous ones. 
*/ + fewest_errors = state->total_errors; + + if (state->total_errors == 0) + /* It was a perfect match. */ + break; + + /* Forget all the previous worse matches and remember this one. + */ + clear_best_list(&best_list); + if (!add_to_best_list(safe_state, &best_list, state->match_pos, + state->text_pos)) + return RE_ERROR_MEMORY; + } else if (state->total_errors == fewest_errors) + /* This match was as good as the previous matches. Remember + * this one. + */ + add_to_best_list(safe_state, &best_list, state->match_pos, + state->text_pos); + } + + /* Should we keep searching? */ + if (!search) + break; + + start_pos = state->match_pos + step; + } + + if (found_match) { + /* We found a match. */ + if (fewest_errors > 0) { + /* It doesn't look like a perfect match. */ + int i; + Py_ssize_t slice_start; + Py_ssize_t slice_end; + size_t error_limit; + size_t best_fuzzy_counts[RE_FUZZY_COUNT]; + RE_GroupData* best_groups; + Py_ssize_t best_match_pos; + Py_ssize_t best_text_pos; + + slice_start = state->slice_start; + slice_end = state->slice_end; + + error_limit = fewest_errors; + + if (error_limit > RE_MAX_ERRORS) + error_limit = RE_MAX_ERRORS; + + best_groups = NULL; + + /* Look again at the best of the matches that we've seen. */ + for (i = 0; i < best_list.count; i++) { + RE_BestEntry* entry; + Py_ssize_t max_offset; + Py_ssize_t offset; + + /* Look for the best fit at this position. */ + entry = &best_list.entries[i]; + + if (search) { + max_offset = state->reverse ? 
entry->match_pos - + state->slice_start : state->slice_end - entry->match_pos; + + if (max_offset > (Py_ssize_t)fewest_errors) + max_offset = (Py_ssize_t)fewest_errors; + + if (max_offset > (Py_ssize_t)error_limit) + max_offset = (Py_ssize_t)error_limit; + } else + max_offset = 0; + + start_pos = entry->match_pos; + offset = 0; + + while (offset <= max_offset) { + state->max_errors = 1; + + while (state->max_errors <= error_limit) { + state->text_pos = start_pos; + init_match(state); + status = basic_match(safe_state, FALSE); + + if (status == RE_ERROR_SUCCESS) { + BOOL better; + + if (state->total_errors < error_limit || i == 0 && + offset == 0) + better = TRUE; + else if (state->total_errors == error_limit) + /* The cost is as low as the current best, but + * is it earlier? + */ + better = state->reverse ? state->match_pos > + best_match_pos : state->match_pos < + best_match_pos; + + if (better) { + save_fuzzy_counts(state, best_fuzzy_counts); + + best_groups = save_groups(safe_state, + best_groups); + if (!best_groups) { + destroy_best_list(safe_state, &best_list); + return RE_ERROR_MEMORY; + } + + best_match_pos = state->match_pos; + best_text_pos = state->text_pos; + error_limit = state->total_errors; + } + + break; + } + + ++state->max_errors; + } + + start_pos += step; + ++offset; + } + + if (status == RE_ERROR_SUCCESS && state->total_errors == 0) + break; + } + + if (best_groups) { + status = RE_ERROR_SUCCESS; + state->match_pos = best_match_pos; + state->text_pos = best_text_pos; + + restore_groups(safe_state, best_groups); + restore_fuzzy_counts(state, best_fuzzy_counts); + } else { + /* None of the "best" matches could be improved on, so pick the + * first. + */ + RE_BestEntry* entry; + + /* Look at only the part of the string around the match. 
*/ + entry = &best_list.entries[0]; + + if (state->reverse) { + state->slice_start = entry->text_pos; + state->slice_end = entry->match_pos; + } else { + state->slice_start = entry->match_pos; + state->slice_end = entry->text_pos; + } + + /* We'll expand the part that we're looking at to take to + * compensate for any matching errors that have occurred. + */ + if (state->slice_start - slice_start >= + (Py_ssize_t)fewest_errors) + state->slice_start -= (Py_ssize_t)fewest_errors; + else + state->slice_start = slice_start; + + if (slice_end - state->slice_end >= (Py_ssize_t)fewest_errors) + state->slice_end += (Py_ssize_t)fewest_errors; + else + state->slice_end = slice_end; + + state->max_errors = fewest_errors; + state->text_pos = entry->match_pos; + init_match(state); + status = basic_match(safe_state, search); + } + + state->slice_start = slice_start; + state->slice_end = slice_end; + } + } + + destroy_best_list(safe_state, &best_list); + + return status; +} + +/* Performs a match or search from the current text position for an enhanced + * fuzzy match. + */ +Py_LOCAL_INLINE(int) do_enhanced_fuzzy_match(RE_SafeState* safe_state, BOOL + search) { + RE_State* state; + PatternObject* pattern; + Py_ssize_t available; + size_t fewest_errors; + RE_GroupData* best_groups; + Py_ssize_t best_match_pos; + BOOL must_advance; + Py_ssize_t slice_start; + Py_ssize_t slice_end; + int status; + size_t best_fuzzy_counts[RE_FUZZY_COUNT]; + Py_ssize_t best_text_pos = 0; /* Initialise to stop compiler warning. */ + TRACE(("<<do_enhanced_fuzzy_match>>\n")) + + state = safe_state->re_state; + pattern = state->pattern; + + if (state->reverse) + available = state->text_pos - state->slice_start; + else + available = state->slice_end - state->text_pos; + + /* The maximum permitted cost. */ + state->max_errors = PY_SSIZE_T_MAX; + fewest_errors = PY_SSIZE_T_MAX; + + best_groups = NULL; + + state->best_match_pos = state->text_pos; + state->best_text_pos = state->reverse ? 
state->slice_start : + state->slice_end; + + best_match_pos = state->text_pos; + must_advance = state->must_advance; + + slice_start = state->slice_start; + slice_end = state->slice_end; + + for (;;) { + /* If there's a better match, it won't start earlier in the string than + * the current best match, so there's no need to start earlier than + * that match. + */ + state->must_advance = must_advance; + + /* Initialise the state. */ + init_match(state); + + status = RE_ERROR_SUCCESS; + if (state->max_errors == 0 && state->partial_side == RE_PARTIAL_NONE) { + /* An exact match, and partial matches not permitted. */ + if (available < state->min_width || (available == 0 && + state->must_advance)) + status = RE_ERROR_FAILURE; + } + + if (status == RE_ERROR_SUCCESS) + status = basic_match(safe_state, search); + + /* Has an error occurred, or is it a partial match? */ + if (status < 0) + break; + + if (status == RE_ERROR_SUCCESS) { + BOOL better; + + better = state->total_errors < fewest_errors; + + if (better) { + BOOL same_match; + + fewest_errors = state->total_errors; + state->max_errors = fewest_errors; + + save_fuzzy_counts(state, best_fuzzy_counts); + + same_match = state->match_pos == best_match_pos && + state->text_pos == best_text_pos; + same_match = FALSE; + + if (best_groups) { + size_t g; + + /* Did we get the same match as the best so far? */ + for (g = 0; same_match && g < pattern->public_group_count; + g++) { + same_match = state->groups[g].span.start == + best_groups[g].span.start && + state->groups[g].span.end == best_groups[g].span.end; + } + } + + /* Save the best result so far. 
*/ + best_groups = save_groups(safe_state, best_groups); + if (!best_groups) { + status = RE_ERROR_MEMORY; + break; + } + + best_match_pos = state->match_pos; + best_text_pos = state->text_pos; + + if (same_match || state->total_errors == 0) + break; + + state->max_errors = state->total_errors; + if (state->max_errors < RE_MAX_ERRORS) + --state->max_errors; + } else + break; + + if (state->reverse) { + state->slice_start = state->text_pos; + state->slice_end = state->match_pos; + } else { + state->slice_start = state->match_pos; + state->slice_end = state->text_pos; + } + + state->text_pos = state->match_pos; + + if (state->max_errors == PY_SSIZE_T_MAX) + state->max_errors = 0; + } else + break; + } + + state->slice_start = slice_start; + state->slice_end = slice_end; + + if (best_groups) { + if (status == RE_ERROR_SUCCESS && state->total_errors == 0) + /* We have a perfect match, so the previous best match. */ + discard_groups(safe_state, best_groups); + else { + /* Restore the previous best match. */ + status = RE_ERROR_SUCCESS; + + state->match_pos = best_match_pos; + state->text_pos = best_text_pos; + + restore_groups(safe_state, best_groups); + restore_fuzzy_counts(state, best_fuzzy_counts); + } + } + + return status; +} + +/* Performs a match or search from the current text position for a simple fuzzy + * match. + */ +Py_LOCAL_INLINE(int) do_simple_fuzzy_match(RE_SafeState* safe_state, BOOL + search) { + RE_State* state; + Py_ssize_t available; + int status; + TRACE(("<<do_simple_fuzzy_match>>\n")) + + state = safe_state->re_state; + + if (state->reverse) + available = state->text_pos - state->slice_start; + else + available = state->slice_end - state->text_pos; + + /* The maximum permitted cost. */ + state->max_errors = PY_SSIZE_T_MAX; + + state->best_match_pos = state->text_pos; + state->best_text_pos = state->reverse ? state->slice_start : + state->slice_end; + + /* Initialise the state. 
*/ + init_match(state); + + status = RE_ERROR_SUCCESS; + if (state->max_errors == 0 && state->partial_side == RE_PARTIAL_NONE) { + /* An exact match, and partial matches not permitted. */ + if (available < state->min_width || (available == 0 && + state->must_advance)) + status = RE_ERROR_FAILURE; + } + + if (status == RE_ERROR_SUCCESS) + status = basic_match(safe_state, search); + + return status; +} + +/* Performs a match or search from the current text position for an exact + * match. + */ +Py_LOCAL_INLINE(int) do_exact_match(RE_SafeState* safe_state, BOOL search) { + RE_State* state; + Py_ssize_t available; + int status; + TRACE(("<<do_exact_match>>\n")) + + state = safe_state->re_state; + + if (state->reverse) + available = state->text_pos - state->slice_start; + else + available = state->slice_end - state->text_pos; + + /* The maximum permitted cost. */ + state->max_errors = 0; + + state->best_match_pos = state->text_pos; + state->best_text_pos = state->reverse ? state->slice_start : + state->slice_end; + + /* Initialise the state. */ + init_match(state); + + status = RE_ERROR_SUCCESS; + if (state->max_errors == 0 && state->partial_side == RE_PARTIAL_NONE) { + /* An exact match, and partial matches not permitted. */ + if (available < state->min_width || (available == 0 && + state->must_advance)) + status = RE_ERROR_FAILURE; + } + + if (status == RE_ERROR_SUCCESS) + status = basic_match(safe_state, search); + + return status; +} + +/* Performs a match or search from the current text position. + * + * The state can sometimes be shared across threads. In such instances there's + * a lock (mutex) on it. The lock is held for the duration of matching. + */ +Py_LOCAL_INLINE(int) do_match(RE_SafeState* safe_state, BOOL search) { + RE_State* state; + PatternObject* pattern; + int status; + TRACE(("<<do_match>>\n")) + + state = safe_state->re_state; + pattern = state->pattern; + + /* Is there enough to search? 
*/ + if (state->reverse) { + if (state->text_pos < state->slice_start) + return FALSE; + } else { + if (state->text_pos > state->slice_end) + return FALSE; + } + + /* Release the GIL. */ + release_GIL(safe_state); + + if (pattern->is_fuzzy) { + if (pattern->flags & RE_FLAG_BESTMATCH) + status = do_best_fuzzy_match(safe_state, search); + else if (pattern->flags & RE_FLAG_ENHANCEMATCH) + status = do_enhanced_fuzzy_match(safe_state, search); + else + status = do_simple_fuzzy_match(safe_state, search); + } else + status = do_exact_match(safe_state, search); + + if (status == RE_ERROR_SUCCESS || status == RE_ERROR_PARTIAL) { + Py_ssize_t max_end_index; + RE_GroupInfo* group_info; + size_t g; + + /* Store the results. */ + state->lastindex = -1; + state->lastgroup = -1; + max_end_index = -1; + + if (status == RE_ERROR_PARTIAL) { + /* We've matched up to the limit of the slice. */ + if (state->reverse) + state->text_pos = state->slice_start; + else + state->text_pos = state->slice_end; + } + + /* Store the capture groups. */ + group_info = pattern->group_info; + + for (g = 0; g < pattern->public_group_count; g++) { + RE_GroupSpan* span; + + span = &state->groups[g].span; + /* The string positions are of type Py_ssize_t, so the format needs + * to specify that. + */ + TRACE(("group %d from %" PY_FORMAT_SIZE_T "d to %" PY_FORMAT_SIZE_T + "d\n", g + 1, span->start, span->end)) + + if (span->start >= 0 && span->end >= 0 && group_info[g].end_index > + max_end_index) { + max_end_index = group_info[g].end_index; + state->lastindex = (Py_ssize_t)g + 1; + if (group_info[g].has_name) + state->lastgroup = (Py_ssize_t)g + 1; + } + } + } + + /* Re-acquire the GIL. */ + acquire_GIL(safe_state); + + if (status < 0 && status != RE_ERROR_PARTIAL && !PyErr_Occurred()) + set_error(status, NULL); + + return status; +} + +/* Gets a string from a Python object. 
/* Gets a string from a Python object.
 *
 * Fills in str_info with a pointer to the character data, the length in
 * characters, the character size in bytes and whether the text is Unicode.
 *
 * Returns TRUE on success. Returns FALSE with a Python exception set
 * (TypeError or ValueError) if the object is not a Unicode string and does
 * not expose a usable buffer.
 *
 * If the function returns true and str_info->should_release is true then it's
 * the responsibility of the caller to release the buffer when it's no longer
 * needed.
 */
Py_LOCAL_INLINE(BOOL) get_string(PyObject* string, RE_StringInfo* str_info) {
    /* Given a Python object, return a data pointer, a length (in characters),
     * and a character size. Return FALSE if the object is not a string (or not
     * compatible).
     */
    PyBufferProcs* buffer;
    Py_ssize_t bytes;
    Py_ssize_t size;

    /* Unicode objects do not support the buffer API. So, get the data directly
     * instead.
     */
    if (PyUnicode_Check(string)) {
        /* Unicode strings don't always support the buffer interface, so no
         * buffer is acquired and there is nothing to release later.
         */
        str_info->characters = (void*)PyUnicode_AS_DATA(string);
        str_info->length = PyUnicode_GET_SIZE(string);
        str_info->charsize = sizeof(Py_UNICODE);
        str_info->is_unicode = TRUE;
        str_info->should_release = FALSE;
        return TRUE;
    }

    /* Get pointer to string buffer. */
#if PY_VERSION_HEX >= 0x02060000
    buffer = Py_TYPE(string)->tp_as_buffer;
    /* Mark the view as unfilled until bf_getbuffer succeeds. */
    str_info->view.len = -1;
#else
    buffer = string->ob_type->tp_as_buffer;
#endif

    if (!buffer) {
        PyErr_SetString(PyExc_TypeError, "expected string or buffer");
        return FALSE;
    }

    /* NOTE: the 'else' below dangles across the #endif, so that on Python 2.6+
     * the old-style check is only tried if the new-style one did not succeed.
     */
#if PY_VERSION_HEX >= 0x02060000
    if (buffer->bf_getbuffer && (*buffer->bf_getbuffer)(string,
      &str_info->view, PyBUF_SIMPLE) >= 0)
        /* It's a new-style buffer. */
        str_info->should_release = TRUE;
    else
#endif
    if (buffer->bf_getreadbuffer && buffer->bf_getsegcount &&
      buffer->bf_getsegcount(string, NULL) == 1)
        /* It's an old-style buffer (single segment only). */
        str_info->should_release = FALSE;
    else {
        PyErr_SetString(PyExc_TypeError, "expected string or buffer");
        return FALSE;
    }

    /* Determine buffer size. */
#if PY_VERSION_HEX >= 0x02060000
    if (str_info->should_release) {
        /* It's a new-style buffer. */
        bytes = str_info->view.len;
        str_info->characters = str_info->view.buf;

        if (str_info->characters == NULL) {
            /* The view was acquired, so it must be released before failing. */
            PyBuffer_Release(&str_info->view);
            PyErr_SetString(PyExc_ValueError, "buffer is NULL");
            return FALSE;
        }
    } else
#endif
        /* It's an old-style buffer. */
        bytes = buffer->bf_getreadbuffer(string, 0, &str_info->characters);

    if (bytes < 0) {
#if PY_VERSION_HEX >= 0x02060000
        if (str_info->should_release)
            PyBuffer_Release(&str_info->view);
#endif
        PyErr_SetString(PyExc_TypeError, "buffer has negative size");
        return FALSE;
    }

    /* Determine character size. Only 1-byte characters are accepted here: the
     * object must be a bytestring or have as many items as it has bytes.
     */
    size = PyObject_Size(string);

    if (PyString_Check(string) || bytes == size)
        str_info->charsize = 1;
    else {
#if PY_VERSION_HEX >= 0x02060000
        if (str_info->should_release)
            PyBuffer_Release(&str_info->view);
#endif
        PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
        return FALSE;
    }

    str_info->length = size;
    str_info->is_unicode = FALSE;

    return TRUE;
}

/* Deallocates the groups storage.
 *
 * Frees the captures buffer of each of the group_count entries and then the
 * array itself. Does nothing if groups is NULL.
 */
Py_LOCAL_INLINE(void) dealloc_groups(RE_GroupData* groups, size_t group_count)
  {
    size_t g;

    if (!groups)
        return;

    for (g = 0; g < group_count; g++)
        re_dealloc(groups[g].captures);

    re_dealloc(groups);
}
*/ +Py_LOCAL_INLINE(BOOL) state_init_2(RE_State* state, PatternObject* pattern, + PyObject* string, RE_StringInfo* str_info, Py_ssize_t start, Py_ssize_t end, + BOOL overlapped, int concurrent, BOOL partial, BOOL use_lock, BOOL + visible_captures, BOOL match_all) { + int i; + Py_ssize_t final_pos; + + state->groups = NULL; + state->best_match_groups = NULL; + state->repeats = NULL; + state->visible_captures = visible_captures; + state->match_all = match_all; + state->backtrack_block.previous = NULL; + state->backtrack_block.next = NULL; + state->backtrack_block.capacity = RE_BACKTRACK_BLOCK_SIZE; + state->backtrack_allocated = RE_BACKTRACK_BLOCK_SIZE; + state->current_atomic_block = NULL; + state->first_saved_groups = NULL; + state->current_saved_groups = NULL; + state->first_saved_repeats = NULL; + state->current_saved_repeats = NULL; + state->lock = NULL; + state->fuzzy_guards = NULL; + state->first_group_call_frame = NULL; + state->current_group_call_frame = NULL; + state->group_call_guard_list = NULL; + state->req_pos = -1; + + /* The call guards used by recursive patterns. */ + if (pattern->call_ref_info_count > 0) { + state->group_call_guard_list = + (RE_GuardList*)re_alloc(pattern->call_ref_info_count * + sizeof(RE_GuardList)); + if (!state->group_call_guard_list) + goto error; + memset(state->group_call_guard_list, 0, pattern->call_ref_info_count * + sizeof(RE_GuardList)); + } + + /* The capture groups. 
*/ + if (pattern->true_group_count) { + size_t g; + + if (pattern->groups_storage) { + state->groups = pattern->groups_storage; + pattern->groups_storage = NULL; + } else { + state->groups = (RE_GroupData*)re_alloc(pattern->true_group_count * + sizeof(RE_GroupData)); + if (!state->groups) + goto error; + memset(state->groups, 0, pattern->true_group_count * + sizeof(RE_GroupData)); + + for (g = 0; g < pattern->true_group_count; g++) { + RE_GroupSpan* captures; + + captures = (RE_GroupSpan*)re_alloc(sizeof(RE_GroupSpan)); + if (!captures) { + size_t i; + + for (i = 0; i < g; i++) + re_dealloc(state->groups[i].captures); + + goto error; + } + + state->groups[g].captures = captures; + state->groups[g].capture_capacity = 1; + } + } + } + + /* Adjust boundaries. */ + if (start < 0) + start += str_info->length; + if (start < 0) + start = 0; + else if (start > str_info->length) + start = str_info->length; + + if (end < 0) + end += str_info->length; + if (end < 0) + end = 0; + else if (end > str_info->length) + end = str_info->length; + + state->overlapped = overlapped; + state->min_width = pattern->min_width; + + /* Initialise the getters and setters for the character size. */ + state->charsize = str_info->charsize; + state->is_unicode = str_info->is_unicode; + +#if PY_VERSION_HEX >= 0x02060000 + /* Are we using a buffer object? If so, we need to copy the info. 
*/ + state->should_release = str_info->should_release; + if (state->should_release) + state->view = str_info->view; + +#endif + switch (state->charsize) { + case 1: + state->char_at = bytes1_char_at; + state->set_char_at = bytes1_set_char_at; + state->point_to = bytes1_point_to; + break; + case 2: + state->char_at = bytes2_char_at; + state->set_char_at = bytes2_set_char_at; + state->point_to = bytes2_point_to; + break; + case 4: + state->char_at = bytes4_char_at; + state->set_char_at = bytes4_set_char_at; + state->point_to = bytes4_point_to; + break; + default: + goto error; + } + + state->encoding = pattern->encoding; + state->locale_info = pattern->locale_info; + + /* The state object contains a reference to the string and also a pointer + * to its contents. + * + * The documentation says that the end of the slice behaves like the end of + * the string. + */ + state->text = str_info->characters; + state->text_length = end; + + state->reverse = (pattern->flags & RE_FLAG_REVERSE) != 0; + if (partial) + state->partial_side = state->reverse ? RE_PARTIAL_LEFT : + RE_PARTIAL_RIGHT; + else + state->partial_side = RE_PARTIAL_NONE; + + state->slice_start = start; + state->slice_end = state->text_length; + state->text_pos = state->reverse ? state->slice_end : state->slice_start; + + /* Point to the final newline and line separator if it's at the end of the + * string, otherwise just -1. + */ + state->final_newline = -1; + state->final_line_sep = -1; + final_pos = state->text_length - 1; + if (final_pos >= 0) { + Py_UCS4 ch; + + ch = state->char_at(state->text, final_pos); + if (ch == 0x0A) { + /* The string ends with LF. */ + state->final_newline = final_pos; + state->final_line_sep = final_pos; + + /* Does the string end with CR/LF? */ + --final_pos; + if (final_pos >= 0 && state->char_at(state->text, final_pos) == + 0x0D) + state->final_line_sep = final_pos; + } else { + /* The string doesn't end with LF, but it could be another kind of + * line separator. 
+ */ + if (state->encoding->is_line_sep(ch)) + state->final_line_sep = final_pos; + } + } + + /* If the 'new' behaviour is enabled then split correctly on zero-width + * matches. + */ + state->version_0 = (pattern->flags & RE_FLAG_VERSION1) == 0; + state->must_advance = FALSE; + + state->pattern = pattern; + state->string = string; + + if (pattern->repeat_count) { + if (pattern->repeats_storage) { + state->repeats = pattern->repeats_storage; + pattern->repeats_storage = NULL; + } else { + state->repeats = (RE_RepeatData*)re_alloc(pattern->repeat_count * + sizeof(RE_RepeatData)); + if (!state->repeats) + goto error; + memset(state->repeats, 0, pattern->repeat_count * + sizeof(RE_RepeatData)); + } + } + + if (pattern->fuzzy_count) { + state->fuzzy_guards = (RE_FuzzyGuards*)re_alloc(pattern->fuzzy_count * + sizeof(RE_FuzzyGuards)); + if (!state->fuzzy_guards) + goto error; + memset(state->fuzzy_guards, 0, pattern->fuzzy_count * + sizeof(RE_FuzzyGuards)); + } + + Py_INCREF(state->pattern); + Py_INCREF(state->string); + + /* Multithreading is allowed during matching when explicitly enabled or on + * immutable strings. + */ + switch (concurrent) { + case RE_CONC_NO: + state->is_multithreaded = FALSE; + break; + case RE_CONC_YES: + state->is_multithreaded = TRUE; + break; + default: + state->is_multithreaded = PyUnicode_Check(string) || + PyString_Check(string); + break; + } + + /* A state struct can sometimes be shared across threads. In such + * instances, if multithreading is enabled we need to protect the state + * with a lock (mutex) during matching. 
+ */ + if (state->is_multithreaded && use_lock) + state->lock = PyThread_allocate_lock(); + + for (i = 0; i < MAX_SEARCH_POSITIONS; i++) + state->search_positions[i].start_pos = -1; + + return TRUE; + +error: + re_dealloc(state->group_call_guard_list); + re_dealloc(state->repeats); + dealloc_groups(state->groups, pattern->true_group_count); + re_dealloc(state->fuzzy_guards); + state->repeats = NULL; + state->groups = NULL; + state->fuzzy_guards = NULL; + return FALSE; +} + +#if PY_VERSION_HEX >= 0x02060000 +/* Releases the string's buffer, if necessary. */ +Py_LOCAL_INLINE(void) release_buffer(RE_StringInfo* str_info) { + if (str_info->should_release) + PyBuffer_Release(&str_info->view); +} + +#endif +/* Initialises a state object. */ +Py_LOCAL_INLINE(BOOL) state_init(RE_State* state, PatternObject* pattern, + PyObject* string, Py_ssize_t start, Py_ssize_t end, BOOL overlapped, int + concurrent, BOOL partial, BOOL use_lock, BOOL visible_captures, BOOL + match_all) { + RE_StringInfo str_info; + + /* Get the string to search or match. */ + if (!get_string(string, &str_info)) + return FALSE; + + /* If we fail to initialise the state then we need to release the buffer if + * the string is a buffer object. + */ + if (!state_init_2(state, pattern, string, &str_info, start, end, + overlapped, concurrent, partial, use_lock, visible_captures, match_all)) + { +#if PY_VERSION_HEX >= 0x02060000 + release_buffer(&str_info); + +#endif + return FALSE; + } + + /* The state has been initialised successfully, so now the state has the + * responsibility of releasing the buffer if the string is a buffer object. + */ + return TRUE; +} + +/* Deallocates repeat data. 
/* Deallocates repeat data.
 *
 * Frees the body and tail guard-list spans of each of the repeat_count
 * entries and then the array itself. Does nothing if repeats is NULL.
 */
Py_LOCAL_INLINE(void) dealloc_repeats(RE_RepeatData* repeats, size_t
  repeat_count) {
    size_t i;

    if (!repeats)
        return;

    for (i = 0; i < repeat_count; i++) {
        re_dealloc(repeats[i].body_guard_list.spans);
        re_dealloc(repeats[i].tail_guard_list.spans);
    }

    re_dealloc(repeats);
}

/* Deallocates fuzzy guards.
 *
 * Frees the body and tail guard-list spans of each of the fuzzy_count entries
 * and then the array itself. Does nothing if guards is NULL.
 */
Py_LOCAL_INLINE(void) dealloc_fuzzy_guards(RE_FuzzyGuards* guards, size_t
  fuzzy_count) {
    size_t i;

    if (!guards)
        return;

    for (i = 0; i < fuzzy_count; i++) {
        re_dealloc(guards[i].body_guard_list.spans);
        re_dealloc(guards[i].tail_guard_list.spans);
    }

    re_dealloc(guards);
}

/* Finalises a state object, discarding its contents.
 *
 * Frees the lock, the heap-allocated backtrack and atomic blocks, the saved
 * groups/repeats, the group call frames and the guard lists; hands the groups
 * and repeats storage back to the pattern for reuse if the pattern has none
 * cached; and finally drops the references to the pattern and the string and
 * releases the buffer view if one is held.
 */
Py_LOCAL_INLINE(void) state_fini(RE_State* state) {
    RE_BacktrackBlock* current_backtrack;
    RE_AtomicBlock* current_atomic;
    PatternObject* pattern;
    RE_SavedGroups* saved_groups;
    RE_SavedRepeats* saved_repeats;
    RE_GroupCallFrame* frame;
    size_t i;

    /* Discard the lock (mutex) if there's one. */
    if (state->lock)
        PyThread_free_lock(state->lock);

    /* Deallocate the backtrack blocks. The first block is embedded in the
     * state itself, so only the blocks chained after it are freed.
     */
    current_backtrack = state->backtrack_block.next;
    while (current_backtrack) {
        RE_BacktrackBlock* next;

        next = current_backtrack->next;
        re_dealloc(current_backtrack);
        state->backtrack_allocated -= RE_BACKTRACK_BLOCK_SIZE;
        current_backtrack = next;
    }

    /* Deallocate the atomic blocks. */
    current_atomic = state->current_atomic_block;
    while (current_atomic) {
        RE_AtomicBlock* next;

        next = current_atomic->next;
        re_dealloc(current_atomic);
        current_atomic = next;
    }

    state->current_atomic_block = NULL;

    pattern = state->pattern;

    /* Deallocate the saved groups. */
    saved_groups = state->first_saved_groups;
    while (saved_groups) {
        RE_SavedGroups* next;

        next = saved_groups->next;
        re_dealloc(saved_groups->spans);
        re_dealloc(saved_groups->counts);
        re_dealloc(saved_groups);
        saved_groups = next;
    }

    /* Deallocate the saved repeats. */
    saved_repeats = state->first_saved_repeats;
    while (saved_repeats) {
        RE_SavedRepeats* next;

        next = saved_repeats->next;

        dealloc_repeats(saved_repeats->repeats, pattern->repeat_count);

        re_dealloc(saved_repeats);
        saved_repeats = next;
    }

    if (state->best_match_groups)
        dealloc_groups(state->best_match_groups, pattern->true_group_count);

    /* Recycle the groups storage: if the pattern already has some cached then
     * free ours, otherwise give ours to the pattern.
     */
    if (pattern->groups_storage)
        dealloc_groups(state->groups, pattern->true_group_count);
    else
        pattern->groups_storage = state->groups;

    /* Likewise for the repeats storage. */
    if (pattern->repeats_storage)
        dealloc_repeats(state->repeats, pattern->repeat_count);
    else
        pattern->repeats_storage = state->repeats;

    /* Deallocate the group call frames. */
    frame = state->first_group_call_frame;
    while (frame) {
        RE_GroupCallFrame* next;

        next = frame->next;

        dealloc_groups(frame->groups, pattern->true_group_count);
        dealloc_repeats(frame->repeats, pattern->repeat_count);

        re_dealloc(frame);
        frame = next;
    }

    /* NOTE(review): this loop dereferences group_call_guard_list before the
     * NULL check below; it presumably relies on the list always having been
     * allocated when call_ref_info_count > 0 - confirm.
     */
    for (i = 0; i < pattern->call_ref_info_count; i++)
        re_dealloc(state->group_call_guard_list[i].spans);

    if (state->group_call_guard_list)
        re_dealloc(state->group_call_guard_list);

    if (state->fuzzy_guards)
        dealloc_fuzzy_guards(state->fuzzy_guards, pattern->fuzzy_count);

    /* 'pattern' must not be used after its reference is dropped. */
    Py_DECREF(state->pattern);
    Py_DECREF(state->string);
#if PY_VERSION_HEX >= 0x02060000

    /* Release the buffer view if the state holds one. */
    if (state->should_release)
        PyBuffer_Release(&state->view);
#endif
}
+ * + * If the index is None then the default will be returned. + */ +Py_LOCAL_INLINE(Py_ssize_t) as_string_index(PyObject* obj, Py_ssize_t def) { + Py_ssize_t value; + + if (obj == Py_None) + return def; + + value = PyInt_AsSsize_t(obj); + if (value != -1 || !PyErr_Occurred()) + return value; + + PyErr_Clear(); + + value = PyLong_AsLong(obj); + if (value != -1 || !PyErr_Occurred()) + return value; + + set_error(RE_ERROR_INDEX, NULL); + return 0; +} + +/* Deallocates a MatchObject. */ +static void match_dealloc(PyObject* self_) { + MatchObject* self; + + self = (MatchObject*)self_; + + Py_XDECREF(self->string); + Py_XDECREF(self->substring); + Py_DECREF(self->pattern); + if (self->groups) + re_dealloc(self->groups); + Py_XDECREF(self->regs); + PyObject_DEL(self); +} + +/* Restricts a value to a range. */ +Py_LOCAL_INLINE(Py_ssize_t) limited_range(Py_ssize_t value, Py_ssize_t lower, + Py_ssize_t upper) { + if (value < lower) + return lower; + + if (value > upper) + return upper; + + return value; +} + +/* Gets a slice from a Unicode string. */ +Py_LOCAL_INLINE(PyObject*) unicode_slice(PyObject* string, Py_ssize_t start, + Py_ssize_t end) { + Py_ssize_t length; + Py_UNICODE* buffer; + + length = PyUnicode_GET_SIZE(string); + start = limited_range(start, 0, length); + end = limited_range(end, 0, length); + + buffer = PyUnicode_AsUnicode(string); + + return PyUnicode_FromUnicode(buffer + start, end - start); +} + +/* Gets a slice from a bytestring. */ +Py_LOCAL_INLINE(PyObject*) bytes_slice(PyObject* string, Py_ssize_t start, + Py_ssize_t end) { + Py_ssize_t length; + char* buffer; + + length = PyString_GET_SIZE(string); + start = limited_range(start, 0, length); + end = limited_range(end, 0, length); + + buffer = PyString_AsString(string); + + return PyString_FromStringAndSize(buffer + start, end - start); +} + +/* Gets a slice from a string, returning either a Unicode string or a + * bytestring. 
+ */ +Py_LOCAL_INLINE(PyObject*) get_slice(PyObject* string, Py_ssize_t start, + Py_ssize_t end) { + if (PyUnicode_Check(string)) + return unicode_slice(string, start, end); + + if (PyString_Check(string)) + return bytes_slice(string, start, end); + + return PySequence_GetSlice(string, start, end); +} + +/* Gets a MatchObject's group by integer index. */ +static PyObject* match_get_group_by_index(MatchObject* self, Py_ssize_t index, + PyObject* def) { + RE_GroupSpan* span; + + if (index < 0 || (size_t)index > self->group_count) { + /* Raise error if we were given a bad group number. */ + set_error(RE_ERROR_NO_SUCH_GROUP, NULL); + return NULL; + } + + if (index == 0) + return get_slice(self->substring, self->match_start - + self->substring_offset, self->match_end - self->substring_offset); + + /* Capture group indexes are 1-based (excluding group 0, which is the + * entire matched string). + */ + span = &self->groups[index - 1].span; + + if (span->start < 0 || span->end < 0) { + /* Return default value if the string or group is undefined. */ + Py_INCREF(def); + return def; + } + + return get_slice(self->substring, span->start - self->substring_offset, + span->end - self->substring_offset); +} + +/* Gets a MatchObject's start by integer index. */ +static PyObject* match_get_start_by_index(MatchObject* self, Py_ssize_t index) + { + RE_GroupSpan* span; + + if (index < 0 || (size_t)index > self->group_count) { + /* Raise error if we were given a bad group number. */ + set_error(RE_ERROR_NO_SUCH_GROUP, NULL); + return NULL; + } + + if (index == 0) + return Py_BuildValue("n", self->match_start); + + /* Capture group indexes are 1-based (excluding group 0, which is the + * entire matched string). + */ + span = &self->groups[index - 1].span; + return Py_BuildValue("n", span->start); +} + +/* Gets a MatchObject's starts by integer index. 
*/ +static PyObject* match_get_starts_by_index(MatchObject* self, Py_ssize_t index) + { + RE_GroupData* group; + PyObject* result; + PyObject* item; + size_t i; + + if (index < 0 || (size_t)index > self->group_count) { + /* Raise error if we were given a bad group number. */ + set_error(RE_ERROR_NO_SUCH_GROUP, NULL); + return NULL; + } + + if (index == 0) { + result = PyList_New(1); + if (!result) + return NULL; + + item = Py_BuildValue("n", self->match_start); + if (!item) + goto error; + + /* PyList_SET_ITEM borrows the reference. */ + PyList_SET_ITEM(result, 0, item); + + return result; + } + + /* Capture group indexes are 1-based (excluding group 0, which is the + * entire matched string). + */ + group = &self->groups[index - 1]; + + result = PyList_New((Py_ssize_t)group->capture_count); + if (!result) + return NULL; + + for (i = 0; i < group->capture_count; i++) { + item = Py_BuildValue("n", group->captures[i].start); + if (!item) + goto error; + + /* PyList_SET_ITEM borrows the reference. */ + PyList_SET_ITEM(result, i, item); + } + + return result; + +error: + Py_DECREF(result); + return NULL; +} + +/* Gets a MatchObject's end by integer index. */ +static PyObject* match_get_end_by_index(MatchObject* self, Py_ssize_t index) { + RE_GroupSpan* span; + + if (index < 0 || (size_t)index > self->group_count) { + /* Raise error if we were given a bad group number. */ + set_error(RE_ERROR_NO_SUCH_GROUP, NULL); + return NULL; + } + + if (index == 0) + return Py_BuildValue("n", self->match_end); + + /* Capture group indexes are 1-based (excluding group 0, which is the + * entire matched string). + */ + span = &self->groups[index - 1].span; + return Py_BuildValue("n", span->end); +} + +/* Gets a MatchObject's ends by integer index. 
*/ +static PyObject* match_get_ends_by_index(MatchObject* self, Py_ssize_t index) { + RE_GroupData* group; + PyObject* result; + PyObject* item; + size_t i; + + if (index < 0 || (size_t)index > self->group_count) { + /* Raise error if we were given a bad group number. */ + set_error(RE_ERROR_NO_SUCH_GROUP, NULL); + return NULL; + } + + if (index == 0) { + result = PyList_New(1); + if (!result) + return NULL; + + item = Py_BuildValue("n", self->match_end); + if (!item) + goto error; + + /* PyList_SET_ITEM borrows the reference. */ + PyList_SET_ITEM(result, 0, item); + + return result; + } + + /* Capture group indexes are 1-based (excluding group 0, which is the + * entire matched string). + */ + group = &self->groups[index - 1]; + + result = PyList_New((Py_ssize_t)group->capture_count); + if (!result) + return NULL; + + for (i = 0; i < group->capture_count; i++) { + item = Py_BuildValue("n", group->captures[i].end); + if (!item) + goto error; + + /* PyList_SET_ITEM borrows the reference. */ + PyList_SET_ITEM(result, i, item); + } + + return result; + +error: + Py_DECREF(result); + return NULL; +} + +/* Gets a MatchObject's span by integer index. */ +static PyObject* match_get_span_by_index(MatchObject* self, Py_ssize_t index) { + RE_GroupSpan* span; + + if (index < 0 || (size_t)index > self->group_count) { + /* Raise error if we were given a bad group number. */ + set_error(RE_ERROR_NO_SUCH_GROUP, NULL); + return NULL; + } + + if (index == 0) + return Py_BuildValue("nn", self->match_start, self->match_end); + + /* Capture group indexes are 1-based (excluding group 0, which is the + * entire matched string). + */ + span = &self->groups[index - 1].span; + return Py_BuildValue("nn", span->start, span->end); +} + +/* Gets a MatchObject's spans by integer index. 
/* Gets a MatchObject's spans by integer index.
 *
 * Returns a new list of (start, end) values, one per capture of the group (a
 * single-item list for group 0, the whole match), or NULL with an error set.
 */
static PyObject* match_get_spans_by_index(MatchObject* self, Py_ssize_t index)
  {
    PyObject* result;
    PyObject* item;
    RE_GroupData* group;
    size_t i;

    if (index < 0 || (size_t)index > self->group_count) {
        /* Raise error if we were given a bad group number. */
        set_error(RE_ERROR_NO_SUCH_GROUP, NULL);
        return NULL;
    }

    if (index == 0) {
        result = PyList_New(1);
        if (!result)
            return NULL;

        item = Py_BuildValue("nn", self->match_start, self->match_end);
        if (!item)
            goto error;

        /* PyList_SET_ITEM steals the reference. */
        PyList_SET_ITEM(result, 0, item);

        return result;
    }

    /* Capture group indexes are 1-based (excluding group 0, which is the
     * entire matched string).
     */
    group = &self->groups[index - 1];

    result = PyList_New((Py_ssize_t)group->capture_count);
    if (!result)
        return NULL;

    for (i = 0; i < group->capture_count; i++) {
        item = Py_BuildValue("nn", group->captures[i].start,
          group->captures[i].end);
        if (!item)
            goto error;

        /* PyList_SET_ITEM steals the reference. */
        PyList_SET_ITEM(result, i, item);
    }

    return result;

error:
    Py_DECREF(result);
    return NULL;
}

/* Gets a MatchObject's captures by integer index.
 *
 * Returns a new list of the substrings matched by each capture of the group
 * (a single-item list for group 0, the whole match), or NULL with an error
 * set.
 */
static PyObject* match_get_captures_by_index(MatchObject* self, Py_ssize_t
  index) {
    PyObject* result;
    PyObject* slice;
    RE_GroupData* group;
    size_t i;

    if (index < 0 || (size_t)index > self->group_count) {
        /* Raise error if we were given a bad group number. */
        set_error(RE_ERROR_NO_SUCH_GROUP, NULL);
        return NULL;
    }

    if (index == 0) {
        result = PyList_New(1);
        if (!result)
            return NULL;

        slice = get_slice(self->substring, self->match_start -
          self->substring_offset, self->match_end - self->substring_offset);
        if (!slice)
            goto error;

        /* PyList_SET_ITEM steals the reference. */
        PyList_SET_ITEM(result, 0, slice);

        return result;
    }

    /* Capture group indexes are 1-based (excluding group 0, which is the
     * entire matched string).
     */
    group = &self->groups[index - 1];

    result = PyList_New((Py_ssize_t)group->capture_count);
    if (!result)
        return NULL;

    for (i = 0; i < group->capture_count; i++) {
        slice = get_slice(self->substring, group->captures[i].start -
          self->substring_offset, group->captures[i].end -
          self->substring_offset);
        if (!slice)
            goto error;

        /* PyList_SET_ITEM steals the reference. */
        PyList_SET_ITEM(result, i, slice);
    }

    return result;

error:
    Py_DECREF(result);
    return NULL;
}

/* Converts a group index to an integer.
 *
 * Tries the object as an int and then as a long. On failure an RE_ERROR_INDEX
 * error is raised and -1 is returned; since -1 is also a possible value, the
 * caller has to check PyErr_Occurred() to tell the two apart.
 */
Py_LOCAL_INLINE(Py_ssize_t) as_group_index(PyObject* obj) {
    Py_ssize_t value;

    value = PyInt_AsSsize_t(obj);
    if (value != -1 || !PyErr_Occurred())
        return value;

    PyErr_Clear();

    value = PyLong_AsLong(obj);
    if (value != -1 || !PyErr_Occurred())
        return value;

    set_error(RE_ERROR_INDEX, NULL);
    return -1;
}

/* Gets a MatchObject's group index.
 *
 * The supplied index can be an integer or a string (group name) object.
 *
 * Returns the group number, or -1 if the index doesn't identify a group. No
 * Python exception is left set when -1 is returned; the caller is expected to
 * report the failure.
 */
Py_LOCAL_INLINE(Py_ssize_t) match_get_group_index(MatchObject* self, PyObject*
  index, BOOL allow_neg) {
    Py_ssize_t group;

    /* Is the index an integer? */
    group = as_group_index(index);
    if (group != -1 || !PyErr_Occurred()) {
        Py_ssize_t min_group = 0;

        /* Adjust negative indices where valid and allowed. A negative index
         * counts back from the last group and can't refer to group 0.
         */
        if (group < 0 && allow_neg) {
            group += (Py_ssize_t)self->group_count + 1;
            min_group = 1;
        }

        if (min_group <= group && (size_t)group <= self->group_count)
            return group;

        return -1;
    }

    /* The index might be a group name. */
    if (self->pattern->groupindex) {
        /* Look up the name. */
        PyErr_Clear();

        index = PyObject_GetItem(self->pattern->groupindex, index);
        if (index) {
            /* Check that we have an integer. */
            group = as_group_index(index);
            Py_DECREF(index);
            if (group != -1 || !PyErr_Occurred())
                return group;
        }
    }

    /* Discard whatever error the lookups left behind. */
    PyErr_Clear();
    return -1;
}

/* Gets a MatchObject's group by object index.
 *
 * The index must be an int, long, Unicode string or bytestring, otherwise an
 * RE_ERROR_GROUP_INDEX_TYPE error is raised. An index that doesn't resolve is
 * passed on as -1, which match_get_group_by_index() reports as a bad group.
 */
Py_LOCAL_INLINE(PyObject*) match_get_group(MatchObject* self, PyObject* index,
  PyObject* def, BOOL allow_neg) {
    /* Check that the index is an integer or a string. */
    if (PyInt_Check(index) || PyLong_Check(index) || PyUnicode_Check(index) ||
      PyString_Check(index))
        return match_get_group_by_index(self, match_get_group_index(self,
          index, allow_neg), def);

    set_error(RE_ERROR_GROUP_INDEX_TYPE, index);
    return NULL;
}

/* Gets info from a MatchObject by object index.
 *
 * Resolves the index (negative indices not allowed) and passes the resulting
 * group number to get_by_index. The index must be an int, long, Unicode
 * string or bytestring, otherwise an RE_ERROR_GROUP_INDEX_TYPE error is
 * raised.
 */
Py_LOCAL_INLINE(PyObject*) get_by_arg(MatchObject* self, PyObject* index,
  RE_GetByIndexFunc get_by_index) {
    /* Check that the index is an integer or a string. */
    if (PyInt_Check(index) || PyLong_Check(index) || PyUnicode_Check(index) ||
      PyString_Check(index))
        return get_by_index(self, match_get_group_index(self, index, FALSE));

    set_error(RE_ERROR_GROUP_INDEX_TYPE, index);
    return NULL;
}

/* MatchObject's 'group' method.
 *
 * With no arguments returns group 0, with one argument returns that group,
 * and with several arguments returns a tuple of the groups. Groups that
 * didn't take part in the match are returned as None.
 */
static PyObject* match_group(MatchObject* self, PyObject* args) {
    Py_ssize_t size;
    PyObject* result;
    Py_ssize_t i;

    size = PyTuple_GET_SIZE(args);

    switch (size) {
    case 0:
        /* group() */
        result = match_get_group_by_index(self, 0, Py_None);
        break;
    case 1:
        /* group(x). PyTuple_GET_ITEM borrows the reference. */
        result = match_get_group(self, PyTuple_GET_ITEM(args, 0), Py_None,
          FALSE);
        break;
    default:
        /* group(x, y, z, ...) */
        /* Fetch multiple items. */
        result = PyTuple_New(size);
        if (!result)
            return NULL;

        for (i = 0; i < size; i++) {
            PyObject* item;

            /* PyTuple_GET_ITEM borrows the reference. */
            item = match_get_group(self, PyTuple_GET_ITEM(args, i), Py_None,
              FALSE);
            if (!item) {
                Py_DECREF(result);
                return NULL;
            }

            /* PyTuple_SET_ITEM steals the reference. */
            PyTuple_SET_ITEM(result, i, item);
        }
        break;
    }

    return result;
}

/* Generic method for getting info from a MatchObject.
 *
 * With no arguments calls get_by_index for group 0, with one argument calls
 * it for that group, and with several arguments returns a tuple of the
 * results for each group.
 */
Py_LOCAL_INLINE(PyObject*) get_from_match(MatchObject* self, PyObject* args,
  RE_GetByIndexFunc get_by_index) {
    Py_ssize_t size;
    PyObject* result;
    Py_ssize_t i;

    size = PyTuple_GET_SIZE(args);

    switch (size) {
    case 0:
        /* get() */
        result = get_by_index(self, 0);
        break;
    case 1:
        /* get(x). PyTuple_GET_ITEM borrows the reference. */
        result = get_by_arg(self, PyTuple_GET_ITEM(args, 0), get_by_index);
        break;
    default:
        /* get(x, y, z, ...) */
        /* Fetch multiple items. */
        result = PyTuple_New(size);
        if (!result)
            return NULL;

        for (i = 0; i < size; i++) {
            PyObject* item;

            /* PyTuple_GET_ITEM borrows the reference. */
            item = get_by_arg(self, PyTuple_GET_ITEM(args, i), get_by_index);
            if (!item) {
                Py_DECREF(result);
                return NULL;
            }

            /* PyTuple_SET_ITEM steals the reference. */
            PyTuple_SET_ITEM(result, i, item);
        }
        break;
    }

    return result;
}

/* MatchObject's 'start' method. */
static PyObject* match_start(MatchObject* self, PyObject* args) {
    return get_from_match(self, args, match_get_start_by_index);
}

/* MatchObject's 'starts' method. */
static PyObject* match_starts(MatchObject* self, PyObject* args) {
    return get_from_match(self, args, match_get_starts_by_index);
}

/* MatchObject's 'end' method. */
static PyObject* match_end(MatchObject* self, PyObject* args) {
    return get_from_match(self, args, match_get_end_by_index);
}

/* MatchObject's 'ends' method. */
static PyObject* match_ends(MatchObject* self, PyObject* args) {
    return get_from_match(self, args, match_get_ends_by_index);
}

/* MatchObject's 'span' method. */
static PyObject* match_span(MatchObject* self, PyObject* args) {
    return get_from_match(self, args, match_get_span_by_index);
}
*/
+static PyObject* match_spans(MatchObject* self, PyObject* args) {
+    return get_from_match(self, args, match_get_spans_by_index);
+}
+
+/* MatchObject's 'captures' method: get_from_match with
+ * match_get_captures_by_index.
+ */
+static PyObject* match_captures(MatchObject* self, PyObject* args) {
+    return get_from_match(self, args, match_get_captures_by_index);
+}
+
+/* MatchObject's 'groups' method.
+ *
+ * Accepts one optional argument, 'default' (Py_None when omitted), which is
+ * passed to match_get_group_by_index for every group. Returns a tuple of
+ * group_count items holding groups 1..group_count, or NULL on failure.
+ */
+static PyObject* match_groups(MatchObject* self, PyObject* args, PyObject*
+  kwargs) {
+    PyObject* result;
+    size_t g;
+
+    PyObject* def = Py_None;
+    static char* kwlist[] = { "default", NULL };
+    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|O:groups", kwlist, &def))
+        return NULL;
+
+    result = PyTuple_New((Py_ssize_t)self->group_count);
+    if (!result)
+        return NULL;
+
+    /* Group 0 is the entire matched portion of the string, so the tuple
+     * starts at group 1 (hence the 'g + 1' below).
+     */
+    for (g = 0; g < self->group_count; g++) {
+        PyObject* item;
+
+        item = match_get_group_by_index(self, (Py_ssize_t)g + 1, def);
+        if (!item)
+            goto error;
+
+        /* PyTuple_SET_ITEM steals the reference to 'item'. */
+        PyTuple_SET_ITEM(result, g, item);
+    }
+
+    return result;
+
+error:
+    Py_DECREF(result);
+    return NULL;
+}
+
+/* MatchObject's 'groupdict' method.
+ *
+ * Accepts one optional argument, 'default' (Py_None when omitted). Returns a
+ * dict mapping each key of the pattern's groupindex to the value given by
+ * match_get_group for that key. If the pattern has no groupindex the dict is
+ * returned empty. Returns NULL on failure.
+ */
+static PyObject* match_groupdict(MatchObject* self, PyObject* args, PyObject*
+  kwargs) {
+    PyObject* result;
+    PyObject* keys;
+    Py_ssize_t g;
+
+    PyObject* def = Py_None;
+    static char* kwlist[] = { "default", NULL };
+    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|O:groupdict", kwlist,
+      &def))
+        return NULL;
+
+    result = PyDict_New();
+    if (!result || !self->pattern->groupindex)
+        return result;
+
+    keys = PyMapping_Keys(self->pattern->groupindex);
+    if (!keys)
+        goto failed;
+
+    for (g = 0; g < PyList_GET_SIZE(keys); g++) {
+        PyObject* key;
+        PyObject* value;
+        int status;
+
+        /* PyList_GET_ITEM borrows a reference. */
+        key = PyList_GET_ITEM(keys, g);
+        if (!key)
+            goto failed;
+
+        value = match_get_group(self, key, def, FALSE);
+        if (!value)
+            goto failed;
+
+        status = PyDict_SetItem(result, key, value);
+        Py_DECREF(value);
+        if (status < 0)
+            goto failed;
+    }
+
+    Py_DECREF(keys);
+
+    return result;
+
+failed:
+    Py_XDECREF(keys);
+    Py_DECREF(result);
+    return NULL;
+}
+
+/* MatchObject's 'capturesdict' method.
+ *
+ * Returns a dict mapping each key of the pattern's groupindex to the result
+ * of match_get_captures_by_index for that group. If the pattern has no
+ * groupindex the dict is returned empty. Returns NULL on failure, including
+ * when a key cannot be resolved to a non-negative group number.
+ */
+static PyObject* match_capturesdict(MatchObject* self) {
+    PyObject* result;
+    PyObject* keys;
+    Py_ssize_t g;
+
+    result = PyDict_New();
+    if (!result || !self->pattern->groupindex)
+        return result;
+
+    keys = PyMapping_Keys(self->pattern->groupindex);
+    if (!keys)
+        goto failed;
+
+    for (g = 0; g < PyList_GET_SIZE(keys); g++) {
+        PyObject* key;
+        Py_ssize_t group;
+        PyObject* captures;
+        int status;
+
+        /* PyList_GET_ITEM borrows a reference. */
+        key = PyList_GET_ITEM(keys, g);
+        if (!key)
+            goto failed;
+
+        group = match_get_group_index(self, key, FALSE);
+        if (group < 0)
+            goto failed;
+
+        captures = match_get_captures_by_index(self, group);
+        if (!captures)
+            goto failed;
+
+        status = PyDict_SetItem(result, key, captures);
+        Py_DECREF(captures);
+        if (status < 0)
+            goto failed;
+    }
+
+    Py_DECREF(keys);
+
+    return result;
+
+failed:
+    Py_XDECREF(keys);
+    Py_DECREF(result);
+    return NULL;
+}
+
+/* Gets a Python object by name from a named module.
+ *
+ * Imports 'module_name' and returns the result of looking up 'object_name'
+ * on it. Returns NULL if the import or the attribute lookup fails.
+ */
+Py_LOCAL_INLINE(PyObject*) get_object(char* module_name, char* object_name) {
+    PyObject* module;
+    PyObject* object;
+
+    module = PyImport_ImportModule(module_name);
+    if (!module)
+        return NULL;
+
+    object = PyObject_GetAttrString(module, object_name);
+    Py_DECREF(module);
+
+    return object;
+}
+
+/* Calls a function in a module.
*/ +Py_LOCAL_INLINE(PyObject*) call(char* module_name, char* function_name, + PyObject* args) { + PyObject* function; + PyObject* result; + + if (!args) + return NULL; + + function = get_object(module_name, function_name); + if (!function) + return NULL; + + result = PyObject_CallObject(function, args); + Py_DECREF(function); + Py_DECREF(args); + + return result; +} + +/* Gets a replacement item from the replacement list. + * + * The replacement item could be a string literal or a group. + */ +Py_LOCAL_INLINE(PyObject*) get_match_replacement(MatchObject* self, PyObject* + item, size_t group_count) { + Py_ssize_t index; + + if (PyUnicode_Check(item) || PyString_Check(item)) { + /* It's a literal, which can be added directly to the list. */ + Py_INCREF(item); + return item; + } + + /* Is it a group reference? */ + index = as_group_index(item); + if (index == -1 && PyErr_Occurred()) { + /* Not a group either! */ + set_error(RE_ERROR_REPLACEMENT, NULL); + return NULL; + } + + if (index == 0) { + /* The entire matched portion of the string. */ + return get_slice(self->substring, self->match_start - + self->substring_offset, self->match_end - self->substring_offset); + } else if (index >= 1 && (size_t)index <= group_count) { + /* A group. If it didn't match then return None instead. */ + RE_GroupData* group; + + group = &self->groups[index - 1]; + + if (group->capture_count > 0) + return get_slice(self->substring, group->span.start - + self->substring_offset, group->span.end - + self->substring_offset); + else { + Py_INCREF(Py_None); + return Py_None; + } + } else { + /* No such group. */ + set_error(RE_ERROR_NO_SUCH_GROUP, NULL); + return NULL; + } +} + +/* Initialises the join list. */ +Py_LOCAL_INLINE(void) init_join_list(JoinInfo* join_info, BOOL reversed, BOOL + is_unicode) { + join_info->list = NULL; + join_info->item = NULL; + join_info->reversed = reversed; + join_info->is_unicode = is_unicode; +} + +/* Adds an item to the join list. 
*/
+Py_LOCAL_INLINE(int) add_to_join_list(JoinInfo* join_info, PyObject* item) {
+    PyObject* new_item;
+    int status;
+
+    if (join_info->is_unicode) {
+        if (PyUnicode_Check(item)) {
+            new_item = item;
+            Py_INCREF(new_item);
+        } else {
+            new_item = PyUnicode_FromObject(item);
+            if (!new_item) {
+                set_error(RE_ERROR_NOT_UNICODE, item);
+                return RE_ERROR_NOT_UNICODE;
+            }
+        }
+    } else {
+        if (PyString_Check(item)) {
+            new_item = item;
+            Py_INCREF(new_item);
+        } else {
+            new_item = PyUnicode_FromObject(item); /* NOTE(review): the bytestring branch also converts with PyUnicode_FromObject - confirm this is intended. */
+            if (!new_item) {
+                set_error(RE_ERROR_NOT_STRING, item);
+                return RE_ERROR_NOT_STRING;
+            }
+        }
+    }
+
+    /* If the list already exists then just add the item to it. PyList_Append
+     * takes its own reference, so ours is dropped afterwards.
+     */
+    if (join_info->list) {
+        status = PyList_Append(join_info->list, new_item);
+        if (status < 0)
+            goto error;
+
+        Py_DECREF(new_item);
+        return status;
+    }
+
+    /* If we already have an item then we now have 2(!) and we need to put them
+     * into a list.
+     */
+    if (join_info->item) {
+        join_info->list = PyList_New(2);
+        if (!join_info->list) {
+            status = RE_ERROR_MEMORY;
+            goto error;
+        }
+
+        /* PyList_SET_ITEM steals the reference, so the list now owns the
+         * first item and join_info->item is cleared.
+         */
+        PyList_SET_ITEM(join_info->list, 0, join_info->item);
+        join_info->item = NULL;
+
+        /* PyList_SET_ITEM steals the reference to 'new_item'. */
+        PyList_SET_ITEM(join_info->list, 1, new_item);
+        return 0;
+    }
+
+    /* This is the first item; join_info keeps our reference to it. */
+    join_info->item = new_item;
+
+    return 0;
+
+error:
+    Py_DECREF(new_item);
+    set_error(status, NULL);
+    return status;
+}
+
+/* Clears the join list.
+ *
+ * Releases both the list and the single pending item, whichever is set. The
+ * pointers themselves are not reset.
+ */
+Py_LOCAL_INLINE(void) clear_join_list(JoinInfo* join_info) {
+    Py_XDECREF(join_info->list);
+    Py_XDECREF(join_info->item);
+}
+
+/* Joins together a list of strings for pattern_subx.
+ *
+ * With a list: optionally reverses it, joins it on an empty unicode or
+ * bytestring joiner (chosen by is_unicode), clears join_info and returns the
+ * joined result. With a single item: returns that item, handing over
+ * join_info's reference without clearing join_info. With nothing: returns a
+ * new empty unicode or bytestring.
+ */
+Py_LOCAL_INLINE(PyObject*) join_list_info(JoinInfo* join_info) {
+    /* If the list already exists then just do the join. */
+    if (join_info->list) {
+        PyObject* joiner;
+        PyObject* result;
+
+        if (join_info->reversed)
+            /* The list needs to be reversed before being joined. The return
+             * value of PyList_Reverse is not checked.
+             */
+            PyList_Reverse(join_info->list);
+
+        if (join_info->is_unicode) {
+            /* Concatenate the Unicode strings. */
+            joiner = PyUnicode_FromUnicode(NULL, 0);
+            if (!joiner) {
+                clear_join_list(join_info);
+                return NULL;
+            }
+
+            result = PyUnicode_Join(joiner, join_info->list);
+        } else {
+            joiner = PyString_FromString("");
+            if (!joiner) {
+                clear_join_list(join_info);
+                return NULL;
+            }
+
+            /* Concatenate the bytestrings. */
+            result = _PyString_Join(joiner, join_info->list);
+        }
+
+        Py_DECREF(joiner);
+        clear_join_list(join_info);
+
+        return result;
+    }
+
+    /* We have only 1 item, so we'll just return it. */
+    if (join_info->item)
+        return join_info->item;
+
+    /* There are no items, so return an empty string. */
+    if (join_info->is_unicode)
+        return PyUnicode_FromUnicode(NULL, 0);
+    else
+        return PyString_FromString("");
+}
+
+/* Checks whether a string replacement is a literal.
+ *
+ * To keep it simple we'll say that a literal is a string which can be used
+ * as-is, i.e. one that does not contain 'special_char'.
+ *
+ * Returns its length if it is a literal, otherwise -1. -1 is also returned
+ * when get_string fails or the character size is not 1, 2 or 4.
+ */
+Py_LOCAL_INLINE(Py_ssize_t) check_replacement_string(PyObject* str_replacement,
+  unsigned char special_char) {
+    RE_StringInfo str_info;
+    Py_UCS4 (*char_at)(void* text, Py_ssize_t pos);
+    Py_ssize_t pos;
+
+    if (!get_string(str_replacement, &str_info))
+        return -1;
+
+    /* Pick the character reader that matches the string's character width. */
+    switch (str_info.charsize) {
+    case 1:
+        char_at = bytes1_char_at;
+        break;
+    case 2:
+        char_at = bytes2_char_at;
+        break;
+    case 4:
+        char_at = bytes4_char_at;
+        break;
+    default:
+#if PY_VERSION_HEX >= 0x02060000
+        release_buffer(&str_info);
+#endif
+        return -1;
+    }
+
+    for (pos = 0; pos < str_info.length; pos++) {
+        if (char_at(str_info.characters, pos) == special_char) {
+#if PY_VERSION_HEX >= 0x02060000
+            release_buffer(&str_info);
+
+#endif
+            return -1;
+        }
+    }
+
+#if PY_VERSION_HEX >= 0x02060000
+    release_buffer(&str_info);
+
+#endif
+    return str_info.length;
+}
+
+/* MatchObject's 'expand' method.
*/ +static PyObject* match_expand(MatchObject* self, PyObject* str_template) { + Py_ssize_t literal_length; + PyObject* replacement; + JoinInfo join_info; + Py_ssize_t size; + Py_ssize_t i; + + /* Is the template just a literal? */ + literal_length = check_replacement_string(str_template, '\\'); + if (literal_length >= 0) { + /* It's a literal. */ + Py_INCREF(str_template); + return str_template; + } + + /* Hand the template to the template compiler. */ + replacement = call(RE_MODULE, "_compile_replacement_helper", + PyTuple_Pack(2, self->pattern, str_template)); + if (!replacement) + return NULL; + + init_join_list(&join_info, FALSE, PyUnicode_Check(self->string)); + + /* Add each part of the template to the list. */ + size = PyList_GET_SIZE(replacement); + for (i = 0; i < size; i++) { + PyObject* item; + PyObject* str_item; + + /* PyList_GET_ITEM borrows a reference. */ + item = PyList_GET_ITEM(replacement, i); + str_item = get_match_replacement(self, item, self->group_count); + if (!str_item) + goto error; + + /* Add to the list. */ + if (str_item == Py_None) + Py_DECREF(str_item); + else { + int status; + + status = add_to_join_list(&join_info, str_item); + Py_DECREF(str_item); + if (status < 0) + goto error; + } + } + + Py_DECREF(replacement); + + /* Convert the list to a single string (also cleans up join_info). */ + return join_list_info(&join_info); + +error: + clear_join_list(&join_info); + Py_DECREF(replacement); + return NULL; +} + +#if PY_VERSION_HEX >= 0x02060000 +/* Gets a MatchObject's group dictionary. */ +Py_LOCAL_INLINE(PyObject*) match_get_group_dict(MatchObject* self) { + PyObject* result; + PyObject* keys; + Py_ssize_t g; + + result = PyDict_New(); + if (!result || !self->pattern->groupindex) + return result; + + keys = PyMapping_Keys(self->pattern->groupindex); + if (!keys) + goto failed; + + for (g = 0; g < PyList_GET_SIZE(keys); g++) { + PyObject* key; + PyObject* value; + int status; + + /* PyList_GET_ITEM borrows a reference. 
*/ + key = PyList_GET_ITEM(keys, g); + if (!key) + goto failed; + + value = match_get_group(self, key, Py_None, FALSE); + if (!value) + goto failed; + + status = PyDict_SetItem(result, key, value); + Py_DECREF(value); + if (status < 0) + goto failed; + } + + Py_DECREF(keys); + + return result; + +failed: + Py_XDECREF(keys); + Py_DECREF(result); + return NULL; +} + +static PyTypeObject Capture_Type = { + PyObject_HEAD_INIT(NULL) + 0, + "_" RE_MODULE "." "Capture", + sizeof(MatchObject) +}; + +/* Creates a new CaptureObject. */ +Py_LOCAL_INLINE(PyObject*) make_capture_object(MatchObject** match_indirect, + Py_ssize_t index) { + CaptureObject* capture; + + capture = PyObject_NEW(CaptureObject, &Capture_Type); + if (!capture) + return NULL; + + capture->group_index = index; + capture->match_indirect = match_indirect; + + return (PyObject*)capture; +} + +#if PY_VERSION_HEX >= 0x02060000 +/* Makes a MatchObject's capture dictionary. */ +Py_LOCAL_INLINE(PyObject*) make_capture_dict(MatchObject* match, MatchObject** + match_indirect) { + PyObject* result; + PyObject* keys; + PyObject* values = NULL; + Py_ssize_t g; + + result = PyDict_New(); + if (!result) + return result; + + keys = PyMapping_Keys(match->pattern->groupindex); + if (!keys) + goto failed; + + values = PyMapping_Values(match->pattern->groupindex); + if (!values) + goto failed; + + for (g = 0; g < PyList_GET_SIZE(keys); g++) { + PyObject* key; + PyObject* value; + Py_ssize_t v; + int status; + + /* PyList_GET_ITEM borrows a reference. */ + key = PyList_GET_ITEM(keys, g); + if (!key) + goto failed; + + /* PyList_GET_ITEM borrows a reference. 
*/ + value = PyList_GET_ITEM(values, g); + if (!value) + goto failed; + + v = PyLong_AsLong(value); + if (v == -1 && PyErr_Occurred()) + goto failed; + + value = make_capture_object(match_indirect, v); + if (!value) + goto failed; + + status = PyDict_SetItem(result, key, value); + Py_DECREF(value); + if (status < 0) + goto failed; + } + + Py_DECREF(values); + Py_DECREF(keys); + + return result; + +failed: + Py_XDECREF(values); + Py_XDECREF(keys); + Py_DECREF(result); + return NULL; +} +#endif + +/* MatchObject's 'expandf' method. */ +static PyObject* match_expandf(MatchObject* self, PyObject* str_template) { + PyObject* format_func; + PyObject* args = NULL; + size_t g; + PyObject* kwargs = NULL; + PyObject* result; + + format_func = PyObject_GetAttrString(str_template, "format"); + if (!format_func) + return NULL; + + args = PyTuple_New((Py_ssize_t)self->group_count + 1); + if (!args) + goto error; + + for (g = 0; g < self->group_count + 1; g++) + /* PyTuple_SetItem borrows the reference. */ + PyTuple_SetItem(args, (Py_ssize_t)g, make_capture_object(&self, + (Py_ssize_t)g)); + + kwargs = make_capture_dict(self, &self); + if (!kwargs) + goto error; + + result = PyObject_Call(format_func, args, kwargs); + + Py_DECREF(kwargs); + Py_DECREF(args); + Py_DECREF(format_func); + + return result; + +error: + Py_XDECREF(args); + Py_DECREF(format_func); + return NULL; +} + +#endif +Py_LOCAL_INLINE(PyObject*) make_match_copy(MatchObject* self); + +/* MatchObject's '__copy__' method. */ +static PyObject* match_copy(MatchObject* self, PyObject *unused) { + return make_match_copy(self); +} + +/* MatchObject's '__deepcopy__' method. */ +static PyObject* match_deepcopy(MatchObject* self, PyObject* memo) { + return make_match_copy(self); +} + +/* MatchObject's 'regs' attribute. 
*/ +static PyObject* match_regs(MatchObject* self) { + PyObject* regs; + PyObject* item; + size_t g; + + regs = PyTuple_New((Py_ssize_t)self->group_count + 1); + if (!regs) + return NULL; + + item = Py_BuildValue("nn", self->match_start, self->match_end); + if (!item) + goto error; + + /* PyTuple_SET_ITEM borrows the reference. */ + PyTuple_SET_ITEM(regs, 0, item); + + for (g = 0; g < self->group_count; g++) { + RE_GroupSpan* span; + + span = &self->groups[g].span; + item = Py_BuildValue("nn", span->start, span->end); + if (!item) + goto error; + + /* PyTuple_SET_ITEM borrows the reference. */ + PyTuple_SET_ITEM(regs, g + 1, item); + } + + Py_INCREF(regs); + self->regs = regs; + + return regs; + +error: + Py_DECREF(regs); + return NULL; +} + +/* MatchObject's slice method. */ +Py_LOCAL_INLINE(PyObject*) match_get_group_slice(MatchObject* self, PyObject* + slice) { + Py_ssize_t start; + Py_ssize_t end; + Py_ssize_t step; + Py_ssize_t slice_length; + + if (PySlice_GetIndicesEx((PySliceObject*)slice, + (Py_ssize_t)self->group_count + 1, &start, &end, &step, &slice_length) < + 0) + return NULL; + + if (slice_length <= 0) + return PyTuple_New(0); + else { + PyObject* result; + Py_ssize_t cur; + Py_ssize_t i; + + result = PyTuple_New(slice_length); + if (!result) + return NULL; + + cur = start; + for (i = 0; i < slice_length; i++) { + /* PyTuple_SetItem borrows the reference. */ + PyTuple_SetItem(result, i, match_get_group_by_index(self, cur, + Py_None)); + cur += step; + } + + return result; + } +} + +/* MatchObject's length method. */ +Py_LOCAL_INLINE(Py_ssize_t) match_length(MatchObject* self) { + return (Py_ssize_t)self->group_count + 1; +} + +/* MatchObject's '__getitem__' method. 
*/ +static PyObject* match_getitem(MatchObject* self, PyObject* item) { + if (PySlice_Check(item)) + return match_get_group_slice(self, item); + + return match_get_group(self, item, Py_None, TRUE); +} + +/* Determines the portion of the target string which is covered by the group + * captures. + */ +Py_LOCAL_INLINE(void) determine_target_substring(MatchObject* match, + Py_ssize_t* slice_start, Py_ssize_t* slice_end) { + Py_ssize_t start; + Py_ssize_t end; + size_t g; + + start = match->pos; + end = match->endpos; + + for (g = 0; g < match->group_count; g++) { + RE_GroupSpan* span; + size_t c; + + span = &match->groups[g].span; + if (span->start >= 0 && span->start < start) + start = span->start; + if (span->end >= 0 && span->end > end) + end = span->end; + + for (c = 0; c < match->groups[g].capture_count; c++) { + RE_GroupSpan* span; + + span = match->groups[g].captures; + if (span->start >= 0 && span->start < start) + start = span->start; + if (span->end >= 0 && span->end > end) + end = span->end; + } + } + + *slice_start = start; + *slice_end = end; +} + +/* MatchObject's 'detach_string' method. */ +static PyObject* match_detach_string(MatchObject* self, PyObject* unused) { + if (self->string) { + Py_ssize_t start; + Py_ssize_t end; + PyObject* substring; + + determine_target_substring(self, &start, &end); + + substring = get_slice(self->string, start, end); + if (substring) { + Py_XDECREF(self->substring); + self->substring = substring; + self->substring_offset = start; + + Py_DECREF(self->string); + self->string = NULL; + } + } + + Py_INCREF(Py_None); + return Py_None; +} + +/* The documentation of a MatchObject. */ +PyDoc_STRVAR(match_group_doc, + "group([group1, ...]) --> string or tuple of strings.\n\ + Return one or more subgroups of the match. 
If there is a single argument,\n\ + the result is a single string, or None if the group did not contribute to\n\ + the match; if there are multiple arguments, the result is a tuple with one\n\ + item per argument; if there are no arguments, the whole match is returned.\n\ + Group 0 is the whole match."); + +PyDoc_STRVAR(match_start_doc, + "start([group1, ...]) --> int or tuple of ints.\n\ + Return the index of the start of one or more subgroups of the match. If\n\ + there is a single argument, the result is an index, or -1 if the group did\n\ + not contribute to the match; if there are multiple arguments, the result is\n\ + a tuple with one item per argument; if there are no arguments, the index of\n\ + the start of the whole match is returned. Group 0 is the whole match."); + +PyDoc_STRVAR(match_end_doc, + "end([group1, ...]) --> int or tuple of ints.\n\ + Return the index of the end of one or more subgroups of the match. If there\n\ + is a single argument, the result is an index, or -1 if the group did not\n\ + contribute to the match; if there are multiple arguments, the result is a\n\ + tuple with one item per argument; if there are no arguments, the index of\n\ + the end of the whole match is returned. Group 0 is the whole match."); + +PyDoc_STRVAR(match_span_doc, + "span([group1, ...]) --> 2-tuple of int or tuple of 2-tuple of ints.\n\ + Return the span (a 2-tuple of the indices of the start and end) of one or\n\ + more subgroups of the match. If there is a single argument, the result is a\n\ + span, or (-1, -1) if the group did not contribute to the match; if there are\n\ + multiple arguments, the result is a tuple with one item per argument; if\n\ + there are no arguments, the span of the whole match is returned. Group 0 is\n\ + the whole match."); + +PyDoc_STRVAR(match_groups_doc, + "groups(default=None) --> tuple of strings.\n\ + Return a tuple containing all the subgroups of the match. 
The argument is\n\ + the default for groups that did not participate in the match."); + +PyDoc_STRVAR(match_groupdict_doc, + "groupdict(default=None) --> dict.\n\ + Return a dictionary containing all the named subgroups of the match, keyed\n\ + by the subgroup name. The argument is the value to be given for groups that\n\ + did not participate in the match."); + +PyDoc_STRVAR(match_capturesdict_doc, + "capturesdict() --> dict.\n\ + Return a dictionary containing the captures of all the named subgroups of the\n\ + match, keyed by the subgroup name."); + +PyDoc_STRVAR(match_expand_doc, + "expand(template) --> string.\n\ + Return the string obtained by doing backslash substitution on the template,\n\ + as done by the sub() method."); + +#if PY_VERSION_HEX >= 0x02060000 +PyDoc_STRVAR(match_expandf_doc, + "expandf(format) --> string.\n\ + Return the string obtained by using the format, as done by the subf()\n\ + method."); + +#endif +PyDoc_STRVAR(match_captures_doc, + "captures([group1, ...]) --> list of strings or tuple of list of strings.\n\ + Return the captures of one or more subgroups of the match. If there is a\n\ + single argument, the result is a list of strings; if there are multiple\n\ + arguments, the result is a tuple of lists with one item per argument; if\n\ + there are no arguments, the captures of the whole match is returned. Group\n\ + 0 is the whole match."); + +PyDoc_STRVAR(match_starts_doc, + "starts([group1, ...]) --> list of ints or tuple of list of ints.\n\ + Return the indices of the starts of the captures of one or more subgroups of\n\ + the match. If there is a single argument, the result is a list of indices;\n\ + if there are multiple arguments, the result is a tuple of lists with one\n\ + item per argument; if there are no arguments, the indices of the starts of\n\ + the captures of the whole match is returned. 
Group 0 is the whole match."); + +PyDoc_STRVAR(match_ends_doc, + "ends([group1, ...]) --> list of ints or tuple of list of ints.\n\ + Return the indices of the ends of the captures of one or more subgroups of\n\ + the match. If there is a single argument, the result is a list of indices;\n\ + if there are multiple arguments, the result is a tuple of lists with one\n\ + item per argument; if there are no arguments, the indices of the ends of the\n\ + captures of the whole match is returned. Group 0 is the whole match."); + +PyDoc_STRVAR(match_spans_doc, + "spans([group1, ...]) --> list of 2-tuple of ints or tuple of list of 2-tuple of ints.\n\ + Return the spans (a 2-tuple of the indices of the start and end) of the\n\ + captures of one or more subgroups of the match. If there is a single\n\ + argument, the result is a list of spans; if there are multiple arguments,\n\ + the result is a tuple of lists with one item per argument; if there are no\n\ + arguments, the spans of the captures of the whole match is returned. Group\n\ + 0 is the whole match."); + +PyDoc_STRVAR(match_detach_string_doc, + "detach_string()\n\ + Detaches the target string from the match object. The 'string' attribute\n\ + will become None."); + +/* MatchObject's methods. 
*/
+static PyMethodDef match_methods[] = { /* Each entry: name, C function, calling-convention flags, docstring. */
+    {"group", (PyCFunction)match_group, METH_VARARGS, match_group_doc},
+    {"start", (PyCFunction)match_start, METH_VARARGS, match_start_doc},
+    {"end", (PyCFunction)match_end, METH_VARARGS, match_end_doc},
+    {"span", (PyCFunction)match_span, METH_VARARGS, match_span_doc},
+    {"groups", (PyCFunction)match_groups, METH_VARARGS|METH_KEYWORDS,
+      match_groups_doc},
+    {"groupdict", (PyCFunction)match_groupdict, METH_VARARGS|METH_KEYWORDS,
+      match_groupdict_doc},
+    {"capturesdict", (PyCFunction)match_capturesdict, METH_NOARGS,
+      match_capturesdict_doc},
+    {"expand", (PyCFunction)match_expand, METH_O, match_expand_doc},
+#if PY_VERSION_HEX >= 0x02060000
+    {"expandf", (PyCFunction)match_expandf, METH_O, match_expandf_doc},
+#endif
+    {"captures", (PyCFunction)match_captures, METH_VARARGS,
+      match_captures_doc},
+    {"starts", (PyCFunction)match_starts, METH_VARARGS, match_starts_doc},
+    {"ends", (PyCFunction)match_ends, METH_VARARGS, match_ends_doc},
+    {"spans", (PyCFunction)match_spans, METH_VARARGS, match_spans_doc},
+    {"detach_string", (PyCFunction)match_detach_string, METH_NOARGS,
+      match_detach_string_doc},
+    {"__copy__", (PyCFunction)match_copy, METH_NOARGS}, /* No docstring is given for this entry or the two below. */
+    {"__deepcopy__", (PyCFunction)match_deepcopy, METH_O},
+    {"__getitem__", (PyCFunction)match_getitem, METH_O|METH_COEXIST},
+    {NULL, NULL} /* Terminates the table. */
+};
+
+PyDoc_STRVAR(match_doc, "Match object");
+
+/* MatchObject's 'lastindex' attribute.
+ *
+ * Returns the stored lastindex as an integer when it is non-negative,
+ * otherwise None.
+ */
+static PyObject* match_lastindex(PyObject* self_) {
+    MatchObject* self;
+
+    self = (MatchObject*)self_;
+
+    if (self->lastindex >= 0)
+        return Py_BuildValue("n", self->lastindex);
+
+    Py_INCREF(Py_None);
+    return Py_None;
+}
+
+/* MatchObject's 'lastgroup' attribute.
*/ +static PyObject* match_lastgroup(PyObject* self_) { + MatchObject* self; + + self = (MatchObject*)self_; + + if (self->pattern->indexgroup && self->lastgroup >= 0) { + PyObject* index; + PyObject* result; + + index = Py_BuildValue("n", self->lastgroup); + + /* PyDict_GetItem returns borrows a reference. */ + result = PyDict_GetItem(self->pattern->indexgroup, index); + Py_DECREF(index); + if (result) { + Py_INCREF(result); + return result; + } + PyErr_Clear(); + } + + Py_INCREF(Py_None); + return Py_None; +} + +/* MatchObject's 'string' attribute. */ +static PyObject* match_string(PyObject* self_) { + MatchObject* self; + + self = (MatchObject*)self_; + + if (self->string) { + Py_INCREF(self->string); + return self->string; + } else { + Py_INCREF(Py_None); + return Py_None; + } +} +#if PY_VERSION_HEX < 0x02060000 + +/* MatchObject's 'partial' attribute. */ +static PyObject* match_partial(PyObject* self_) { + MatchObject* self; + PyObject* result; + + self = (MatchObject*)self_; + + result = self->partial ? Py_True : Py_False; + Py_INCREF(result); + + return result; +} +#endif + +/* MatchObject's 'fuzzy_counts' attribute. 
*/
+static PyObject* match_fuzzy_counts(PyObject* self_) { /* Returns the stored counts as a 3-tuple ordered SUB, INS, DEL. */
+    MatchObject* self;
+
+    self = (MatchObject*)self_;
+
+    return Py_BuildValue("nnn", self->fuzzy_counts[RE_FUZZY_SUB],
+      self->fuzzy_counts[RE_FUZZY_INS], self->fuzzy_counts[RE_FUZZY_DEL]);
+}
+
+static PyGetSetDef match_getset[] = { /* Computed attributes; every setter is NULL, so none can be assigned to. */
+    {"lastindex", (getter)match_lastindex, (setter)NULL,
+      "The group number of the last matched capturing group, or None."},
+    {"lastgroup", (getter)match_lastgroup, (setter)NULL,
+      "The name of the last matched capturing group, or None."},
+    {"regs", (getter)match_regs, (setter)NULL,
+      "A tuple of the spans of the capturing groups."},
+    {"string", (getter)match_string, (setter)NULL,
+      "The string that was searched, or None if it has been detached."},
+#if PY_VERSION_HEX < 0x02060000
+    {"partial", (getter)match_partial, (setter)NULL,
+      "Whether it's a partial match."},
+#endif
+    {"fuzzy_counts", (getter)match_fuzzy_counts, (setter)NULL,
+      "A tuple of the number of substitutions, insertions and deletions."},
+    {NULL} /* Sentinel */
+};
+
+static PyMemberDef match_members[] = { /* Attributes read straight from MatchObject fields; all are READONLY. */
+    {"re", T_OBJECT, offsetof(MatchObject, pattern), READONLY,
+      "The regex object that produced this match object."},
+    {"pos", T_PYSSIZET, offsetof(MatchObject, pos), READONLY,
+      "The position at which the regex engine starting searching."},
+    {"endpos", T_PYSSIZET, offsetof(MatchObject, endpos), READONLY,
+      "The final position beyond which the regex engine won't search."},
+#if PY_VERSION_HEX >= 0x02060000
+    {"partial", T_BOOL, offsetof(MatchObject, partial), READONLY,
+      "Whether it's a partial match."},
+#endif
+    {NULL} /* Sentinel */
+};
+
+static PyMappingMethods match_as_mapping = { /* len(match) and match[...]; no item assignment. */
+    (lenfunc)match_length, /* mp_length */
+    (binaryfunc)match_getitem, /* mp_subscript */
+    0, /* mp_ass_subscript */
+};
+
+static PyTypeObject Match_Type = { /* Only the name and basic size are given here. */
+    PyObject_HEAD_INIT(NULL)
+    0,
+    "_" RE_MODULE "." "Match",
+    sizeof(MatchObject)
+};
+
+/* Copies the groups.
*/ +Py_LOCAL_INLINE(RE_GroupData*) copy_groups(RE_GroupData* groups, size_t + group_count) { + size_t span_count; + size_t g; + RE_GroupData* groups_copy; + RE_GroupSpan* spans_copy; + size_t offset; + + /* Calculate the total size of the group info. */ + span_count = 0; + for (g = 0; g < group_count; g++) + span_count += groups[g].capture_count; + + /* Allocate the storage for the group info in a single block. */ + groups_copy = (RE_GroupData*)re_alloc(group_count * sizeof(RE_GroupData) + + span_count * sizeof(RE_GroupSpan)); + if (!groups_copy) + return NULL; + + /* The storage for the spans comes after the other group info. */ + spans_copy = (RE_GroupSpan*)&groups_copy[group_count]; + + /* There's no need to initialise the spans info. */ + memset(groups_copy, 0, group_count * sizeof(RE_GroupData)); + + offset = 0; + for (g = 0; g < group_count; g++) { + RE_GroupData* orig; + RE_GroupData* copy; + + orig = &groups[g]; + copy = &groups_copy[g]; + copy->span = orig->span; + + copy->captures = &spans_copy[offset]; + offset += orig->capture_count; + + if (orig->capture_count > 0) { + Py_MEMCPY(copy->captures, orig->captures, orig->capture_count * + sizeof(RE_GroupSpan)); + copy->capture_capacity = orig->capture_count; + copy->capture_count = orig->capture_count; + } + } + + return groups_copy; +} + +/* Makes a copy of a MatchObject. */ +Py_LOCAL_INLINE(PyObject*) make_match_copy(MatchObject* self) { + MatchObject* match; + + if (!self->string) { + /* The target string has been detached, so the MatchObject is now + * immutable. + */ + Py_INCREF(self); + return (PyObject*)self; + } + + /* Create a MatchObject. */ + match = PyObject_NEW(MatchObject, &Match_Type); + if (!match) + return NULL; + + Py_MEMCPY(match, self, sizeof(MatchObject)); + + Py_INCREF(match->string); + Py_INCREF(match->substring); + Py_INCREF(match->pattern); + + /* Copy the groups to the MatchObject. 
*/ + if (self->group_count > 0) { + match->groups = copy_groups(self->groups, self->group_count); + if (!match->groups) { + Py_DECREF(match); + return NULL; + } + } + + return (PyObject*)match; +} + +/* Creates a new MatchObject. */ +Py_LOCAL_INLINE(PyObject*) pattern_new_match(PatternObject* pattern, RE_State* + state, int status) { + /* Create MatchObject (from state object). */ + if (status > 0 || status == RE_ERROR_PARTIAL) { + MatchObject* match; + + /* Create a MatchObject. */ + match = PyObject_NEW(MatchObject, &Match_Type); + if (!match) + return NULL; + + match->string = state->string; + match->substring = state->string; + match->substring_offset = 0; + match->pattern = pattern; + match->regs = NULL; + + if (pattern->is_fuzzy) { + match->fuzzy_counts[RE_FUZZY_SUB] = + state->total_fuzzy_counts[RE_FUZZY_SUB]; + match->fuzzy_counts[RE_FUZZY_INS] = + state->total_fuzzy_counts[RE_FUZZY_INS]; + match->fuzzy_counts[RE_FUZZY_DEL] = + state->total_fuzzy_counts[RE_FUZZY_DEL]; + } else + memset(match->fuzzy_counts, 0, sizeof(match->fuzzy_counts)); + + match->partial = status == RE_ERROR_PARTIAL; + Py_INCREF(match->string); + Py_INCREF(match->substring); + Py_INCREF(match->pattern); + + /* Copy the groups to the MatchObject. */ + if (pattern->public_group_count > 0) { + match->groups = copy_groups(state->groups, + pattern->public_group_count); + if (!match->groups) { + Py_DECREF(match); + return NULL; + } + } else + match->groups = NULL; + + match->group_count = pattern->public_group_count; + + match->pos = state->slice_start; + match->endpos = state->slice_end; + + if (state->reverse) { + match->match_start = state->text_pos; + match->match_end = state->match_pos; + } else { + match->match_start = state->match_pos; + match->match_end = state->text_pos; + } + + match->lastindex = state->lastindex; + match->lastgroup = state->lastgroup; + + return (PyObject*)match; + } else if (status == 0) { + /* No match. 
*/ + Py_INCREF(Py_None); + return Py_None; + } else { + /* Internal error. */ + set_error(status, NULL); + return NULL; + } +} + +/* Gets the text of a capture group from a state. */ +Py_LOCAL_INLINE(PyObject*) state_get_group(RE_State* state, Py_ssize_t index, + PyObject* string, BOOL empty) { + RE_GroupData* group; + Py_ssize_t start; + Py_ssize_t end; + + group = &state->groups[index - 1]; + + if (string != Py_None && index >= 1 && (size_t)index <= + state->pattern->public_group_count && group->capture_count > 0) { + start = group->span.start; + end = group->span.end; + } else { + if (empty) + /* Want an empty string. */ + start = end = 0; + else { + Py_INCREF(Py_None); + return Py_None; + } + } + + return get_slice(string, start, end); +} + +/* Acquires the lock (mutex) on the state if there's one. + * + * It also increments the owner's refcount just to ensure that it won't be + * destroyed by another thread. + */ +Py_LOCAL_INLINE(void) acquire_state_lock(PyObject* owner, RE_SafeState* + safe_state) { + RE_State* state; + + state = safe_state->re_state; + + if (state->lock) { + /* In order to avoid deadlock we need to release the GIL while trying + * to acquire the lock. + */ + Py_INCREF(owner); + if (!PyThread_acquire_lock(state->lock, 0)) { + release_GIL(safe_state); + PyThread_acquire_lock(state->lock, 1); + acquire_GIL(safe_state); + } + } +} + +/* Releases the lock (mutex) on the state if there's one. + * + * It also decrements the owner's refcount, which was incremented when the lock + * was acquired. + */ +Py_LOCAL_INLINE(void) release_state_lock(PyObject* owner, RE_SafeState* + safe_state) { + RE_State* state; + + state = safe_state->re_state; + + if (state->lock) { + PyThread_release_lock(state->lock); + Py_DECREF(owner); + } +} + +/* Implements the functionality of ScanObject's search and match methods. 
*/
+Py_LOCAL_INLINE(PyObject*) scanner_search_or_match(ScannerObject* self, BOOL
+  search) {
+    RE_State* state;
+    RE_SafeState safe_state;
+    PyObject* match;
+
+    state = &self->state;
+
+    /* Initialise the "safe state" structure. */
+    safe_state.re_state = state;
+    safe_state.thread_state = NULL;
+
+    /* Acquire the state lock in case we're sharing the scanner object across
+     * threads.
+     */
+    acquire_state_lock((PyObject*)self, &safe_state);
+
+    /* The status left by the previous call decides whether to continue: once
+     * a call has ended in failure or a partial match every later call returns
+     * None, and a stored error status is raised again.
+     */
+    if (self->status == RE_ERROR_FAILURE || self->status == RE_ERROR_PARTIAL) {
+        /* No or partial match. */
+        release_state_lock((PyObject*)self, &safe_state);
+        Py_INCREF(Py_None);
+        return Py_None;
+    } else if (self->status < 0) {
+        /* Internal error. */
+        release_state_lock((PyObject*)self, &safe_state);
+        set_error(self->status, NULL);
+        return NULL;
+    }
+
+    /* Look for another match. */
+    self->status = do_match(&safe_state, search);
+    if (self->status >= 0 || self->status == RE_ERROR_PARTIAL) {
+        /* Create the match object. pattern_new_match returns None for a
+         * status of 0 and may return NULL if allocation fails.
+         */
+        match = pattern_new_match(self->pattern, state, self->status);
+
+        if (search && state->overlapped) {
+            /* Advance one character. */
+            Py_ssize_t step;
+
+            step = state->reverse ? -1 : 1;
+            state->text_pos = state->match_pos + step;
+            state->must_advance = FALSE;
+        } else
+            /* Continue from where we left off, but don't allow 2 contiguous
+             * zero-width matches.
+             */
+            state->must_advance = state->text_pos == state->match_pos;
+    } else
+        /* Internal error. NOTE(review): no exception is set here; presumably
+         * do_match has already set one - confirm.
+         */
+        match = NULL;
+
+    /* Release the state lock. */
+    release_state_lock((PyObject*)self, &safe_state);
+
+    return match;
+}
+
+/* ScannerObject's 'match' method.
+ *
+ * Calls scanner_search_or_match with 'search' FALSE; the second argument is
+ * ignored.
+ */
+static PyObject* scanner_match(ScannerObject* self, PyObject* unused) {
+    return scanner_search_or_match(self, FALSE);
+}
+
+/* ScannerObject's 'search' method.
+ *
+ * Calls scanner_search_or_match with 'search' TRUE; the second argument is
+ * ignored.
+ */
+static PyObject* scanner_search(ScannerObject* self, PyObject *unused) {
+    return scanner_search_or_match(self, TRUE);
+}
+
+/* ScannerObject's 'next' method.
*/ +static PyObject* scanner_next(PyObject* self) { + PyObject* match; + + match = scanner_search((ScannerObject*)self, NULL); + + if (match == Py_None) { + /* No match. */ + Py_DECREF(Py_None); + PyErr_SetNone(PyExc_StopIteration); + return NULL; + } + + return match; +} + +/* Returns an iterator for a ScannerObject. + * + * The iterator is actually the ScannerObject itself. + */ +static PyObject* scanner_iter(PyObject* self) { + Py_INCREF(self); + return self; +} + +/* Gets the next result from a scanner iterator. */ +static PyObject* scanner_iternext(PyObject* self) { + PyObject* match; + + match = scanner_search((ScannerObject*)self, NULL); + + if (match == Py_None) { + /* No match. */ + Py_DECREF(match); + return NULL; + } + + return match; +} + +/* Makes a copy of a ScannerObject. + * + * It actually doesn't make a copy, just returns the original object. + */ +Py_LOCAL_INLINE(PyObject*) make_scanner_copy(ScannerObject* self) { + Py_INCREF(self); + return (PyObject*)self; +} + +/* ScannerObject's '__copy__' method. */ +static PyObject* scanner_copy(ScannerObject* self, PyObject *unused) { + return make_scanner_copy(self); +} + +/* ScannerObject's '__deepcopy__' method. */ +static PyObject* scanner_deepcopy(ScannerObject* self, PyObject* memo) { + return make_scanner_copy(self); +} + +/* The documentation of a ScannerObject. */ +PyDoc_STRVAR(scanner_match_doc, + "match() --> MatchObject or None.\n\ + Match at the current position in the string."); + +PyDoc_STRVAR(scanner_search_doc, + "search() --> MatchObject or None.\n\ + Search from the current position in the string."); + +/* ScannerObject's methods. 
*/ +static PyMethodDef scanner_methods[] = { + {"next", (PyCFunction)scanner_next, METH_NOARGS}, + {"match", (PyCFunction)scanner_match, METH_NOARGS, scanner_match_doc}, + {"search", (PyCFunction)scanner_search, METH_NOARGS, scanner_search_doc}, + {"__copy__", (PyCFunction)scanner_copy, METH_NOARGS}, + {"__deepcopy__", (PyCFunction)scanner_deepcopy, METH_O}, + {NULL, NULL} +}; + +PyDoc_STRVAR(scanner_doc, "Scanner object"); + +/* Deallocates a ScannerObject. */ +static void scanner_dealloc(PyObject* self_) { + ScannerObject* self; + + self = (ScannerObject*)self_; + + state_fini(&self->state); + Py_DECREF(self->pattern); + PyObject_DEL(self); +} + +static PyMemberDef scanner_members[] = { + {"pattern", T_OBJECT, offsetof(ScannerObject, pattern), READONLY, + "The regex object that produced this scanner object."}, + {NULL} /* Sentinel */ +}; + +static PyTypeObject Scanner_Type = { + PyObject_HEAD_INIT(NULL) + 0, + "_" RE_MODULE "." "Scanner", + sizeof(ScannerObject) +}; + +/* Decodes a 'concurrent' argument. */ +Py_LOCAL_INLINE(int) decode_concurrent(PyObject* concurrent) { + Py_ssize_t value; + + if (concurrent == Py_None) + return RE_CONC_DEFAULT; + + value = PyLong_AsLong(concurrent); + if (value == -1 && PyErr_Occurred()) { + set_error(RE_ERROR_CONCURRENT, NULL); + return -1; + } + + return value ? RE_CONC_YES : RE_CONC_NO; +} + +/* Decodes a 'partial' argument. */ +Py_LOCAL_INLINE(BOOL) decode_partial(PyObject* partial) { + Py_ssize_t value; + + if (partial == Py_False) + return FALSE; + + if (partial == Py_True) + return TRUE; + + value = PyLong_AsLong(partial); + if (value == -1 && PyErr_Occurred()) { + PyErr_Clear(); + return TRUE; + } + + return value != 0; +} + +/* Creates a new ScannerObject. */ +static PyObject* pattern_scanner(PatternObject* pattern, PyObject* args, + PyObject* kwargs) { + /* Create search state object. 
*/ + ScannerObject* self; + Py_ssize_t start; + Py_ssize_t end; + int conc; + BOOL part; + + PyObject* string; + PyObject* pos = Py_None; + PyObject* endpos = Py_None; + Py_ssize_t overlapped = FALSE; + PyObject* concurrent = Py_None; + PyObject* partial = Py_False; + static char* kwlist[] = { "string", "pos", "endpos", "overlapped", + "concurrent", "partial", NULL }; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|OOnOO:scanner", kwlist, + &string, &pos, &endpos, &overlapped, &concurrent, &partial)) + return NULL; + + start = as_string_index(pos, 0); + if (start == -1 && PyErr_Occurred()) + return NULL; + + end = as_string_index(endpos, PY_SSIZE_T_MAX); + if (end == -1 && PyErr_Occurred()) + return NULL; + + conc = decode_concurrent(concurrent); + if (conc < 0) + return NULL; + + part = decode_partial(partial); + + /* Create a scanner object. */ + self = PyObject_NEW(ScannerObject, &Scanner_Type); + if (!self) + return NULL; + + self->pattern = pattern; + Py_INCREF(self->pattern); + + /* The MatchObject, and therefore repeated captures, will be visible. */ + if (!state_init(&self->state, pattern, string, start, end, overlapped != 0, + conc, part, TRUE, TRUE, FALSE)) { + PyObject_DEL(self); + return NULL; + } + + self->status = RE_ERROR_SUCCESS; + + return (PyObject*) self; +} + +/* Performs the split for the SplitterObject. */ +Py_LOCAL_INLINE(PyObject*) next_split_part(SplitterObject* self) { + RE_State* state; + RE_SafeState safe_state; + PyObject* result = NULL; /* Initialise to stop compiler warning. */ + + state = &self->state; + + /* Initialise the "safe state" structure. */ + safe_state.re_state = state; + safe_state.thread_state = NULL; + + /* Acquire the state lock in case we're sharing the splitter object across + * threads. + */ + acquire_state_lock((PyObject*)self, &safe_state); + + if (self->status == RE_ERROR_FAILURE || self->status == RE_ERROR_PARTIAL) { + /* Finished. 
*/ + release_state_lock((PyObject*)self, &safe_state); + result = Py_False; + Py_INCREF(result); + return result; + } else if (self->status < 0) { + /* Internal error. */ + release_state_lock((PyObject*)self, &safe_state); + set_error(self->status, NULL); + return NULL; + } + + if (self->index == 0) { + if (self->split_count < self->maxsplit) { + Py_ssize_t step; + Py_ssize_t end_pos; + + if (state->reverse) { + step = -1; + end_pos = state->slice_start; + } else { + step = 1; + end_pos = state->slice_end; + } + +retry: + self->status = do_match(&safe_state, TRUE); + if (self->status < 0) + goto error; + + if (self->status == RE_ERROR_SUCCESS) { + if (state->version_0) { + /* Version 0 behaviour is to advance one character if the + * split was zero-width. Unfortunately, this can give an + * incorrect result. GvR wants this behaviour to be + * retained so as not to break any existing software which + * might rely on it. + */ + if (state->text_pos == state->match_pos) { + if (self->last_pos == end_pos) + goto no_match; + + /* Advance one character. */ + state->text_pos += step; + state->must_advance = FALSE; + goto retry; + } + } + + ++self->split_count; + + /* Get segment before this match. */ + if (state->reverse) + result = get_slice(state->string, state->match_pos, + self->last_pos); + else + result = get_slice(state->string, self->last_pos, + state->match_pos); + if (!result) + goto error; + + self->last_pos = state->text_pos; + + /* Version 0 behaviour is to advance one character if the match + * was zero-width. Unfortunately, this can give an incorrect + * result. GvR wants this behaviour to be retained so as not to + * break any existing software which might rely on it. + */ + if (state->version_0) { + if (state->text_pos == state->match_pos) + /* Advance one character. */ + state->text_pos += step; + + state->must_advance = FALSE; + } else + /* Continue from where we left off, but don't allow a + * contiguous zero-width match. 
+ */ + state->must_advance = TRUE; + } + } else + goto no_match; + + if (self->status == RE_ERROR_FAILURE || self->status == + RE_ERROR_PARTIAL) { +no_match: + /* Get segment following last match (even if empty). */ + if (state->reverse) + result = get_slice(state->string, 0, self->last_pos); + else + result = get_slice(state->string, self->last_pos, + state->text_length); + if (!result) + goto error; + } + } else { + /* Add group. */ + result = state_get_group(state, self->index, state->string, FALSE); + if (!result) + goto error; + } + + ++self->index; + if ((size_t)self->index > state->pattern->public_group_count) + self->index = 0; + + /* Release the state lock. */ + release_state_lock((PyObject*)self, &safe_state); + + return result; + +error: + /* Release the state lock. */ + release_state_lock((PyObject*)self, &safe_state); + + return NULL; +} + +/* SplitterObject's 'split' method. */ +static PyObject* splitter_split(SplitterObject* self, PyObject *unused) { + PyObject* result; + + result = next_split_part(self); + + if (result == Py_False) { + /* The sentinel. */ + Py_DECREF(Py_False); + Py_INCREF(Py_None); + return Py_None; + } + + return result; +} + +/* SplitterObject's 'next' method. */ +static PyObject* splitter_next(PyObject* self) { + PyObject* result; + + result = next_split_part((SplitterObject*)self); + + if (result == Py_False) { + /* No match. */ + Py_DECREF(Py_False); + PyErr_SetNone(PyExc_StopIteration); + return NULL; + } + + return result; +} + +/* Returns an iterator for a SplitterObject. + * + * The iterator is actually the SplitterObject itself. + */ +static PyObject* splitter_iter(PyObject* self) { + Py_INCREF(self); + return self; +} + +/* Gets the next result from a splitter iterator. */ +static PyObject* splitter_iternext(PyObject* self) { + PyObject* result; + + result = next_split_part((SplitterObject*)self); + + if (result == Py_False) { + /* No match. 
*/ + Py_DECREF(result); + return NULL; + } + + return result; +} + +/* Makes a copy of a SplitterObject. + * + * It actually doesn't make a copy, just returns the original object. + */ +Py_LOCAL_INLINE(PyObject*) make_splitter_copy(SplitterObject* self) { + Py_INCREF(self); + return (PyObject*)self; +} + +/* SplitterObject's '__copy__' method. */ +static PyObject* splitter_copy(SplitterObject* self, PyObject *unused) { + return make_splitter_copy(self); +} + +/* SplitterObject's '__deepcopy__' method. */ +static PyObject* splitter_deepcopy(SplitterObject* self, PyObject* memo) { + return make_splitter_copy(self); +} + +/* The documentation of a SplitterObject. */ +PyDoc_STRVAR(splitter_split_doc, + "split() --> string or None.\n\ + Return the next part of the split string."); + +/* SplitterObject's methods. */ +static PyMethodDef splitter_methods[] = { + {"next", (PyCFunction)splitter_next, METH_NOARGS}, + {"split", (PyCFunction)splitter_split, METH_NOARGS, splitter_split_doc}, + {"__copy__", (PyCFunction)splitter_copy, METH_NOARGS}, + {"__deepcopy__", (PyCFunction)splitter_deepcopy, METH_O}, + {NULL, NULL} +}; + +PyDoc_STRVAR(splitter_doc, "Splitter object"); + +/* Deallocates a SplitterObject. */ +static void splitter_dealloc(PyObject* self_) { + SplitterObject* self; + + self = (SplitterObject*)self_; + + state_fini(&self->state); + Py_DECREF(self->pattern); + PyObject_DEL(self); +} +#if PY_VERSION_HEX >= 0x02060000 + +/* Converts a captures index to an integer. + * + * A negative capture index in 'expandf' and 'subf' is passed as a string + * because negative indexes are not supported by 'str.format'. + */ +Py_LOCAL_INLINE(Py_ssize_t) index_to_integer(PyObject* item) { + Py_ssize_t value; + + value = PyInt_AsSsize_t(item); + if (value != -1 || !PyErr_Occurred()) + return value; + + PyErr_Clear(); + + value = PyLong_AsLong(item); + if (value != -1 || !PyErr_Occurred()) + return value; + + PyErr_Clear(); + + /* Is the index a string representation of an integer? 
*/ + if (PyUnicode_Check(item)) { + PyObject* int_obj; + Py_UNICODE* characters; + Py_ssize_t length; + + characters = (Py_UNICODE*)PyUnicode_AS_DATA(item); + length = PyUnicode_GET_SIZE(item); + int_obj = PyLong_FromUnicode(characters, length, 0); + if (!int_obj) + goto error; + + value = PyLong_AsLong(int_obj); + Py_DECREF(int_obj); + if (!PyErr_Occurred()) + return value; + } else if (PyString_Check(item)) { + char* characters; + PyObject* int_obj; + + characters = PyString_AsString(item); + int_obj = PyLong_FromString(characters, NULL, 0); + if (!int_obj) + goto error; + + value = PyLong_AsLong(int_obj); + Py_DECREF(int_obj); + if (!PyErr_Occurred()) + return value; + } + +error: + PyErr_Format(PyExc_TypeError, "list indices must be integers, not %.200s", + item->ob_type->tp_name); + + return -1; +} + +/* CaptureObject's length method. */ +Py_LOCAL_INLINE(Py_ssize_t) capture_length(CaptureObject* self) { + MatchObject* match; + RE_GroupData* group; + + if (self->group_index == 0) + return 1; + + match = *self->match_indirect; + group = &match->groups[self->group_index - 1]; + + return (Py_ssize_t)group->capture_count; +} + +/* CaptureObject's '__getitem__' method. 
*/
+static PyObject* capture_getitem(CaptureObject* self, PyObject* item) {
+    Py_ssize_t index;
+    MatchObject* match;
+    Py_ssize_t start;
+    Py_ssize_t end;
+
+    index = index_to_integer(item);
+    if (index == -1 && PyErr_Occurred())
+        return NULL;
+
+    match = *self->match_indirect;
+
+    if (self->group_index == 0) {
+        /* Group 0 behaves like a list of length 1, so only the indexes 0 and
+         * -1 are accepted.
+         */
+        if (index < 0)
+            index += 1;
+
+        if (index != 0) {
+            PyErr_SetString(PyExc_IndexError, "list index out of range");
+            return NULL;
+        }
+
+        start = match->match_start;
+        end = match->match_end;
+    } else {
+        RE_GroupData* group;
+        RE_GroupSpan* span;
+
+        group = &match->groups[self->group_index - 1];
+
+        /* A negative index counts back from the group's last capture. */
+        if (index < 0)
+            index += group->capture_count;
+
+        if (index < 0 || index >= (Py_ssize_t)group->capture_count) {
+            PyErr_SetString(PyExc_IndexError, "list index out of range");
+            return NULL;
+        }
+
+        span = &group->captures[index];
+
+        start = span->start;
+        end = span->end;
+    }
+
+    /* The positions are rebased by substring_offset before slicing the
+     * match's substring.
+     */
+    return get_slice(match->substring, start - match->substring_offset, end -
+      match->substring_offset);
+}
+
+static PyMappingMethods capture_as_mapping = {
+    (lenfunc)capture_length, /* mp_length */
+    (binaryfunc)capture_getitem, /* mp_subscript */
+    0, /* mp_ass_subscript */
+};
+
+/* CaptureObject's methods. */
+static PyMethodDef capture_methods[] = {
+    {"__getitem__", (PyCFunction)capture_getitem, METH_O|METH_COEXIST},
+    {NULL, NULL}
+};
+
+/* Deallocates a CaptureObject.
+ *
+ * Only the object's own memory is freed; no references are released here.
+ */
+static void capture_dealloc(PyObject* self_) {
+    CaptureObject* self;
+
+    self = (CaptureObject*)self_;
+    PyObject_DEL(self);
+}
+
+/* CaptureObject's 'str' method.
*/ +static PyObject* capture_str(PyObject* self_) { + CaptureObject* self; + MatchObject* match; + + self = (CaptureObject*)self_; + match = *self->match_indirect; + + return match_get_group_by_index(match, self->group_index, Py_None); +} +#endif + +static PyMemberDef splitter_members[] = { + {"pattern", T_OBJECT, offsetof(SplitterObject, pattern), READONLY, + "The regex object that produced this splitter object."}, + {NULL} /* Sentinel */ +}; + +static PyTypeObject Splitter_Type = { + PyObject_HEAD_INIT(NULL) + 0, + "_" RE_MODULE "." "Splitter", + sizeof(SplitterObject) +}; + +/* Creates a new SplitterObject. */ +Py_LOCAL_INLINE(PyObject*) pattern_splitter(PatternObject* pattern, PyObject* + args, PyObject* kwargs) { + /* Create split state object. */ + int conc; + SplitterObject* self; + RE_State* state; + + PyObject* string; + Py_ssize_t maxsplit = 0; + PyObject* concurrent = Py_None; + static char* kwlist[] = { "string", "maxsplit", "concurrent", NULL }; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|nO:splitter", kwlist, + &string, &maxsplit, &concurrent)) + return NULL; + + conc = decode_concurrent(concurrent); + if (conc < 0) + return NULL; + + /* Create a splitter object. */ + self = PyObject_NEW(SplitterObject, &Splitter_Type); + if (!self) + return NULL; + + self->pattern = pattern; + Py_INCREF(self->pattern); + + if (maxsplit == 0) + maxsplit = PY_SSIZE_T_MAX; + + state = &self->state; + + /* The MatchObject, and therefore repeated captures, will not be visible. + */ + if (!state_init(state, pattern, string, 0, PY_SSIZE_T_MAX, FALSE, conc, + FALSE, TRUE, FALSE, FALSE)) { + PyObject_DEL(self); + return NULL; + } + + self->maxsplit = maxsplit; + self->last_pos = state->reverse ? state->text_length : 0; + self->split_count = 0; + self->index = 0; + self->status = 1; + + return (PyObject*) self; +} + +/* Implements the functionality of PatternObject's search and match methods. 
*/
+Py_LOCAL_INLINE(PyObject*) pattern_search_or_match(PatternObject* self,
+  PyObject* args, PyObject* kwargs, char* args_desc, BOOL search, BOOL
+  match_all) {
+    Py_ssize_t start;
+    Py_ssize_t end;
+    int conc;
+    BOOL part;
+    RE_State state;
+    RE_SafeState safe_state;
+    int status;
+    PyObject* match;
+
+    PyObject* string;
+    PyObject* pos = Py_None;
+    PyObject* endpos = Py_None;
+    PyObject* concurrent = Py_None;
+    PyObject* partial = Py_False;
+    static char* kwlist[] = { "string", "pos", "endpos", "concurrent",
+      "partial", NULL };
+    /* When working with a short string, such as a line from a file, the
+     * relative cost of PyArg_ParseTupleAndKeywords can be significant, and
+     * it's worth not using it when there are only positional arguments.
+     *
+     * arg_count is -1 when that fast path cannot be used.
+     */
+    Py_ssize_t arg_count;
+    if (args && !kwargs && PyTuple_CheckExact(args))
+        arg_count = PyTuple_GET_SIZE(args);
+    else
+        arg_count = -1;
+
+    if (1 <= arg_count && arg_count <= 5) {
+        /* PyTuple_GET_ITEM borrows the reference. */
+        string = PyTuple_GET_ITEM(args, 0);
+        if (arg_count >= 2)
+            pos = PyTuple_GET_ITEM(args, 1);
+        if (arg_count >= 3)
+            endpos = PyTuple_GET_ITEM(args, 2);
+        if (arg_count >= 4)
+            concurrent = PyTuple_GET_ITEM(args, 3);
+        if (arg_count >= 5)
+            partial = PyTuple_GET_ITEM(args, 4);
+    } else if (!PyArg_ParseTupleAndKeywords(args, kwargs, args_desc, kwlist,
+      &string, &pos, &endpos, &concurrent, &partial))
+        return NULL;
+
+    start = as_string_index(pos, 0);
+    if (start == -1 && PyErr_Occurred())
+        return NULL;
+
+    end = as_string_index(endpos, PY_SSIZE_T_MAX);
+    if (end == -1 && PyErr_Occurred())
+        return NULL;
+
+    conc = decode_concurrent(concurrent);
+    if (conc < 0)
+        return NULL;
+
+    part = decode_partial(partial);
+
+    /* The MatchObject, and therefore repeated captures, will be visible. */
+    if (!state_init(&state, self, string, start, end, FALSE, conc, part, FALSE,
+      TRUE, match_all))
+        return NULL;
+
+    /* Initialise the "safe state" structure. */
+    safe_state.re_state = &state;
+    safe_state.thread_state = NULL;
+
+    status = do_match(&safe_state, search);
+
+    if (status >= 0 || status == RE_ERROR_PARTIAL)
+        /* Create the match object. pattern_new_match returns None for a
+         * status of 0.
+         */
+        match = pattern_new_match(self, &state, status);
+    else
+        match = NULL;
+
+    state_fini(&state);
+
+    return match;
+}
+
+/* PatternObject's 'match' method.
+ *
+ * Calls pattern_search_or_match with both 'search' and 'match_all' FALSE.
+ */
+static PyObject* pattern_match(PatternObject* self, PyObject* args, PyObject*
+  kwargs) {
+    return pattern_search_or_match(self, args, kwargs, "O|OOOO:match", FALSE,
+      FALSE);
+}
+
+/* PatternObject's 'fullmatch' method.
+ *
+ * Calls pattern_search_or_match with 'search' FALSE and 'match_all' TRUE.
+ */
+static PyObject* pattern_fullmatch(PatternObject* self, PyObject* args,
+  PyObject* kwargs) {
+    return pattern_search_or_match(self, args, kwargs, "O|OOOO:fullmatch",
+      FALSE, TRUE);
+}
+
+/* PatternObject's 'search' method.
+ *
+ * Calls pattern_search_or_match with 'search' TRUE and 'match_all' FALSE.
+ */
+static PyObject* pattern_search(PatternObject* self, PyObject* args, PyObject*
+  kwargs) {
+    return pattern_search_or_match(self, args, kwargs, "O|OOOO:search", TRUE,
+      FALSE);
+}
+
+/* Gets the limits of the matching.
+ *
+ * 'pos' and 'endpos' are converted with as_string_index, using 0 and
+ * PY_SSIZE_T_MAX as the respective second arguments. A negative value is
+ * taken relative to 'length', and both results are then clamped to the range
+ * 0..length before being stored in '*start' and '*end'.
+ *
+ * Returns FALSE if either conversion raised an exception, otherwise TRUE.
+ */
+Py_LOCAL_INLINE(BOOL) get_limits(PyObject* pos, PyObject* endpos, Py_ssize_t
+  length, Py_ssize_t* start, Py_ssize_t* end) {
+    Py_ssize_t s;
+    Py_ssize_t e;
+
+    s = as_string_index(pos, 0);
+    if (s == -1 && PyErr_Occurred())
+        return FALSE;
+
+    e = as_string_index(endpos, PY_SSIZE_T_MAX);
+    if (e == -1 && PyErr_Occurred())
+        return FALSE;
+
+    /* Adjust boundaries. */
+    if (s < 0)
+        s += length;
+    if (s < 0)
+        s = 0;
+    else if (s > length)
+        s = length;
+
+    if (e < 0)
+        e += length;
+    if (e < 0)
+        e = 0;
+    else if (e > length)
+        e = length;
+
+    *start = s;
+    *end = e;
+
+    return TRUE;
+}
+
+/* Gets a replacement item from the replacement list.
+ *
+ * The replacement item could be a string literal or a group.
+ *
+ * It can return None to represent an empty string.
+ */
+Py_LOCAL_INLINE(PyObject*) get_sub_replacement(PyObject* item, PyObject*
+  string, RE_State* state, size_t group_count) {
+    Py_ssize_t group_no;
+    RE_GroupData* captured;
+
+    /* A literal piece of the template is handed back unchanged, as a new
+     * reference.
+     */
+    if (PyUnicode_CheckExact(item) || PyString_CheckExact(item)) {
+        Py_INCREF(item);
+        return item;
+    }
+
+    /* Anything else has to be a group reference. */
+    group_no = as_group_index(item);
+    if (group_no == -1 && PyErr_Occurred()) {
+        set_error(RE_ERROR_REPLACEMENT, NULL);
+        return NULL;
+    }
+
+    if (group_no == 0) {
+        /* Group 0 is the entire matched portion of the string. */
+        Py_ssize_t low;
+        Py_ssize_t high;
+
+        if (state->match_pos == state->text_pos) {
+            /* A zero-width match is reported as None rather than "". */
+            Py_INCREF(Py_None);
+            return Py_None;
+        }
+
+        /* A reverse search ends at text_pos, below where it started. */
+        if (state->reverse) {
+            low = state->text_pos;
+            high = state->match_pos;
+        } else {
+            low = state->match_pos;
+            high = state->text_pos;
+        }
+
+        return get_slice(string, low, high);
+    }
+
+    if (group_no < 1 || (size_t)group_no > group_count) {
+        /* No such group. */
+        set_error(RE_ERROR_INVALID_GROUP_REF, NULL);
+        return NULL;
+    }
+
+    captured = &state->groups[group_no - 1];
+
+    /* None stands in for "" when the group has no captures and its stored
+     * span is not empty.
+     */
+    if (captured->capture_count == 0 && captured->span.start !=
+      captured->span.end) {
+        Py_INCREF(Py_None);
+        return Py_None;
+    }
+
+    return get_slice(string, captured->span.start, captured->span.end);
+}
+
+/* PatternObject's 'subx' method.
*/ +Py_LOCAL_INLINE(PyObject*) pattern_subx(PatternObject* self, PyObject* + str_template, PyObject* string, Py_ssize_t maxsub, int sub_type, PyObject* + pos, PyObject* endpos, int concurrent) { + RE_StringInfo str_info; + Py_ssize_t start; + Py_ssize_t end; + BOOL is_callable = FALSE; + PyObject* replacement = NULL; + BOOL is_literal = FALSE; +#if PY_VERSION_HEX >= 0x02060000 + BOOL is_format = FALSE; +#endif + BOOL is_template = FALSE; + RE_State state; + RE_SafeState safe_state; + JoinInfo join_info; + Py_ssize_t sub_count; + Py_ssize_t last_pos; + Py_ssize_t step; + PyObject* item; + MatchObject* match; +#if PY_VERSION_HEX >= 0x02060000 + BOOL built_capture = FALSE; +#endif + PyObject* args; + PyObject* kwargs; + Py_ssize_t end_pos; + + /* Get the string. */ + if (!get_string(string, &str_info)) + return NULL; + + /* Get the limits of the search. */ + if (!get_limits(pos, endpos, str_info.length, &start, &end)) { +#if PY_VERSION_HEX >= 0x02060000 + release_buffer(&str_info); + +#endif + return NULL; + } + + /* If the pattern is too long for the string, then take a shortcut, unless + * it's a fuzzy pattern. + */ + if (!self->is_fuzzy && self->min_width > end - start) { + PyObject* result; + + Py_INCREF(string); + + if (sub_type & RE_SUBN) + result = Py_BuildValue("Nn", string, 0); + else + result = string; + +#if PY_VERSION_HEX >= 0x02060000 + release_buffer(&str_info); + +#endif + return result; + } + + if (maxsub == 0) + maxsub = PY_SSIZE_T_MAX; + + /* sub/subn takes either a function or a string template. */ + if (PyCallable_Check(str_template)) { + /* It's callable. */ + is_callable = TRUE; + + replacement = str_template; + Py_INCREF(replacement); +#if PY_VERSION_HEX >= 0x02060000 + } else if (sub_type & RE_SUBF) { + /* Is it a literal format? + * + * To keep it simple we'll say that a literal is a string which can be + * used as-is, so no placeholders. 
+ */ + Py_ssize_t literal_length; + + literal_length = check_replacement_string(str_template, '{'); + if (literal_length > 0) { + /* It's a literal. */ + is_literal = TRUE; + + replacement = str_template; + Py_INCREF(replacement); + } else if (literal_length < 0) { + /* It isn't a literal, so get the 'format' method. */ + is_format = TRUE; + + replacement = PyObject_GetAttrString(str_template, "format"); + if (!replacement) { + release_buffer(&str_info); + return NULL; + } + } +#endif + } else { + /* Is it a literal template? + * + * To keep it simple we'll say that a literal is a string which can be + * used as-is, so no backslashes. + */ + Py_ssize_t literal_length; + + literal_length = check_replacement_string(str_template, '\\'); + if (literal_length > 0) { + /* It's a literal. */ + is_literal = TRUE; + + replacement = str_template; + Py_INCREF(replacement); + } else if (literal_length < 0 ) { + /* It isn't a literal, so hand it over to the template compiler. */ + is_template = TRUE; + + replacement = call(RE_MODULE, "_compile_replacement_helper", + PyTuple_Pack(2, self, str_template)); + if (!replacement) { +#if PY_VERSION_HEX >= 0x02060000 + release_buffer(&str_info); + +#endif + return NULL; + } + } + } + + /* The MatchObject, and therefore repeated captures, will be visible only + * if the replacement is callable or subf is used. + */ +#if PY_VERSION_HEX >= 0x02060000 + if (!state_init_2(&state, self, string, &str_info, start, end, FALSE, + concurrent, FALSE, FALSE, is_callable || (sub_type & RE_SUBF) != 0, + FALSE)) { + release_buffer(&str_info); + +#else + if (!state_init_2(&state, self, string, &str_info, start, end, FALSE, + concurrent, FALSE, FALSE, is_callable, FALSE)) { +#endif + Py_XDECREF(replacement); + return NULL; + } + + /* Initialise the "safe state" structure. 
*/ + safe_state.re_state = &state; + safe_state.thread_state = NULL; + + init_join_list(&join_info, state.reverse, PyUnicode_Check(string)); + + sub_count = 0; + last_pos = state.reverse ? state.text_length : 0; + step = state.reverse ? -1 : 1; + while (sub_count < maxsub) { + int status; + + status = do_match(&safe_state, TRUE); + if (status < 0) + goto error; + + if (status == 0) + break; + + /* Append the segment before this match. */ + if (state.match_pos != last_pos) { + if (state.reverse) + item = get_slice(string, state.match_pos, last_pos); + else + item = get_slice(string, last_pos, state.match_pos); + if (!item) + goto error; + + /* Add to the list. */ + status = add_to_join_list(&join_info, item); + Py_DECREF(item); + if (status < 0) + goto error; + } + + /* Add this match. */ + if (is_literal) { + /* The replacement is a literal string. */ + status = add_to_join_list(&join_info, replacement); + if (status < 0) + goto error; +#if PY_VERSION_HEX >= 0x02060000 + } else if (is_format) { + /* The replacement is a format string. */ + size_t g; + + /* We need to create the arguments for the 'format' method. We'll + * start by creating a MatchObject. + */ + match = (MatchObject*)pattern_new_match(self, &state, 1); + if (!match) + goto error; + + /* We'll build the args and kwargs the first time. They'll be using + * capture objects which refer to the match object indirectly; this + * means that args and kwargs can be reused with different match + * objects. + */ + if (!built_capture) { + /* The args are a tuple of the capture group matches. */ + args = PyTuple_New(match->group_count + 1); + if (!args) { + Py_DECREF(match); + goto error; + } + + for (g = 0; g < match->group_count + 1; g++) + /* PyTuple_SetItem borrows the reference. */ + PyTuple_SetItem(args, (Py_ssize_t)g, + make_capture_object(&match, (Py_ssize_t)g)); + + /* The kwargs are a dict of the named capture group matches. 
*/ + kwargs = make_capture_dict(match, &match); + if (!kwargs) { + Py_DECREF(args); + Py_DECREF(match); + goto error; + } + + built_capture = TRUE; + } + + /* Call the 'format' method. */ + item = PyObject_Call(replacement, args, kwargs); + + Py_DECREF(match); + if (!item) + goto error; + + /* Add the result to the list. */ + status = add_to_join_list(&join_info, item); + Py_DECREF(item); + if (status < 0) + goto error; +#endif + } else if (is_template) { + /* The replacement is a list template. */ + Py_ssize_t count; + Py_ssize_t index; + Py_ssize_t step; + + /* Add each part of the template to the list. */ + count = PyList_GET_SIZE(replacement); + if (join_info.reversed) { + /* We're searching backwards, so we'll be reversing the list + * when it's complete. Therefore, we need to add the items of + * the template in reverse order for them to be in the correct + * order after the reversal. + */ + index = count - 1; + step = -1; + } else { + /* We're searching forwards. */ + index = 0; + step = 1; + } + + while (count > 0) { + PyObject* item; + PyObject* str_item; + + /* PyList_GET_ITEM borrows a reference. */ + item = PyList_GET_ITEM(replacement, index); + str_item = get_sub_replacement(item, string, &state, + self->public_group_count); + if (!str_item) + goto error; + + /* Add the result to the list. */ + if (str_item == Py_None) + /* None for "". */ + Py_DECREF(str_item); + else { + status = add_to_join_list(&join_info, str_item); + Py_DECREF(str_item); + if (status < 0) + goto error; + } + + --count; + index += step; + } + } else if (is_callable) { + /* Pass a MatchObject to the replacement function. */ + PyObject* match; + PyObject* args; + + /* We need to create a MatchObject to pass to the replacement + * function. + */ + match = pattern_new_match(self, &state, 1); + if (!match) + goto error; + + /* The args for the replacement function. 
*/ + args = PyTuple_Pack(1, match); + if (!args) { + Py_DECREF(match); + goto error; + } + + /* Call the replacement function. */ + item = PyObject_CallObject(replacement, args); + Py_DECREF(args); + Py_DECREF(match); + if (!item) + goto error; + + /* Add the result to the list. */ + status = add_to_join_list(&join_info, item); + Py_DECREF(item); + if (status < 0) + goto error; + } + + ++sub_count; + + last_pos = state.text_pos; + + if (state.version_0) { + /* Always advance after a zero-width match. */ + if (state.match_pos == state.text_pos) { + state.text_pos += step; + state.must_advance = FALSE; + } else + state.must_advance = TRUE; + } else + /* Continue from where we left off, but don't allow a contiguous + * zero-width match. + */ + state.must_advance = state.match_pos == state.text_pos; + } + + /* Get the segment following the last match. We use 'length' instead of + * 'text_length' because the latter is truncated to 'slice_end', a + * documented idiosyncracy of the 're' module. + */ + end_pos = state.reverse ? 0 : str_info.length; + if (last_pos != end_pos) { + int status; + + /* The segment is part of the original string. */ + if (state.reverse) + item = get_slice(string, 0, last_pos); + else + item = get_slice(string, last_pos, str_info.length); + if (!item) + goto error; + + status = add_to_join_list(&join_info, item); + Py_DECREF(item); + if (status < 0) + goto error; + } + + Py_XDECREF(replacement); + + /* Convert the list to a single string (also cleans up join_info). 
*/
+    item = join_list_info(&join_info);
+
+    state_fini(&state);
+
+#if PY_VERSION_HEX >= 0x02060000
+    if (built_capture) {
+        Py_DECREF(kwargs);
+        Py_DECREF(args);
+    }
+
+#endif
+    if (!item)
+        return NULL;
+
+    if (sub_type & RE_SUBN)
+        return Py_BuildValue("Nn", item, sub_count);
+
+    return item;
+
+error:
+#if PY_VERSION_HEX >= 0x02060000
+    if (built_capture) {
+        Py_DECREF(kwargs);
+        Py_DECREF(args);
+    }
+
+#endif
+    clear_join_list(&join_info);
+    state_fini(&state);
+    Py_XDECREF(replacement);
+    return NULL;
+}
+
+/* PatternObject's 'sub' method.
+ *
+ * Parses (repl, string, count=0, pos=None, endpos=None, concurrent=None),
+ * decodes 'concurrent' and hands the work to pattern_subx() with RE_SUB.
+ *
+ * Returns NULL if the arguments can't be parsed or if decode_concurrent()
+ * reports an error (a negative value); otherwise returns whatever
+ * pattern_subx() returns.
+ */
+static PyObject* pattern_sub(PatternObject* self, PyObject* args, PyObject*
+  kwargs) {
+    int conc;
+
+    PyObject* replacement;
+    PyObject* string;
+    Py_ssize_t count = 0;
+    PyObject* pos = Py_None;
+    PyObject* endpos = Py_None;
+    PyObject* concurrent = Py_None;
+    static char* kwlist[] = { "repl", "string", "count", "pos", "endpos",
+      "concurrent", NULL };
+    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|nOOO:sub", kwlist,
+      &replacement, &string, &count, &pos, &endpos, &concurrent))
+        return NULL;
+
+    conc = decode_concurrent(concurrent);
+    if (conc < 0)
+        return NULL;
+
+    return pattern_subx(self, replacement, string, count, RE_SUB, pos, endpos,
+      conc);
+}
+
+#if PY_VERSION_HEX >= 0x02060000
+/* PatternObject's 'subf' method.
*/ +static PyObject* pattern_subf(PatternObject* self, PyObject* args, PyObject* + kwargs) { + int conc; + + PyObject* format; + PyObject* string; + Py_ssize_t count = 0; + PyObject* pos = Py_None; + PyObject* endpos = Py_None; + PyObject* concurrent = Py_None; + static char* kwlist[] = { "format", "string", "count", "pos", "endpos", + "concurrent", NULL }; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|nOOO:sub", kwlist, + &format, &string, &count, &pos, &endpos, &concurrent)) + return NULL; + + conc = decode_concurrent(concurrent); + if (conc < 0) + return NULL; + + return pattern_subx(self, format, string, count, RE_SUBF, pos, endpos, + conc); +} + +#endif +/* PatternObject's 'subn' method. */ +static PyObject* pattern_subn(PatternObject* self, PyObject* args, PyObject* + kwargs) { + int conc; + + PyObject* replacement; + PyObject* string; + Py_ssize_t count = 0; + PyObject* pos = Py_None; + PyObject* endpos = Py_None; + PyObject* concurrent = Py_None; + static char* kwlist[] = { "repl", "string", "count", "pos", "endpos", + "concurrent", NULL }; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|nOOO:subn", kwlist, + &replacement, &string, &count, &pos, &endpos, &concurrent)) + return NULL; + + conc = decode_concurrent(concurrent); + if (conc < 0) + return NULL; + + return pattern_subx(self, replacement, string, count, RE_SUBN, pos, endpos, + conc); +} + +#if PY_VERSION_HEX >= 0x02060000 +/* PatternObject's 'subfn' method. 
*/ +static PyObject* pattern_subfn(PatternObject* self, PyObject* args, PyObject* + kwargs) { + int conc; + + PyObject* format; + PyObject* string; + Py_ssize_t count = 0; + PyObject* pos = Py_None; + PyObject* endpos = Py_None; + PyObject* concurrent = Py_None; + static char* kwlist[] = { "format", "string", "count", "pos", "endpos", + "concurrent", NULL }; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|nOOO:subn", kwlist, + &format, &string, &count, &pos, &endpos, &concurrent)) + return NULL; + + conc = decode_concurrent(concurrent); + if (conc < 0) + return NULL; + + return pattern_subx(self, format, string, count, RE_SUBF | RE_SUBN, pos, + endpos, conc); +} + +#endif +/* PatternObject's 'split' method. */ +static PyObject* pattern_split(PatternObject* self, PyObject* args, PyObject* + kwargs) { + int conc; + + RE_State state; + RE_SafeState safe_state; + PyObject* list; + PyObject* item; + int status; + Py_ssize_t split_count; + size_t g; + Py_ssize_t start_pos; + Py_ssize_t end_pos; + Py_ssize_t step; + Py_ssize_t last_pos; + + PyObject* string; + Py_ssize_t maxsplit = 0; + PyObject* concurrent = Py_None; + static char* kwlist[] = { "string", "maxsplit", "concurrent", NULL }; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|nO:split", kwlist, + &string, &maxsplit, &concurrent)) + return NULL; + + if (maxsplit == 0) + maxsplit = PY_SSIZE_T_MAX; + + conc = decode_concurrent(concurrent); + if (conc < 0) + return NULL; + + /* The MatchObject, and therefore repeated captures, will not be visible. + */ + if (!state_init(&state, self, string, 0, PY_SSIZE_T_MAX, FALSE, conc, + FALSE, FALSE, FALSE, FALSE)) + return NULL; + + /* Initialise the "safe state" structure. 
*/ + safe_state.re_state = &state; + safe_state.thread_state = NULL; + + list = PyList_New(0); + if (!list) { + state_fini(&state); + return NULL; + } + + split_count = 0; + if (state.reverse) { + start_pos = state.text_length; + end_pos = 0; + step = -1; + } else { + start_pos = 0; + end_pos = state.text_length; + step = 1; + } + + last_pos = start_pos; + while (split_count < maxsplit) { + status = do_match(&safe_state, TRUE); + if (status < 0) + goto error; + + if (status == 0) + /* No more matches. */ + break; + + if (state.version_0) { + /* Version 0 behaviour is to advance one character if the split was + * zero-width. Unfortunately, this can give an incorrect result. + * GvR wants this behaviour to be retained so as not to break any + * existing software which might rely on it. + */ + if (state.text_pos == state.match_pos) { + if (last_pos == end_pos) + break; + + /* Advance one character. */ + state.text_pos += step; + state.must_advance = FALSE; + continue; + } + } + + /* Get segment before this match. */ + if (state.reverse) + item = get_slice(string, state.match_pos, last_pos); + else + item = get_slice(string, last_pos, state.match_pos); + if (!item) + goto error; + + status = PyList_Append(list, item); + Py_DECREF(item); + if (status < 0) + goto error; + + /* Add groups (if any). */ + for (g = 1; g <= self->public_group_count; g++) { + item = state_get_group(&state, (Py_ssize_t)g, string, FALSE); + if (!item) + goto error; + + status = PyList_Append(list, item); + Py_DECREF(item); + if (status < 0) + goto error; + } + + ++split_count; + last_pos = state.text_pos; + + /* Version 0 behaviour is to advance one character if the match was + * zero-width. Unfortunately, this can give an incorrect result. GvR + * wants this behaviour to be retained so as not to break any existing + * software which might rely on it. + */ + if (state.version_0) { + if (state.text_pos == state.match_pos) + /* Advance one character. 
*/ + state.text_pos += step; + + state.must_advance = FALSE; + } else + /* Continue from where we left off, but don't allow a contiguous + * zero-width match. + */ + state.must_advance = TRUE; + } + + /* Get segment following last match (even if empty). */ + if (state.reverse) + item = get_slice(string, 0, last_pos); + else + item = get_slice(string, last_pos, state.text_length); + if (!item) + goto error; + + status = PyList_Append(list, item); + Py_DECREF(item); + if (status < 0) + goto error; + + state_fini(&state); + + return list; + +error: + Py_DECREF(list); + state_fini(&state); + return NULL; +} + +/* PatternObject's 'splititer' method. */ +static PyObject* pattern_splititer(PatternObject* pattern, PyObject* args, + PyObject* kwargs) { + return pattern_splitter(pattern, args, kwargs); +} + +/* PatternObject's 'findall' method. */ +static PyObject* pattern_findall(PatternObject* self, PyObject* args, PyObject* + kwargs) { + Py_ssize_t start; + Py_ssize_t end; + int conc; + RE_State state; + RE_SafeState safe_state; + PyObject* list; + Py_ssize_t step; + int status; + Py_ssize_t b; + Py_ssize_t e; + size_t g; + + PyObject* string; + PyObject* pos = Py_None; + PyObject* endpos = Py_None; + Py_ssize_t overlapped = FALSE; + PyObject* concurrent = Py_None; + static char* kwlist[] = { "string", "pos", "endpos", "overlapped", + "concurrent", NULL }; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|OOnO:findall", kwlist, + &string, &pos, &endpos, &overlapped, &concurrent)) + return NULL; + + start = as_string_index(pos, 0); + if (start == -1 && PyErr_Occurred()) + return NULL; + + end = as_string_index(endpos, PY_SSIZE_T_MAX); + if (end == -1 && PyErr_Occurred()) + return NULL; + + conc = decode_concurrent(concurrent); + if (conc < 0) + return NULL; + + /* The MatchObject, and therefore repeated captures, will not be visible. 
+ */ + if (!state_init(&state, self, string, start, end, overlapped != 0, conc, + FALSE, FALSE, FALSE, FALSE)) + return NULL; + + /* Initialise the "safe state" structure. */ + safe_state.re_state = &state; + safe_state.thread_state = NULL; + + list = PyList_New(0); + if (!list) { + state_fini(&state); + return NULL; + } + + step = state.reverse ? -1 : 1; + while (state.slice_start <= state.text_pos && state.text_pos <= + state.slice_end) { + PyObject* item; + + status = do_match(&safe_state, TRUE); + if (status < 0) + goto error; + + if (status == 0) + break; + + /* Don't bother to build a MatchObject. */ + switch (self->public_group_count) { + case 0: + if (state.reverse) { + b = state.text_pos; + e = state.match_pos; + } else { + b = state.match_pos; + e = state.text_pos; + } + item = get_slice(string, b, e); + if (!item) + goto error; + break; + case 1: + item = state_get_group(&state, 1, string, TRUE); + if (!item) + goto error; + break; + default: + item = PyTuple_New((Py_ssize_t)self->public_group_count); + if (!item) + goto error; + + for (g = 0; g < self->public_group_count; g++) { + PyObject* o; + + o = state_get_group(&state, (Py_ssize_t)g + 1, string, TRUE); + if (!o) { + Py_DECREF(item); + goto error; + } + + /* PyTuple_SET_ITEM borrows the reference. */ + PyTuple_SET_ITEM(item, g, o); + } + break; + } + + status = PyList_Append(list, item); + Py_DECREF(item); + if (status < 0) + goto error; + + if (state.overlapped) { + /* Advance one character. */ + state.text_pos = state.match_pos + step; + state.must_advance = FALSE; + } else + /* Continue from where we left off, but don't allow 2 contiguous + * zero-width matches. + */ + state.must_advance = state.text_pos == state.match_pos; + } + + state_fini(&state); + + return list; + +error: + Py_DECREF(list); + state_fini(&state); + return NULL; +} + +/* PatternObject's 'finditer' method. 
*/ +static PyObject* pattern_finditer(PatternObject* pattern, PyObject* args, + PyObject* kwargs) { + return pattern_scanner(pattern, args, kwargs); +} + +/* Makes a copy of a PatternObject. */ +Py_LOCAL_INLINE(PyObject*) make_pattern_copy(PatternObject* self) { + Py_INCREF(self); + return (PyObject*)self; +} + +/* PatternObject's '__copy__' method. */ +static PyObject* pattern_copy(PatternObject* self, PyObject *unused) { + return make_pattern_copy(self); +} + +/* PatternObject's '__deepcopy__' method. */ +static PyObject* pattern_deepcopy(PatternObject* self, PyObject* memo) { + return make_pattern_copy(self); +} + +/* The documentation of a PatternObject. */ +PyDoc_STRVAR(pattern_match_doc, + "match(string, pos=None, endpos=None, concurrent=None) --> MatchObject or None.\n\ + Match zero or more characters at the beginning of the string."); + +PyDoc_STRVAR(pattern_fullmatch_doc, + "fullmatch(string, pos=None, endpos=None, concurrent=None) --> MatchObject or None.\n\ + Match zero or more characters against all of the string."); + +PyDoc_STRVAR(pattern_search_doc, + "search(string, pos=None, endpos=None, concurrent=None) --> MatchObject or None.\n\ + Search through string looking for a match, and return a corresponding\n\ + match object instance. 
Return None if no match is found."); + +PyDoc_STRVAR(pattern_sub_doc, + "sub(repl, string, count=0, flags=0, pos=None, endpos=None, concurrent=None) --> newstring\n\ + Return the string obtained by replacing the leftmost (or rightmost with a\n\ + reverse pattern) non-overlapping occurrences of pattern in string by the\n\ + replacement repl."); + +#if PY_VERSION_HEX >= 0x02060000 +PyDoc_STRVAR(pattern_subf_doc, + "subf(format, string, count=0, flags=0, pos=None, endpos=None, concurrent=None) --> newstring\n\ + Return the string obtained by replacing the leftmost (or rightmost with a\n\ + reverse pattern) non-overlapping occurrences of pattern in string by the\n\ + replacement format."); + +#endif +PyDoc_STRVAR(pattern_subn_doc, + "subn(repl, string, count=0, flags=0, pos=None, endpos=None, concurrent=None) --> (newstring, number of subs)\n\ + Return the tuple (new_string, number_of_subs_made) found by replacing the\n\ + leftmost (or rightmost with a reverse pattern) non-overlapping occurrences\n\ + of pattern with the replacement repl."); + +#if PY_VERSION_HEX >= 0x02060000 +PyDoc_STRVAR(pattern_subfn_doc, + "subfn(format, string, count=0, flags=0, pos=None, endpos=None, concurrent=None) --> (newstring, number of subs)\n\ + Return the tuple (new_string, number_of_subs_made) found by replacing the\n\ + leftmost (or rightmost with a reverse pattern) non-overlapping occurrences\n\ + of pattern with the replacement format."); + +#endif +PyDoc_STRVAR(pattern_split_doc, + "split(string, string, maxsplit=0, concurrent=None) --> list.\n\ + Split string by the occurrences of pattern."); + +PyDoc_STRVAR(pattern_splititer_doc, + "splititer(string, maxsplit=0, concurrent=None) --> iterator.\n\ + Return an iterator yielding the parts of a split string."); + +PyDoc_STRVAR(pattern_findall_doc, + "findall(string, pos=None, endpos=None, overlapped=False, concurrent=None) --> list.\n\ + Return a list of all matches of pattern in string. 
The matches may be\n\ + overlapped if overlapped is True."); + +PyDoc_STRVAR(pattern_finditer_doc, + "finditer(string, pos=None, endpos=None, overlapped=False, concurrent=None) --> iterator.\n\ + Return an iterator over all matches for the RE pattern in string. The\n\ + matches may be overlapped if overlapped is True. For each match, the\n\ + iterator returns a MatchObject."); + +PyDoc_STRVAR(pattern_scanner_doc, + "scanner(string, pos=None, endpos=None, overlapped=False, concurrent=None) --> scanner.\n\ + Return an scanner for the RE pattern in string. The matches may be overlapped\n\ + if overlapped is True."); + +/* The methods of a PatternObject. */ +static PyMethodDef pattern_methods[] = { + {"match", (PyCFunction)pattern_match, METH_VARARGS|METH_KEYWORDS, + pattern_match_doc}, + {"fullmatch", (PyCFunction)pattern_fullmatch, METH_VARARGS|METH_KEYWORDS, + pattern_fullmatch_doc}, + {"search", (PyCFunction)pattern_search, METH_VARARGS|METH_KEYWORDS, + pattern_search_doc}, + {"sub", (PyCFunction)pattern_sub, METH_VARARGS|METH_KEYWORDS, + pattern_sub_doc}, +#if PY_VERSION_HEX >= 0x02060000 + {"subf", (PyCFunction)pattern_subf, METH_VARARGS|METH_KEYWORDS, + pattern_subf_doc}, +#endif + {"subn", (PyCFunction)pattern_subn, METH_VARARGS|METH_KEYWORDS, + pattern_subn_doc}, +#if PY_VERSION_HEX >= 0x02060000 + {"subfn", (PyCFunction)pattern_subfn, METH_VARARGS|METH_KEYWORDS, + pattern_subfn_doc}, +#endif + {"split", (PyCFunction)pattern_split, METH_VARARGS|METH_KEYWORDS, + pattern_split_doc}, + {"splititer", (PyCFunction)pattern_splititer, METH_VARARGS|METH_KEYWORDS, + pattern_splititer_doc}, + {"findall", (PyCFunction)pattern_findall, METH_VARARGS|METH_KEYWORDS, + pattern_findall_doc}, + {"finditer", (PyCFunction)pattern_finditer, METH_VARARGS|METH_KEYWORDS, + pattern_finditer_doc}, + {"scanner", (PyCFunction)pattern_scanner, METH_VARARGS|METH_KEYWORDS, + pattern_scanner_doc}, + {"__copy__", (PyCFunction)pattern_copy, METH_NOARGS}, + {"__deepcopy__", 
(PyCFunction)pattern_deepcopy, METH_O}, + {NULL, NULL} +}; + +PyDoc_STRVAR(pattern_doc, "Compiled regex object"); + +/* Deallocates a PatternObject. */ +static void pattern_dealloc(PyObject* self_) { + PatternObject* self; + size_t i; + int partial_side; + + self = (PatternObject*)self_; + + /* Discard the nodes. */ + for (i = 0; i < self->node_count; i++) { + RE_Node* node; + + node = self->node_list[i]; + re_dealloc(node->values); + if (node->status & RE_STATUS_STRING) { + re_dealloc(node->string.bad_character_offset); + re_dealloc(node->string.good_suffix_offset); + } + re_dealloc(node); + } + re_dealloc(self->node_list); + + /* Discard the group info. */ + re_dealloc(self->group_info); + + /* Discard the call_ref info. */ + re_dealloc(self->call_ref_info); + + /* Discard the repeat info. */ + re_dealloc(self->repeat_info); + + dealloc_groups(self->groups_storage, self->true_group_count); + + dealloc_repeats(self->repeats_storage, self->repeat_count); + + if (self->weakreflist) + PyObject_ClearWeakRefs((PyObject*)self); + Py_XDECREF(self->pattern); + Py_XDECREF(self->groupindex); + Py_XDECREF(self->indexgroup); + + for (partial_side = 0; partial_side < 2; partial_side++) { + if (self->partial_named_lists[partial_side]) { + for (i = 0; i < self->named_lists_count; i++) + Py_XDECREF(self->partial_named_lists[partial_side][i]); + + re_dealloc(self->partial_named_lists[partial_side]); + } + } + + Py_DECREF(self->named_lists); + Py_DECREF(self->named_list_indexes); + re_dealloc(self->locale_info); + PyObject_DEL(self); +} + +/* Info about the various flags that can be passed in. */ +typedef struct RE_FlagName { + char* name; + int value; +} RE_FlagName; + +/* We won't bother about the A flag in Python 2. 
*/ +static RE_FlagName flag_names[] = { + {"B", RE_FLAG_BESTMATCH}, + {"D", RE_FLAG_DEBUG}, + {"S", RE_FLAG_DOTALL}, + {"F", RE_FLAG_FULLCASE}, + {"I", RE_FLAG_IGNORECASE}, + {"L", RE_FLAG_LOCALE}, + {"M", RE_FLAG_MULTILINE}, + {"P", RE_FLAG_POSIX}, + {"R", RE_FLAG_REVERSE}, + {"T", RE_FLAG_TEMPLATE}, + {"U", RE_FLAG_UNICODE}, + {"X", RE_FLAG_VERBOSE}, + {"V0", RE_FLAG_VERSION0}, + {"V1", RE_FLAG_VERSION1}, + {"W", RE_FLAG_WORD}, +}; + +/* Appends a string to a list. */ +Py_LOCAL_INLINE(BOOL) append_string(PyObject* list, char* string) { + PyObject* item; + int status; + + item = Py_BuildValue("s", string); + if (!item) + return FALSE; + + status = PyList_Append(list, item); + Py_DECREF(item); + if (status < 0) + return FALSE; + + return TRUE; +} + +/* Appends a (decimal) integer to a list. */ +Py_LOCAL_INLINE(BOOL) append_integer(PyObject* list, Py_ssize_t value) { + PyObject* int_obj; + PyObject* repr_obj; + int status; + + int_obj = Py_BuildValue("n", value); + if (!int_obj) + return FALSE; + + repr_obj = PyObject_Repr(int_obj); + Py_DECREF(int_obj); + if (!repr_obj) + return FALSE; + + status = PyList_Append(list, repr_obj); + Py_DECREF(repr_obj); + if (status < 0) + return FALSE; + + return TRUE; +} + +/* MatchObject's '__repr__' method. */ +static PyObject* match_repr(PyObject* self_) { + MatchObject* self; + PyObject* list; + PyObject* matched_substring; + PyObject* matched_repr; + int status; + PyObject* separator; + PyObject* result; + + self = (MatchObject*)self_; + + list = PyList_New(0); + if (!list) + return NULL; + + if (!append_string(list, "<regex.Match object; span=(")) + goto error; + + if (!append_integer(list, self->match_start)) + goto error; + + if (! 
append_string(list, ", ")) + goto error; + + if (!append_integer(list, self->match_end)) + goto error; + + if (!append_string(list, "), match=")) + goto error; + + matched_substring = get_slice(self->substring, self->match_start - + self->substring_offset, self->match_end - self->substring_offset); + if (!matched_substring) + goto error; + + matched_repr = PyObject_Repr(matched_substring); + Py_DECREF(matched_substring); + if (!matched_repr) + goto error; + + status = PyList_Append(list, matched_repr); + Py_DECREF(matched_repr); + if (status < 0) + goto error; + + if (self->fuzzy_counts[RE_FUZZY_SUB] != 0 || + self->fuzzy_counts[RE_FUZZY_INS] != 0 || self->fuzzy_counts[RE_FUZZY_DEL] + != 0) { + if (! append_string(list, ", fuzzy_counts=(")) + goto error; + + if (!append_integer(list, + (Py_ssize_t)self->fuzzy_counts[RE_FUZZY_SUB])) + goto error; + + if (! append_string(list, ", ")) + goto error; + + if (!append_integer(list, + (Py_ssize_t)self->fuzzy_counts[RE_FUZZY_INS])) + goto error; + + if (! append_string(list, ", ")) + goto error; + if (!append_integer(list, + (Py_ssize_t)self->fuzzy_counts[RE_FUZZY_DEL])) + goto error; + + if (! append_string(list, ")")) + goto error; + } + + if (self->partial) { + if (!append_string(list, ", partial=True")) + goto error; + } + + if (! append_string(list, ">")) + goto error; + + separator = Py_BuildValue("s", ""); + if (!separator) + goto error; + + result = PyUnicode_Join(separator, list); + Py_DECREF(separator); + Py_DECREF(list); + + return result; + +error: + Py_DECREF(list); + return NULL; +} + +/* PatternObject's '__repr__' method. 
*/
+/* Builds "regex.Regex(<pattern repr>[, flags=regex.X | regex.Y][, name=repr])"
+ * by collecting the pieces in a list and joining them with an empty
+ * separator. Returns NULL if any piece can't be built.
+ */
+static PyObject* pattern_repr(PyObject* self_) {
+    PatternObject* self = (PatternObject*)self_;
+    PyObject* list;
+    PyObject* piece;
+    PyObject* separator;
+    PyObject* result;
+    PyObject *key;
+    PyObject *value;
+    Py_ssize_t pos;
+    unsigned int i;
+    int flag_count;
+    int appended;
+
+    list = PyList_New(0);
+    if (!list)
+        return NULL;
+
+    if (!append_string(list, "regex.Regex("))
+        goto error;
+
+    piece = PyObject_Repr(self->pattern);
+    if (!piece)
+        goto error;
+
+    appended = PyList_Append(list, piece);
+    Py_DECREF(piece);
+    if (appended < 0)
+        goto error;
+
+    /* The flags that are set, separated by " | ". */
+    flag_count = 0;
+    for (i = 0; i < sizeof(flag_names) / sizeof(flag_names[0]); i++) {
+        if (!(self->flags & flag_names[i].value))
+            continue;
+
+        /* The first flag is introduced; later ones are separated. */
+        if (!append_string(list, flag_count == 0 ? ", flags=" : " | "))
+            goto error;
+
+        if (!append_string(list, "regex.") ||
+          !append_string(list, flag_names[i].name))
+            goto error;
+
+        ++flag_count;
+    }
+
+    /* The named lists, as "key=repr(value)". PyDict_Next borrows references,
+     * so neither 'key' nor 'value' is released.
+     */
+    pos = 0;
+    while (PyDict_Next(self->named_lists, &pos, &key, &value)) {
+        if (!append_string(list, ", ") ||
+          PyList_Append(list, key) < 0 ||
+          !append_string(list, "="))
+            goto error;
+
+        piece = PyObject_Repr(value);
+        if (!piece)
+            goto error;
+
+        appended = PyList_Append(list, piece);
+        Py_DECREF(piece);
+        if (appended < 0)
+            goto error;
+    }
+
+    if (!append_string(list, ")"))
+        goto error;
+
+    separator = Py_BuildValue("s", "");
+    if (!separator)
+        goto error;
+
+    result = PyUnicode_Join(separator, list);
+    Py_DECREF(separator);
+    Py_DECREF(list);
+
+    return result;
+
+error:
+    Py_DECREF(list);
+    return NULL;
+}
+
+/* PatternObject's 'groupindex' method.
*/
+/* Getter: returns a new copy of the pattern's groupindex dict, so callers
+ * can't modify the pattern's own dict through it.
+ */
+static PyObject* pattern_groupindex(PyObject* self_) {
+    PatternObject* self;
+
+    self = (PatternObject*)self_;
+
+    return PyDict_Copy(self->groupindex);
+}
+
+/* The computed attributes of a PatternObject (there's no setter). */
+static PyGetSetDef pattern_getset[] = {
+    {"groupindex", (getter)pattern_groupindex, (setter)NULL,
+      "A dictionary mapping group names to group numbers."},
+    {NULL} /* Sentinel */
+};
+
+/* The read-only attributes which are read directly from the struct. */
+static PyMemberDef pattern_members[] = {
+    {"pattern", T_OBJECT, offsetof(PatternObject, pattern), READONLY,
+      "The pattern string from which the regex object was compiled."},
+    {"flags", T_PYSSIZET, offsetof(PatternObject, flags), READONLY,
+      "The regex matching flags."},
+    {"groups", T_PYSSIZET, offsetof(PatternObject, public_group_count),
+      READONLY, "The number of capturing groups in the pattern."},
+    {"named_lists", T_OBJECT, offsetof(PatternObject, named_lists), READONLY,
+      "The named lists used by the regex."},
+    {NULL} /* Sentinel */
+};
+
+/* The type object. Only the leading fields are initialised here; the
+ * remaining slots are presumably filled in elsewhere (e.g. at module
+ * initialisation) - TODO confirm.
+ */
+static PyTypeObject Pattern_Type = {
+    PyObject_HEAD_INIT(NULL)
+    0,
+    "_" RE_MODULE "." "Pattern",
+    sizeof(PatternObject)
+};
+
+/* Building the nodes is made simpler by allowing branches to have a single
+ * exit. These need to be removed.
+ *
+ * A "1-way branch" is a RE_OP_BRANCH node whose second destination is NULL.
+ * Every reference to such a node is redirected to the node's first
+ * destination; the bypassed nodes themselves are not freed here.
+ */
+Py_LOCAL_INLINE(void) skip_one_way_branches(PatternObject* pattern) {
+    BOOL modified;
+
+    /* If a node refers to a 1-way branch then make the former refer to the
+     * latter's destination. Repeat until they're all done.
+     */
+    do {
+        size_t i;
+
+        modified = FALSE;
+
+        for (i = 0; i < pattern->node_count; i++) {
+            RE_Node* node;
+            RE_Node* next;
+
+            node = pattern->node_list[i];
+
+            /* Check the first destination. */
+            next = node->next_1.node;
+            if (next && next->op == RE_OP_BRANCH &&
+              !next->nonstring.next_2.node) {
+                node->next_1.node = next->next_1.node;
+                modified = TRUE;
+            }
+
+            /* Check the second destination. */
+            next = node->nonstring.next_2.node;
+            if (next && next->op == RE_OP_BRANCH &&
+              !next->nonstring.next_2.node) {
+                node->nonstring.next_2.node = next->next_1.node;
+                modified = TRUE;
+            }
+        }
+    } while (modified);
+
+    /* The start node might be a 1-way branch. Skip over it because it'll be
+     * removed. It might even be the first in a chain.
+     */
+    while (pattern->start_node->op == RE_OP_BRANCH &&
+      !pattern->start_node->nonstring.next_2.node)
+        pattern->start_node = pattern->start_node->next_1.node;
+}
+
+/* Adds guards to repeats which are followed by a reference to a group.
+ *
+ * Returns whether a guard was added for a node at or after the given node.
+ *
+ * The nodes are walked from 'node', recursing at nodes with 2 destinations.
+ * A node which has been handled is marked with RE_STATUS_VISITED_AG; when a
+ * marked node is reached again, the status recorded in it is returned
+ * instead of walking it a second time.
+ */
+Py_LOCAL_INLINE(RE_STATUS_T) add_repeat_guards(PatternObject* pattern, RE_Node*
+  node) {
+    RE_STATUS_T result;
+
+    result = RE_STATUS_NEITHER;
+
+    for (;;) {
+        if (node->status & RE_STATUS_VISITED_AG)
+            return node->status & (RE_STATUS_REPEAT | RE_STATUS_REF);
+
+        switch (node->op) {
+        case RE_OP_BRANCH:
+        {
+            RE_STATUS_T branch_1_result;
+            RE_STATUS_T branch_2_result;
+            RE_STATUS_T status;
+
+            /* Combine what was seen so far with the results of both
+             * alternatives.
+             */
+            branch_1_result = add_repeat_guards(pattern, node->next_1.node);
+            branch_2_result = add_repeat_guards(pattern,
+              node->nonstring.next_2.node);
+            status = max_status_3(result, branch_1_result, branch_2_result);
+            node->status = RE_STATUS_VISITED_AG | status;
+            return status;
+        }
+        case RE_OP_END_GREEDY_REPEAT:
+        case RE_OP_END_LAZY_REPEAT:
+            node->status |= RE_STATUS_VISITED_AG;
+            return result;
+        case RE_OP_GREEDY_REPEAT:
+        case RE_OP_LAZY_REPEAT:
+        {
+            BOOL limited;
+            RE_STATUS_T body_result;
+            RE_STATUS_T tail_result;
+            RE_RepeatInfo* repeat_info;
+            RE_STATUS_T status;
+
+            /* values[2] with all its bits set is treated as "not limited";
+             * presumably values[2] is the repeat's maximum count - TODO
+             * confirm.
+             */
+            limited = ~node->values[2] != 0;
+            if (limited)
+                body_result = RE_STATUS_LIMITED;
+            else
+                body_result = add_repeat_guards(pattern, node->next_1.node);
+            tail_result = add_repeat_guards(pattern,
+              node->nonstring.next_2.node);
+
+            /* values[0] is used as the index into the repeat info. */
+            repeat_info = &pattern->repeat_info[node->values[0]];
+            if (body_result != RE_STATUS_REF)
+                repeat_info->status |= RE_STATUS_BODY;
+            if (tail_result != RE_STATUS_REF)
+                repeat_info->status |= RE_STATUS_TAIL;
+            if (limited)
+                result = max_status_2(result, RE_STATUS_LIMITED);
+            else
+                result = max_status_2(result, RE_STATUS_REPEAT);
+            status = max_status_3(result, body_result, tail_result);
+            /* NOTE(review): this case ORs into the status whereas the other
+             * cases assign it - confirm that this is intentional.
+             */
+            node->status |= RE_STATUS_VISITED_AG | status;
+            return status;
+        }
+        case RE_OP_GREEDY_REPEAT_ONE:
+        case RE_OP_LAZY_REPEAT_ONE:
+        {
+            BOOL limited;
+            RE_STATUS_T tail_result;
+            RE_RepeatInfo* repeat_info;
+            RE_STATUS_T status;
+
+            limited = ~node->values[2] != 0;
+            tail_result = add_repeat_guards(pattern, node->next_1.node);
+
+            /* The body status is always set for this kind of repeat. */
+            repeat_info = &pattern->repeat_info[node->values[0]];
+            repeat_info->status |= RE_STATUS_BODY;
+            if (tail_result != RE_STATUS_REF)
+                repeat_info->status |= RE_STATUS_TAIL;
+            if (limited)
+                result = max_status_2(result, RE_STATUS_LIMITED);
+            else
+                result = max_status_2(result, RE_STATUS_REPEAT);
+            status = max_status_3(result, RE_STATUS_REPEAT, tail_result);
+            node->status = RE_STATUS_VISITED_AG | status;
+            return status;
+        }
+        case RE_OP_GROUP_CALL:
+        case RE_OP_REF_GROUP:
+        case RE_OP_REF_GROUP_FLD:
+        case RE_OP_REF_GROUP_FLD_REV:
+        case RE_OP_REF_GROUP_IGN:
+        case RE_OP_REF_GROUP_IGN_REV:
+        case RE_OP_REF_GROUP_REV:
+            /* A reference to a group: note it and carry on walking. */
+            result = RE_STATUS_REF;
+            node = node->next_1.node;
+            break;
+        case RE_OP_GROUP_EXISTS:
+        {
+            RE_STATUS_T branch_1_result;
+            RE_STATUS_T branch_2_result;
+            RE_STATUS_T status;
+
+            /* Like a branch, but the test itself counts as a reference. */
+            branch_1_result = add_repeat_guards(pattern, node->next_1.node);
+            branch_2_result = add_repeat_guards(pattern,
+              node->nonstring.next_2.node);
+            status = max_status_4(result, branch_1_result, branch_2_result,
+              RE_STATUS_REF);
+            node->status = RE_STATUS_VISITED_AG | status;
+            return status;
+        }
+        case RE_OP_SUCCESS:
+            node->status = RE_STATUS_VISITED_AG | result;
+            return result;
+        default:
+            node = node->next_1.node;
+            break;
+        }
+    }
+}
+
+/* Adds an index to a node's values unless it's already present.
+ *
+ * 'offset' is the offset of the index count within the values.
+ *
+ * The indexes are stored immediately after the count. Does nothing if 'node'
+ * is NULL. Returns FALSE only if the values couldn't be enlarged.
+ */
+Py_LOCAL_INLINE(BOOL) add_index(RE_Node* node, size_t offset, size_t index) {
+    size_t index_count;
+    size_t first_index;
+    size_t i;
+    RE_CODE* new_values;
+
+    if (!node)
+        return TRUE;
+
+    index_count = node->values[offset];
+    first_index = offset + 1;
+
+    /* Is the index already present? */
+    for (i = 0; i < index_count; i++) {
+        if (node->values[first_index + i] == index)
+            return TRUE;
+    }
+
+    /* Allocate more space for the new index. */
+    new_values = re_realloc(node->values, (node->value_count + 1) *
+      sizeof(RE_CODE));
+    if (!new_values)
+        return FALSE;
+
+    ++node->value_count;
+    node->values = new_values;
+
+    /* Store the index after the existing ones and bump the count.
+     * NOTE(review): this assumes that the index list is the last thing in
+     * the values, so that the new slot is the one just allocated - confirm.
+     */
+    node->values[first_index + node->values[offset]++] = (RE_CODE)index;
+
+    return TRUE;
+}
+
+/* Records the index of every repeat and fuzzy section within atomic
+ * subpatterns and lookarounds.
+ *
+ * The nodes are walked from 'node', and the index of each repeat or fuzzy
+ * section met is added to 'parent_node' via add_index(), which ignores a NULL
+ * parent. A fuzzy section's index is offset by 'repeat_count'. Each node is
+ * marked with RE_STATUS_VISITED_REP so that it's walked only once.
+ *
+ * Returns FALSE if an index couldn't be recorded.
+ */
+Py_LOCAL_INLINE(BOOL) record_subpattern_repeats_and_fuzzy_sections(RE_Node*
+  parent_node, size_t offset, size_t repeat_count, RE_Node* node) {
+    while (node) {
+        if (node->status & RE_STATUS_VISITED_REP)
+            return TRUE;
+
+        node->status |= RE_STATUS_VISITED_REP;
+
+        switch (node->op) {
+        case RE_OP_BRANCH:
+        case RE_OP_GROUP_EXISTS:
+            /* Recurse into the first destination, then continue with the
+             * second.
+             */
+            if (!record_subpattern_repeats_and_fuzzy_sections(parent_node,
+              offset, repeat_count, node->next_1.node))
+                return FALSE;
+            node = node->nonstring.next_2.node;
+            break;
+        case RE_OP_END_FUZZY:
+            node = node->next_1.node;
+            break;
+        case RE_OP_END_GREEDY_REPEAT:
+        case RE_OP_END_LAZY_REPEAT:
+            return TRUE;
+        case RE_OP_FUZZY:
+            /* Record the fuzzy index. */
+            if (!add_index(parent_node, offset, repeat_count +
+              node->values[0]))
+                return FALSE;
+            node = node->next_1.node;
+            break;
+        case RE_OP_GREEDY_REPEAT:
+        case RE_OP_LAZY_REPEAT:
+            /* Record the repeat index. */
+            if (!add_index(parent_node, offset, node->values[0]))
+                return FALSE;
+            if (!record_subpattern_repeats_and_fuzzy_sections(parent_node,
+              offset, repeat_count, node->next_1.node))
+                return FALSE;
+            node = node->nonstring.next_2.node;
+            break;
+        case RE_OP_GREEDY_REPEAT_ONE:
+        case RE_OP_LAZY_REPEAT_ONE:
+            /* Record the repeat index. */
+            if (!add_index(parent_node, offset, node->values[0]))
+                return FALSE;
+            node = node->next_1.node;
+            break;
+        default:
+            node = node->next_1.node;
+            break;
+        }
+    }
+
+    return TRUE;
+}
+
+/* Marks nodes which are being used as used.
+ *
+ * Follows the first destination iteratively and the second destination (only
+ * for nodes without RE_STATUS_STRING) recursively, stopping at a NULL node or
+ * at a node which is already marked.
+ */
+Py_LOCAL_INLINE(void) use_nodes(RE_Node* node) {
+    while (node && !(node->status & RE_STATUS_USED)) {
+        node->status |= RE_STATUS_USED;
+        if (!(node->status & RE_STATUS_STRING)) {
+            if (node->nonstring.next_2.node)
+                use_nodes(node->nonstring.next_2.node);
+        }
+        node = node->next_1.node;
+    }
+}
+
+/* Discards any unused nodes.
+ *
+ * Optimising the nodes might result in some nodes no longer being used.
+ *
+ * The nodes reachable from the start node and from the call_ref info are
+ * marked, then the node list is compacted in place and every unmarked node
+ * is freed.
+ */
+Py_LOCAL_INLINE(void) discard_unused_nodes(PatternObject* pattern) {
+    size_t i;
+    size_t new_count;
+
+    /* Mark the nodes which are being used. */
+    use_nodes(pattern->start_node);
+
+    /* NOTE(review): this iterates up to the capacity, not the count, so it
+     * relies on unused entries having a NULL node - confirm.
+     */
+    for (i = 0; i < pattern->call_ref_info_capacity; i++)
+        use_nodes(pattern->call_ref_info[i].node);
+
+    new_count = 0;
+    for (i = 0; i < pattern->node_count; i++) {
+        RE_Node* node;
+
+        node = pattern->node_list[i];
+        if (node->status & RE_STATUS_USED)
+            pattern->node_list[new_count++] = node;
+        else {
+            re_dealloc(node->values);
+            if (node->status & RE_STATUS_STRING) {
+                re_dealloc(node->string.bad_character_offset);
+                re_dealloc(node->string.good_suffix_offset);
+            }
+            re_dealloc(node);
+        }
+    }
+
+    pattern->node_count = new_count;
+}
+
+/* Marks all the group which are named. Returns FALSE if there's an error.
*/ +Py_LOCAL_INLINE(BOOL) mark_named_groups(PatternObject* pattern) { + size_t i; + + for (i = 0; i < pattern->public_group_count; i++) { + RE_GroupInfo* group_info; + PyObject* index; + int status; + + group_info = &pattern->group_info[i]; + index = Py_BuildValue("n", i + 1); + if (!index) + return FALSE; + + status = PyDict_Contains(pattern->indexgroup, index); + Py_DECREF(index); + if (status < 0) + return FALSE; + + group_info->has_name = status == 1; + } + + return TRUE; +} + +/* Gets the test node. + * + * The test node lets the matcher look ahead in the pattern, allowing it to + * avoid the cost of housekeeping, only to find that what follows doesn't match + * anyway. + */ +Py_LOCAL_INLINE(void) set_test_node(RE_NextNode* next) { + RE_Node* node = next->node; + RE_Node* test; + + next->test = node; + next->match_next = node; + next->match_step = 0; + + if (!node) + return; + + test = node; + while (test->op == RE_OP_END_GROUP || test->op == RE_OP_START_GROUP) + test = test->next_1.node; + + next->test = test; + + if (test != node) + return; + + switch (test->op) { + case RE_OP_ANY: + case RE_OP_ANY_ALL: + case RE_OP_ANY_ALL_REV: + case RE_OP_ANY_REV: + case RE_OP_ANY_U: + case RE_OP_ANY_U_REV: + case RE_OP_BOUNDARY: + case RE_OP_CHARACTER: + case RE_OP_CHARACTER_IGN: + case RE_OP_CHARACTER_IGN_REV: + case RE_OP_CHARACTER_REV: + case RE_OP_DEFAULT_BOUNDARY: + case RE_OP_DEFAULT_END_OF_WORD: + case RE_OP_DEFAULT_START_OF_WORD: + case RE_OP_END_OF_LINE: + case RE_OP_END_OF_LINE_U: + case RE_OP_END_OF_STRING: + case RE_OP_END_OF_STRING_LINE: + case RE_OP_END_OF_STRING_LINE_U: + case RE_OP_END_OF_WORD: + case RE_OP_GRAPHEME_BOUNDARY: + case RE_OP_PROPERTY: + case RE_OP_PROPERTY_IGN: + case RE_OP_PROPERTY_IGN_REV: + case RE_OP_PROPERTY_REV: + case RE_OP_RANGE: + case RE_OP_RANGE_IGN: + case RE_OP_RANGE_IGN_REV: + case RE_OP_RANGE_REV: + case RE_OP_SEARCH_ANCHOR: + case RE_OP_SET_DIFF: + case RE_OP_SET_DIFF_IGN: + case RE_OP_SET_DIFF_IGN_REV: + case 
RE_OP_SET_DIFF_REV: + case RE_OP_SET_INTER: + case RE_OP_SET_INTER_IGN: + case RE_OP_SET_INTER_IGN_REV: + case RE_OP_SET_INTER_REV: + case RE_OP_SET_SYM_DIFF: + case RE_OP_SET_SYM_DIFF_IGN: + case RE_OP_SET_SYM_DIFF_IGN_REV: + case RE_OP_SET_SYM_DIFF_REV: + case RE_OP_SET_UNION: + case RE_OP_SET_UNION_IGN: + case RE_OP_SET_UNION_IGN_REV: + case RE_OP_SET_UNION_REV: + case RE_OP_START_OF_LINE: + case RE_OP_START_OF_LINE_U: + case RE_OP_START_OF_STRING: + case RE_OP_START_OF_WORD: + case RE_OP_STRING: + case RE_OP_STRING_FLD: + case RE_OP_STRING_FLD_REV: + case RE_OP_STRING_IGN: + case RE_OP_STRING_IGN_REV: + case RE_OP_STRING_REV: + next->match_next = test->next_1.node; + next->match_step = test->step; + break; + case RE_OP_GREEDY_REPEAT_ONE: + case RE_OP_LAZY_REPEAT_ONE: + if (test->values[1] > 0) + next->test = test; + break; + } +} + +/* Sets the test nodes. */ +Py_LOCAL_INLINE(void) set_test_nodes(PatternObject* pattern) { + RE_Node** node_list; + size_t i; + + node_list = pattern->node_list; + for (i = 0; i < pattern->node_count; i++) { + RE_Node* node; + + node = node_list[i]; + set_test_node(&node->next_1); + if (!(node->status & RE_STATUS_STRING)) + set_test_node(&node->nonstring.next_2); + } +} + +/* Optimises the pattern. */ +Py_LOCAL_INLINE(BOOL) optimise_pattern(PatternObject* pattern) { + size_t i; + + /* Building the nodes is made simpler by allowing branches to have a single + * exit. These need to be removed. + */ + skip_one_way_branches(pattern); + + /* Add position guards for repeat bodies containing a reference to a group + * or repeat tails followed at some point by a reference to a group. + */ + add_repeat_guards(pattern, pattern->start_node); + + /* Record the index of repeats and fuzzy sections within the body of atomic + * and lookaround nodes. 
+ */ + if (!record_subpattern_repeats_and_fuzzy_sections(NULL, 0, + pattern->repeat_count, pattern->start_node)) + return FALSE; + + for (i = 0; i < pattern->call_ref_info_count; i++) { + RE_Node* node; + + node = pattern->call_ref_info[i].node; + if (!record_subpattern_repeats_and_fuzzy_sections(NULL, 0, + pattern->repeat_count, node)) + return FALSE; + } + + /* Discard any unused nodes. */ + discard_unused_nodes(pattern); + + /* Set the test nodes. */ + set_test_nodes(pattern); + + /* Mark all the group that are named. */ + if (!mark_named_groups(pattern)) + return FALSE; + + return TRUE; +} + +/* Creates a new pattern node. */ +Py_LOCAL_INLINE(RE_Node*) create_node(PatternObject* pattern, RE_UINT8 op, + RE_CODE flags, Py_ssize_t step, size_t value_count) { + RE_Node* node; + + node = (RE_Node*)re_alloc(sizeof(*node)); + if (!node) + return NULL; + memset(node, 0, sizeof(RE_Node)); + + node->value_count = value_count; + if (node->value_count > 0) { + node->values = (RE_CODE*)re_alloc(node->value_count * sizeof(RE_CODE)); + if (!node->values) + goto error; + } else + node->values = NULL; + + node->op = op; + node->match = (flags & RE_POSITIVE_OP) != 0; + node->status = (RE_STATUS_T)(flags << RE_STATUS_SHIFT); + node->step = step; + + /* Ensure that there's enough storage to record the new node. */ + if (pattern->node_count >= pattern->node_capacity) { + RE_Node** new_node_list; + + pattern->node_capacity *= 2; + if (pattern->node_capacity == 0) + pattern->node_capacity = RE_INIT_NODE_LIST_SIZE; + new_node_list = (RE_Node**)re_realloc(pattern->node_list, + pattern->node_capacity * sizeof(RE_Node*)); + if (!new_node_list) + goto error; + pattern->node_list = new_node_list; + } + + /* Record the new node. */ + pattern->node_list[pattern->node_count++] = node; + + return node; + +error: + re_dealloc(node->values); + re_dealloc(node); + return NULL; +} + +/* Adds a node as a next node for another node. 
*/ +Py_LOCAL_INLINE(void) add_node(RE_Node* node_1, RE_Node* node_2) { + if (!node_1->next_1.node) + node_1->next_1.node = node_2; + else + node_1->nonstring.next_2.node = node_2; +} + +/* Ensures that the entry for a group's details actually exists. */ +Py_LOCAL_INLINE(BOOL) ensure_group(PatternObject* pattern, size_t group) { + size_t old_capacity; + size_t new_capacity; + RE_GroupInfo* new_group_info; + + if (group <= pattern->true_group_count) + /* We already have an entry for the group. */ + return TRUE; + + /* Increase the storage capacity to include the new entry if it's + * insufficient. + */ + old_capacity = pattern->group_info_capacity; + new_capacity = pattern->group_info_capacity; + while (group > new_capacity) + new_capacity += RE_LIST_SIZE_INC; + + if (new_capacity > old_capacity) { + new_group_info = (RE_GroupInfo*)re_realloc(pattern->group_info, + new_capacity * sizeof(RE_GroupInfo)); + if (!new_group_info) + return FALSE; + memset(new_group_info + old_capacity, 0, (new_capacity - old_capacity) + * sizeof(RE_GroupInfo)); + + pattern->group_info = new_group_info; + pattern->group_info_capacity = new_capacity; + } + + pattern->true_group_count = group; + + return TRUE; +} + +/* Records that there's a reference to a group. */ +Py_LOCAL_INLINE(BOOL) record_ref_group(PatternObject* pattern, size_t group) { + if (!ensure_group(pattern, group)) + return FALSE; + + pattern->group_info[group - 1].referenced = TRUE; + + return TRUE; +} + +/* Records that there's a new group. */ +Py_LOCAL_INLINE(BOOL) record_group(PatternObject* pattern, size_t group, + RE_Node* node) { + if (!ensure_group(pattern, group)) + return FALSE; + + if (group >= 1) { + RE_GroupInfo* info; + + info = &pattern->group_info[group - 1]; + info->end_index = (Py_ssize_t)pattern->true_group_count; + info->node = node; + } + + return TRUE; +} + +/* Records that a group has closed. 
*/ +Py_LOCAL_INLINE(void) record_group_end(PatternObject* pattern, size_t group) { + if (group >= 1) + pattern->group_info[group - 1].end_index = ++pattern->group_end_index; +} + +/* Ensures that the entry for a call_ref's details actually exists. */ +Py_LOCAL_INLINE(BOOL) ensure_call_ref(PatternObject* pattern, size_t call_ref) + { + size_t old_capacity; + size_t new_capacity; + RE_CallRefInfo* new_call_ref_info; + + if (call_ref < pattern->call_ref_info_count) + /* We already have an entry for the call_ref. */ + return TRUE; + + /* Increase the storage capacity to include the new entry if it's + * insufficient. + */ + old_capacity = pattern->call_ref_info_capacity; + new_capacity = pattern->call_ref_info_capacity; + while (call_ref >= new_capacity) + new_capacity += RE_LIST_SIZE_INC; + + if (new_capacity > old_capacity) { + new_call_ref_info = (RE_CallRefInfo*)re_realloc(pattern->call_ref_info, + new_capacity * sizeof(RE_CallRefInfo)); + if (!new_call_ref_info) + return FALSE; + memset(new_call_ref_info + old_capacity, 0, (new_capacity - + old_capacity) * sizeof(RE_CallRefInfo)); + + pattern->call_ref_info = new_call_ref_info; + pattern->call_ref_info_capacity = new_capacity; + } + + pattern->call_ref_info_count = 1 + call_ref; + + return TRUE; +} + +/* Records that a call_ref is defined. */ +Py_LOCAL_INLINE(BOOL) record_call_ref_defined(PatternObject* pattern, size_t + call_ref, RE_Node* node) { + if (!ensure_call_ref(pattern, call_ref)) + return FALSE; + + pattern->call_ref_info[call_ref].defined = TRUE; + pattern->call_ref_info[call_ref].node = node; + + return TRUE; +} + +/* Records that a call_ref is used. */ +Py_LOCAL_INLINE(BOOL) record_call_ref_used(PatternObject* pattern, size_t + call_ref) { + if (!ensure_call_ref(pattern, call_ref)) + return FALSE; + + pattern->call_ref_info[call_ref].used = TRUE; + + return TRUE; +} + +/* Checks whether a node matches one and only one character. 
*/ +Py_LOCAL_INLINE(BOOL) sequence_matches_one(RE_Node* node) { + while (node->op == RE_OP_BRANCH && !node->nonstring.next_2.node) + node = node->next_1.node; + + if (node->next_1.node || (node->status & RE_STATUS_FUZZY)) + return FALSE; + + return node_matches_one_character(node); +} + +/* Records a repeat. */ +Py_LOCAL_INLINE(BOOL) record_repeat(PatternObject* pattern, size_t index, + size_t repeat_depth) { + size_t old_capacity; + size_t new_capacity; + + /* Increase the storage capacity to include the new entry if it's + * insufficient. + */ + old_capacity = pattern->repeat_info_capacity; + new_capacity = pattern->repeat_info_capacity; + while (index >= new_capacity) + new_capacity += RE_LIST_SIZE_INC; + + if (new_capacity > old_capacity) { + RE_RepeatInfo* new_repeat_info; + + new_repeat_info = (RE_RepeatInfo*)re_realloc(pattern->repeat_info, + new_capacity * sizeof(RE_RepeatInfo)); + if (!new_repeat_info) + return FALSE; + memset(new_repeat_info + old_capacity, 0, (new_capacity - old_capacity) + * sizeof(RE_RepeatInfo)); + + pattern->repeat_info = new_repeat_info; + pattern->repeat_info_capacity = new_capacity; + } + + if (index >= pattern->repeat_count) + pattern->repeat_count = index + 1; + + if (repeat_depth > 0) + pattern->repeat_info[index].status |= RE_STATUS_INNER; + + return TRUE; +} + +Py_LOCAL_INLINE(Py_ssize_t) get_step(RE_CODE op) { + switch (op) { + case RE_OP_ANY: + case RE_OP_ANY_ALL: + case RE_OP_ANY_U: + case RE_OP_CHARACTER: + case RE_OP_CHARACTER_IGN: + case RE_OP_PROPERTY: + case RE_OP_PROPERTY_IGN: + case RE_OP_RANGE: + case RE_OP_RANGE_IGN: + case RE_OP_SET_DIFF: + case RE_OP_SET_DIFF_IGN: + case RE_OP_SET_INTER: + case RE_OP_SET_INTER_IGN: + case RE_OP_SET_SYM_DIFF: + case RE_OP_SET_SYM_DIFF_IGN: + case RE_OP_SET_UNION: + case RE_OP_SET_UNION_IGN: + case RE_OP_STRING: + case RE_OP_STRING_FLD: + case RE_OP_STRING_IGN: + return 1; + case RE_OP_ANY_ALL_REV: + case RE_OP_ANY_REV: + case RE_OP_ANY_U_REV: + case RE_OP_CHARACTER_IGN_REV: + 
case RE_OP_CHARACTER_REV: + case RE_OP_PROPERTY_IGN_REV: + case RE_OP_PROPERTY_REV: + case RE_OP_RANGE_IGN_REV: + case RE_OP_RANGE_REV: + case RE_OP_SET_DIFF_IGN_REV: + case RE_OP_SET_DIFF_REV: + case RE_OP_SET_INTER_IGN_REV: + case RE_OP_SET_INTER_REV: + case RE_OP_SET_SYM_DIFF_IGN_REV: + case RE_OP_SET_SYM_DIFF_REV: + case RE_OP_SET_UNION_IGN_REV: + case RE_OP_SET_UNION_REV: + case RE_OP_STRING_FLD_REV: + case RE_OP_STRING_IGN_REV: + case RE_OP_STRING_REV: + return -1; + } + + return 0; +} + +Py_LOCAL_INLINE(int) build_sequence(RE_CompileArgs* args); + +/* Builds an ANY node. */ +Py_LOCAL_INLINE(int) build_ANY(RE_CompileArgs* args) { + RE_UINT8 op; + RE_CODE flags; + Py_ssize_t step; + RE_Node* node; + + /* codes: opcode, flags. */ + if (args->code + 1 > args->end_code) + return RE_ERROR_ILLEGAL; + + op = (RE_UINT8)args->code[0]; + flags = args->code[1]; + + step = get_step(op); + + /* Create the node. */ + node = create_node(args->pattern, op, flags, step, 0); + if (!node) + return RE_ERROR_MEMORY; + + args->code += 2; + + /* Append the node. */ + add_node(args->end, node); + args->end = node; + + ++args->min_width; + + return RE_ERROR_SUCCESS; +} + +/* Builds a FUZZY node. */ +Py_LOCAL_INLINE(int) build_FUZZY(RE_CompileArgs* args) { + RE_CODE flags; + RE_Node* start_node; + RE_Node* end_node; + RE_CODE index; + RE_CompileArgs subargs; + int status; + + /* codes: opcode, flags, constraints, sequence, end. */ + if (args->code + 13 > args->end_code) + return RE_ERROR_ILLEGAL; + + flags = args->code[1]; + + /* Create nodes for the start and end of the fuzzy sequence. */ + start_node = create_node(args->pattern, RE_OP_FUZZY, flags, 0, 9); + end_node = create_node(args->pattern, RE_OP_END_FUZZY, flags, 0, 5); + if (!start_node || !end_node) + return RE_ERROR_MEMORY; + + index = (RE_CODE)args->pattern->fuzzy_count++; + start_node->values[0] = index; + end_node->values[0] = index; + + /* The constraints consist of 4 pairs of limits and the cost equation. 
*/ + end_node->values[RE_FUZZY_VAL_MIN_DEL] = args->code[2]; /* Deletion minimum. */ + end_node->values[RE_FUZZY_VAL_MIN_INS] = args->code[4]; /* Insertion minimum. */ + end_node->values[RE_FUZZY_VAL_MIN_SUB] = args->code[6]; /* Substitution minimum. */ + end_node->values[RE_FUZZY_VAL_MIN_ERR] = args->code[8]; /* Error minimum. */ + + start_node->values[RE_FUZZY_VAL_MAX_DEL] = args->code[3]; /* Deletion maximum. */ + start_node->values[RE_FUZZY_VAL_MAX_INS] = args->code[5]; /* Insertion maximum. */ + start_node->values[RE_FUZZY_VAL_MAX_SUB] = args->code[7]; /* Substitution maximum. */ + start_node->values[RE_FUZZY_VAL_MAX_ERR] = args->code[9]; /* Error maximum. */ + + start_node->values[RE_FUZZY_VAL_DEL_COST] = args->code[10]; /* Deletion cost. */ + start_node->values[RE_FUZZY_VAL_INS_COST] = args->code[11]; /* Insertion cost. */ + start_node->values[RE_FUZZY_VAL_SUB_COST] = args->code[12]; /* Substitution cost. */ + start_node->values[RE_FUZZY_VAL_MAX_COST] = args->code[13]; /* Total cost. */ + + args->code += 14; + + subargs = *args; + subargs.within_fuzzy = TRUE; + + /* Compile the sequence and check that we've reached the end of the + * subpattern. + */ + status = build_sequence(&subargs); + if (status != RE_ERROR_SUCCESS) + return status; + + if (subargs.code[0] != RE_OP_END) + return RE_ERROR_ILLEGAL; + + args->code = subargs.code; + args->min_width += subargs.min_width; + args->has_captures |= subargs.has_captures; + args->is_fuzzy = TRUE; + args->has_groups |= subargs.has_groups; + args->has_repeats |= subargs.has_repeats; + + ++args->code; + + /* Append the fuzzy sequence. */ + add_node(args->end, start_node); + add_node(start_node, subargs.start); + add_node(subargs.end, end_node); + args->end = end_node; + + return RE_ERROR_SUCCESS; +} + +/* Builds an ATOMIC node. */ +Py_LOCAL_INLINE(int) build_ATOMIC(RE_CompileArgs* args) { + RE_Node* atomic_node; + RE_CompileArgs subargs; + int status; + RE_Node* end_node; + + /* codes: opcode, sequence, end. 
*/ + if (args->code + 1 > args->end_code) + return RE_ERROR_ILLEGAL; + + atomic_node = create_node(args->pattern, RE_OP_ATOMIC, 0, 0, 0); + if (!atomic_node) + return RE_ERROR_MEMORY; + + ++args->code; + + /* Compile the sequence and check that we've reached the end of it. */ + subargs = *args; + + status = build_sequence(&subargs); + if (status != RE_ERROR_SUCCESS) + return status; + + if (subargs.code[0] != RE_OP_END) + return RE_ERROR_ILLEGAL; + + args->code = subargs.code; + ++args->code; + + /* Check the subpattern. */ + args->min_width += subargs.min_width; + args->has_captures |= subargs.has_captures; + args->is_fuzzy |= subargs.is_fuzzy; + args->has_groups |= subargs.has_groups; + args->has_repeats |= subargs.has_repeats; + + if (subargs.has_groups) + atomic_node->status |= RE_STATUS_HAS_GROUPS; + + if (subargs.has_repeats) + atomic_node->status |= RE_STATUS_HAS_REPEATS; + + /* Create the node to terminate the subpattern. */ + end_node = create_node(subargs.pattern, RE_OP_END_ATOMIC, 0, 0, 0); + if (!end_node) + return RE_ERROR_MEMORY; + + /* Append the new sequence. */ + add_node(args->end, atomic_node); + add_node(atomic_node, subargs.start); + add_node(subargs.end, end_node); + args->end = end_node; + + return RE_ERROR_SUCCESS; +} + +/* Builds a BOUNDARY node. */ +Py_LOCAL_INLINE(int) build_BOUNDARY(RE_CompileArgs* args) { + RE_UINT8 op; + RE_CODE flags; + RE_Node* node; + + /* codes: opcode, flags. */ + if (args->code + 1 > args->end_code) + return RE_ERROR_ILLEGAL; + + op = (RE_UINT8)args->code[0]; + flags = args->code[1]; + + args->code += 2; + + /* Create the node. */ + node = create_node(args->pattern, op, flags, 0, 0); + if (!node) + return RE_ERROR_MEMORY; + + /* Append the node. */ + add_node(args->end, node); + args->end = node; + + return RE_ERROR_SUCCESS; +} + +/* Builds a BRANCH node. 
*/ +Py_LOCAL_INLINE(int) build_BRANCH(RE_CompileArgs* args) { + RE_Node* branch_node; + RE_Node* join_node; + Py_ssize_t min_width; + RE_CompileArgs subargs; + int status; + + /* codes: opcode, branch, next, branch, end. */ + if (args->code + 2 > args->end_code) + return RE_ERROR_ILLEGAL; + + /* Create nodes for the start and end of the branch sequence. */ + branch_node = create_node(args->pattern, RE_OP_BRANCH, 0, 0, 0); + join_node = create_node(args->pattern, RE_OP_BRANCH, 0, 0, 0); + if (!branch_node || !join_node) + return RE_ERROR_MEMORY; + + /* Append the node. */ + add_node(args->end, branch_node); + args->end = join_node; + + min_width = PY_SSIZE_T_MAX; + + subargs = *args; + + /* A branch in the regular expression is compiled into a series of 2-way + * branches. + */ + do { + RE_Node* next_branch_node; + + /* Skip over the 'BRANCH' or 'NEXT' opcode. */ + ++subargs.code; + + /* Compile the sequence until the next 'BRANCH' or 'NEXT' opcode. */ + status = build_sequence(&subargs); + if (status != RE_ERROR_SUCCESS) + return status; + + min_width = min_ssize_t(min_width, subargs.min_width); + + args->has_captures |= subargs.has_captures; + args->is_fuzzy |= subargs.is_fuzzy; + args->has_groups |= subargs.has_groups; + args->has_repeats |= subargs.has_repeats; + + /* Append the sequence. */ + add_node(branch_node, subargs.start); + add_node(subargs.end, join_node); + + /* Create a start node for the next sequence and append it. */ + next_branch_node = create_node(subargs.pattern, RE_OP_BRANCH, 0, 0, 0); + if (!next_branch_node) + return RE_ERROR_MEMORY; + + add_node(branch_node, next_branch_node); + branch_node = next_branch_node; + } while (subargs.code < subargs.end_code && subargs.code[0] == RE_OP_NEXT); + + /* We should have reached the end of the branch. 
*/ + if (subargs.code[0] != RE_OP_END) + return RE_ERROR_ILLEGAL; + + args->code = subargs.code; + + ++args->code; + args->min_width += min_width; + + return RE_ERROR_SUCCESS; +} + +/* Builds a CALL_REF node. */ +Py_LOCAL_INLINE(int) build_CALL_REF(RE_CompileArgs* args) { + RE_CODE call_ref; + RE_Node* start_node; + RE_Node* end_node; + RE_CompileArgs subargs; + int status; + + /* codes: opcode, call_ref. */ + if (args->code + 1 > args->end_code) + return RE_ERROR_ILLEGAL; + + call_ref = args->code[1]; + + args->code += 2; + + /* Create nodes for the start and end of the subpattern. */ + start_node = create_node(args->pattern, RE_OP_CALL_REF, 0, 0, 1); + end_node = create_node(args->pattern, RE_OP_GROUP_RETURN, 0, 0, 0); + if (!start_node || !end_node) + return RE_ERROR_MEMORY; + + start_node->values[0] = call_ref; + + /* Compile the sequence and check that we've reached the end of the + * subpattern. + */ + subargs = *args; + status = build_sequence(&subargs); + if (status != RE_ERROR_SUCCESS) + return status; + + if (subargs.code[0] != RE_OP_END) + return RE_ERROR_ILLEGAL; + + args->code = subargs.code; + args->min_width += subargs.min_width; + args->has_captures |= subargs.has_captures; + args->is_fuzzy |= subargs.is_fuzzy; + args->has_groups |= subargs.has_groups; + args->has_repeats |= subargs.has_repeats; + + ++args->code; + + /* Record that we defined a call_ref. */ + if (!record_call_ref_defined(args->pattern, call_ref, start_node)) + return RE_ERROR_MEMORY; + + /* Append the node. */ + add_node(args->end, start_node); + add_node(start_node, subargs.start); + add_node(subargs.end, end_node); + args->end = end_node; + + return RE_ERROR_SUCCESS; +} + +/* Builds a CHARACTER or PROPERTY node. */ +Py_LOCAL_INLINE(int) build_CHARACTER_or_PROPERTY(RE_CompileArgs* args) { + RE_UINT8 op; + RE_CODE flags; + Py_ssize_t step; + RE_Node* node; + + /* codes: opcode, flags, value. 
*/ + if (args->code + 2 > args->end_code) + return RE_ERROR_ILLEGAL; + + op = (RE_UINT8)args->code[0]; + flags = args->code[1]; + + step = get_step(op); + + if (flags & RE_ZEROWIDTH_OP) + step = 0; + + /* Create the node. */ + node = create_node(args->pattern, op, flags, step, 1); + if (!node) + return RE_ERROR_MEMORY; + + node->values[0] = args->code[2]; + + args->code += 3; + + /* Append the node. */ + add_node(args->end, node); + args->end = node; + + if (step != 0) + ++args->min_width; + + return RE_ERROR_SUCCESS; +} + +/* Builds a CONDITIONAL node. */ +Py_LOCAL_INLINE(int) build_CONDITIONAL(RE_CompileArgs* args) { + RE_CODE flags; + BOOL forward; + RE_Node* test_node; + RE_CompileArgs subargs; + int status; + RE_Node* end_test_node; + RE_Node* end_node; + Py_ssize_t min_width; + + /* codes: opcode, flags, forward, sequence, next, sequence, next, sequence, + * end. + */ + if (args->code + 4 > args->end_code) + return RE_ERROR_ILLEGAL; + + flags = args->code[1]; + forward = (BOOL)args->code[2]; + + /* Create a node for the lookaround. */ + test_node = create_node(args->pattern, RE_OP_CONDITIONAL, flags, 0, 0); + if (!test_node) + return RE_ERROR_MEMORY; + + args->code += 3; + + add_node(args->end, test_node); + + /* Compile the lookaround test and check that we've reached the end of the + * subpattern. + */ + subargs = *args; + subargs.forward = forward; + status = build_sequence(&subargs); + if (status != RE_ERROR_SUCCESS) + return status; + + if (subargs.code[0] != RE_OP_NEXT) + return RE_ERROR_ILLEGAL; + + args->code = subargs.code; + ++args->code; + + /* Check the lookaround subpattern. */ + args->has_captures |= subargs.has_captures; + args->is_fuzzy |= subargs.is_fuzzy; + args->has_groups |= subargs.has_groups; + args->has_repeats |= subargs.has_repeats; + + if (subargs.has_groups) + test_node->status |= RE_STATUS_HAS_GROUPS; + + if (subargs.has_repeats) + test_node->status |= RE_STATUS_HAS_REPEATS; + + /* Create the node to terminate the test. 
*/ + end_test_node = create_node(args->pattern, RE_OP_END_CONDITIONAL, 0, 0, 0); + if (!end_test_node) + return RE_ERROR_MEMORY; + + /* test node -> test -> end test node */ + add_node(test_node, subargs.start); + add_node(subargs.end, end_test_node); + + /* Compile the true branch. */ + subargs = *args; + status = build_sequence(&subargs); + if (status != RE_ERROR_SUCCESS) + return status; + + /* Check the true branch. */ + args->code = subargs.code; + args->has_captures |= subargs.has_captures; + args->is_fuzzy |= subargs.is_fuzzy; + args->has_groups |= subargs.has_groups; + args->has_repeats |= subargs.has_repeats; + + min_width = subargs.min_width; + + /* Create the terminating node. */ + end_node = create_node(args->pattern, RE_OP_BRANCH, 0, 0, 0); + if (!end_node) + return RE_ERROR_MEMORY; + + /* end test node -> true branch -> end node */ + add_node(end_test_node, subargs.start); + add_node(subargs.end, end_node); + + if (args->code[0] == RE_OP_NEXT) { + /* There's a false branch. */ + ++args->code; + + /* Compile the false branch. */ + subargs.code = args->code; + status = build_sequence(&subargs); + if (status != RE_ERROR_SUCCESS) + return status; + + /* Check the false branch. */ + args->code = subargs.code; + args->has_captures |= subargs.has_captures; + args->is_fuzzy |= subargs.is_fuzzy; + args->has_groups |= subargs.has_groups; + args->has_repeats |= subargs.has_repeats; + + min_width = min_ssize_t(min_width, subargs.min_width); + + /* test node -> false branch -> end node */ + add_node(test_node, subargs.start); + add_node(subargs.end, end_node); + } else + /* end test node -> end node */ + add_node(end_test_node, end_node); + + if (args->code[0] != RE_OP_END) + return RE_ERROR_ILLEGAL; + + args->min_width += min_width; + + ++args->code; + + args->end = end_node; + + return RE_ERROR_SUCCESS; +} + +/* Builds a GROUP node. 
*/ +Py_LOCAL_INLINE(int) build_GROUP(RE_CompileArgs* args) { + RE_CODE private_group; + RE_CODE public_group; + RE_Node* start_node; + RE_Node* end_node; + RE_CompileArgs subargs; + int status; + + /* codes: opcode, private_group, public_group. */ + if (args->code + 2 > args->end_code) + return RE_ERROR_ILLEGAL; + + private_group = args->code[1]; + public_group = args->code[2]; + + args->code += 3; + + /* Create nodes for the start and end of the capture group. */ + start_node = create_node(args->pattern, args->forward ? RE_OP_START_GROUP : + RE_OP_END_GROUP, 0, 0, 3); + end_node = create_node(args->pattern, args->forward ? RE_OP_END_GROUP : + RE_OP_START_GROUP, 0, 0, 3); + if (!start_node || !end_node) + return RE_ERROR_MEMORY; + + start_node->values[0] = private_group; + end_node->values[0] = private_group; + start_node->values[1] = public_group; + end_node->values[1] = public_group; + + /* Signal that the capture should be saved when it's complete. */ + start_node->values[2] = 0; + end_node->values[2] = 1; + + /* Record that we have a new capture group. */ + if (!record_group(args->pattern, private_group, start_node)) + return RE_ERROR_MEMORY; + + /* Compile the sequence and check that we've reached the end of the capture + * group. + */ + subargs = *args; + status = build_sequence(&subargs); + if (status != RE_ERROR_SUCCESS) + return status; + + if (subargs.code[0] != RE_OP_END) + return RE_ERROR_ILLEGAL; + + args->code = subargs.code; + args->min_width += subargs.min_width; + args->has_captures |= subargs.has_captures | subargs.visible_captures; + args->is_fuzzy |= subargs.is_fuzzy; + args->has_groups |= TRUE; + args->has_repeats |= subargs.has_repeats; + + ++args->code; + + /* Record that the capture group has closed. */ + record_group_end(args->pattern, private_group); + + /* Append the capture group. 
*/ + add_node(args->end, start_node); + add_node(start_node, subargs.start); + add_node(subargs.end, end_node); + args->end = end_node; + + return RE_ERROR_SUCCESS; +} + +/* Builds a GROUP_CALL node. */ +Py_LOCAL_INLINE(int) build_GROUP_CALL(RE_CompileArgs* args) { + RE_CODE call_ref; + RE_Node* node; + + /* codes: opcode, call_ref. */ + if (args->code + 1 > args->end_code) + return RE_ERROR_ILLEGAL; + + call_ref = args->code[1]; + + /* Create the node. */ + node = create_node(args->pattern, RE_OP_GROUP_CALL, 0, 0, 1); + if (!node) + return RE_ERROR_MEMORY; + + node->values[0] = call_ref; + + node->status |= RE_STATUS_HAS_GROUPS; + node->status |= RE_STATUS_HAS_REPEATS; + + args->code += 2; + + /* Record that we used a call_ref. */ + if (!record_call_ref_used(args->pattern, call_ref)) + return RE_ERROR_MEMORY; + + /* Append the node. */ + add_node(args->end, node); + args->end = node; + + return RE_ERROR_SUCCESS; +} + +/* Builds a GROUP_EXISTS node. */ +Py_LOCAL_INLINE(int) build_GROUP_EXISTS(RE_CompileArgs* args) { + RE_CODE group; + RE_Node* start_node; + RE_Node* end_node; + RE_CompileArgs subargs; + int status; + Py_ssize_t min_width; + + /* codes: opcode, sequence, next, sequence, end. */ + if (args->code + 2 > args->end_code) + return RE_ERROR_ILLEGAL; + + group = args->code[1]; + + args->code += 2; + + /* Record that we have a reference to a group. If group is 0, then we have + * a DEFINE and not a true group. + */ + if (group > 0 && !record_ref_group(args->pattern, group)) + return RE_ERROR_MEMORY; + + /* Create nodes for the start and end of the structure. 
*/ + start_node = create_node(args->pattern, RE_OP_GROUP_EXISTS, 0, 0, 1); + end_node = create_node(args->pattern, RE_OP_BRANCH, 0, 0, 0); + if (!start_node || !end_node) + return RE_ERROR_MEMORY; + + start_node->values[0] = group; + + subargs = *args; + status = build_sequence(&subargs); + if (status != RE_ERROR_SUCCESS) + return status; + + args->code = subargs.code; + args->has_captures |= subargs.has_captures; + args->is_fuzzy |= subargs.is_fuzzy; + args->has_groups |= subargs.has_groups; + args->has_repeats |= subargs.has_repeats; + + min_width = subargs.min_width; + + /* Append the start node. */ + add_node(args->end, start_node); + add_node(start_node, subargs.start); + + if (args->code[0] == RE_OP_NEXT) { + RE_Node* true_branch_end; + + ++args->code; + + true_branch_end = subargs.end; + + subargs.code = args->code; + + status = build_sequence(&subargs); + if (status != RE_ERROR_SUCCESS) + return status; + + args->code = subargs.code; + args->has_captures |= subargs.has_captures; + args->is_fuzzy |= subargs.is_fuzzy; + + if (group == 0) { + /* Join the 2 branches end-to-end and bypass it. The sequence + * itself will never be matched as a whole, so it doesn't matter. + */ + min_width = 0; + + add_node(start_node, end_node); + add_node(true_branch_end, subargs.start); + } else { + args->has_groups |= subargs.has_groups; + args->has_repeats |= subargs.has_repeats; + + min_width = min_ssize_t(min_width, subargs.min_width); + + add_node(start_node, subargs.start); + add_node(true_branch_end, end_node); + } + + add_node(subargs.end, end_node); + } else { + add_node(start_node, end_node); + add_node(subargs.end, end_node); + + min_width = 0; + } + + args->min_width += min_width; + + if (args->code[0] != RE_OP_END) + return RE_ERROR_ILLEGAL; + + ++args->code; + + args->end = end_node; + + return RE_ERROR_SUCCESS; +} + +/* Builds a LOOKAROUND node. 
*/
+Py_LOCAL_INLINE(int) build_LOOKAROUND(RE_CompileArgs* args) {
+    RE_CODE flags;
+    BOOL forward;
+    RE_Node* lookaround_node;
+    RE_CompileArgs subargs;
+    int status;
+    RE_Node* end_node;
+    RE_Node* next_node;
+
+    /* codes: opcode, flags, forward, sequence, end.
+     *
+     * Returns RE_ERROR_SUCCESS, RE_ERROR_ILLEGAL if the codes are malformed
+     * or RE_ERROR_MEMORY if a node couldn't be created. The lookaround
+     * doesn't add anything to args->min_width.
+     */
+    if (args->code + 3 > args->end_code)
+        return RE_ERROR_ILLEGAL;
+
+    flags = args->code[1];
+    forward = (BOOL)args->code[2];
+
+    /* Create a node for the lookaround. */
+    lookaround_node = create_node(args->pattern, RE_OP_LOOKAROUND, flags, 0,
+      0);
+    if (!lookaround_node)
+        return RE_ERROR_MEMORY;
+
+    args->code += 3;
+
+    /* Compile the sequence and check that we've reached the end of the
+     * subpattern. The subpattern is compiled in the direction given by the
+     * 'forward' code, which may differ from that of the enclosing pattern.
+     */
+    subargs = *args;
+    subargs.forward = forward;
+    status = build_sequence(&subargs);
+    if (status != RE_ERROR_SUCCESS)
+        return status;
+
+    if (subargs.code[0] != RE_OP_END)
+        return RE_ERROR_ILLEGAL;
+
+    args->code = subargs.code;
+    ++args->code;
+
+    /* Check the subpattern. The lookaround node is flagged if the subpattern
+     * contains groups or repeats.
+     */
+    args->has_captures |= subargs.has_captures;
+    args->is_fuzzy |= subargs.is_fuzzy;
+    args->has_groups |= subargs.has_groups;
+    args->has_repeats |= subargs.has_repeats;
+
+    if (subargs.has_groups)
+        lookaround_node->status |= RE_STATUS_HAS_GROUPS;
+
+    if (subargs.has_repeats)
+        lookaround_node->status |= RE_STATUS_HAS_REPEATS;
+
+    /* Create the node to terminate the subpattern. */
+    end_node = create_node(args->pattern, RE_OP_END_LOOKAROUND, 0, 0, 0);
+    if (!end_node)
+        return RE_ERROR_MEMORY;
+
+    /* Make a continuation node. */
+    next_node = create_node(args->pattern, RE_OP_BRANCH, 0, 0, 0);
+    if (!next_node)
+        return RE_ERROR_MEMORY;
+
+    /* Append the new sequence. The order of the add_node calls matters: the
+     * lookaround node's first link goes to the subpattern and its second
+     * link to the continuation node, which the end node also leads to.
+     */
+    add_node(args->end, lookaround_node);
+    add_node(lookaround_node, subargs.start);
+    add_node(lookaround_node, next_node);
+    add_node(subargs.end, end_node);
+    add_node(end_node, next_node);
+
+    args->end = next_node;
+
+    return RE_ERROR_SUCCESS;
+}
+
+/* Builds a RANGE node.
*/ +Py_LOCAL_INLINE(int) build_RANGE(RE_CompileArgs* args) { + RE_UINT8 op; + RE_CODE flags; + Py_ssize_t step; + RE_Node* node; + + /* codes: opcode, flags, lower, upper. */ + if (args->code + 3 > args->end_code) + return RE_ERROR_ILLEGAL; + + op = (RE_UINT8)args->code[0]; + flags = args->code[1]; + + step = get_step(op); + + if (flags & RE_ZEROWIDTH_OP) + step = 0; + + /* Create the node. */ + node = create_node(args->pattern, op, flags, step, 2); + if (!node) + return RE_ERROR_MEMORY; + + node->values[0] = args->code[2]; + node->values[1] = args->code[3]; + + args->code += 4; + + /* Append the node. */ + add_node(args->end, node); + args->end = node; + + if (step != 0) + ++args->min_width; + + return RE_ERROR_SUCCESS; +} + +/* Builds a REF_GROUP node. */ +Py_LOCAL_INLINE(int) build_REF_GROUP(RE_CompileArgs* args) { + RE_CODE flags; + RE_CODE group; + RE_Node* node; + + /* codes: opcode, flags, group. */ + if (args->code + 2 > args->end_code) + return RE_ERROR_ILLEGAL; + + flags = args->code[1]; + group = args->code[2]; + node = create_node(args->pattern, (RE_UINT8)args->code[0], flags, 0, 1); + if (!node) + return RE_ERROR_MEMORY; + + node->values[0] = group; + + args->code += 3; + + /* Record that we have a reference to a group. */ + if (!record_ref_group(args->pattern, group)) + return RE_ERROR_MEMORY; + + /* Append the reference. */ + add_node(args->end, node); + args->end = node; + + return RE_ERROR_SUCCESS; +} + +/* Builds a REPEAT node. */ +Py_LOCAL_INLINE(int) build_REPEAT(RE_CompileArgs* args) { + BOOL greedy; + RE_CODE min_count; + RE_CODE max_count; + int status; + + /* codes: opcode, min_count, max_count, sequence, end. */ + if (args->code + 3 > args->end_code) + return RE_ERROR_ILLEGAL; + + greedy = args->code[0] == RE_OP_GREEDY_REPEAT; + min_count = args->code[1]; + max_count = args->code[2]; + if (args->code[1] > args->code[2]) + return RE_ERROR_ILLEGAL; + + args->code += 3; + + if (min_count == 1 && max_count == 1) { + /* Singly-repeated sequence. 
*/ + RE_CompileArgs subargs; + + subargs = *args; + status = build_sequence(&subargs); + if (status != RE_ERROR_SUCCESS) + return status; + + if (subargs.code[0] != RE_OP_END) + return RE_ERROR_ILLEGAL; + + args->code = subargs.code; + args->min_width += subargs.min_width; + args->has_captures |= subargs.has_captures; + args->is_fuzzy |= subargs.is_fuzzy; + args->has_groups |= subargs.has_groups; + args->has_repeats |= subargs.has_repeats; + + ++args->code; + + /* Append the sequence. */ + add_node(args->end, subargs.start); + args->end = subargs.end; + } else { + size_t index; + RE_Node* repeat_node; + RE_CompileArgs subargs; + + index = args->pattern->repeat_count; + + /* Create the nodes for the repeat. */ + repeat_node = create_node(args->pattern, greedy ? RE_OP_GREEDY_REPEAT : + RE_OP_LAZY_REPEAT, 0, args->forward ? 1 : -1, 4); + if (!repeat_node || !record_repeat(args->pattern, index, + args->repeat_depth)) + return RE_ERROR_MEMORY; + + repeat_node->values[0] = (RE_CODE)index; + repeat_node->values[1] = min_count; + repeat_node->values[2] = max_count; + repeat_node->values[3] = args->forward; + + if (args->within_fuzzy) + args->pattern->repeat_info[index].status |= RE_STATUS_BODY; + + /* Compile the 'body' and check that we've reached the end of it. */ + subargs = *args; + subargs.visible_captures = TRUE; + ++subargs.repeat_depth; + status = build_sequence(&subargs); + if (status != RE_ERROR_SUCCESS) + return status; + + if (subargs.code[0] != RE_OP_END) + return RE_ERROR_ILLEGAL; + + args->code = subargs.code; + args->min_width += (Py_ssize_t)min_count * subargs.min_width; + args->has_captures |= subargs.has_captures; + args->is_fuzzy |= subargs.is_fuzzy; + args->has_groups |= subargs.has_groups; + args->has_repeats = TRUE; + + ++args->code; + + /* Is it a repeat of something which will match a single character? + * + * If it's in a fuzzy section then it won't be optimised as a + * single-character repeat. 
+ */ + if (sequence_matches_one(subargs.start)) { + repeat_node->op = greedy ? RE_OP_GREEDY_REPEAT_ONE : + RE_OP_LAZY_REPEAT_ONE; + + /* Append the new sequence. */ + add_node(args->end, repeat_node); + repeat_node->nonstring.next_2.node = subargs.start; + args->end = repeat_node; + } else { + RE_Node* end_repeat_node; + RE_Node* end_node; + + end_repeat_node = create_node(args->pattern, greedy ? + RE_OP_END_GREEDY_REPEAT : RE_OP_END_LAZY_REPEAT, 0, args->forward + ? 1 : -1, 4); + if (!end_repeat_node) + return RE_ERROR_MEMORY; + + end_repeat_node->values[0] = repeat_node->values[0]; + end_repeat_node->values[1] = repeat_node->values[1]; + end_repeat_node->values[2] = repeat_node->values[2]; + end_repeat_node->values[3] = args->forward; + + end_node = create_node(args->pattern, RE_OP_BRANCH, 0, 0, 0); + if (!end_node) + return RE_ERROR_MEMORY; + + /* Append the new sequence. */ + add_node(args->end, repeat_node); + add_node(repeat_node, subargs.start); + add_node(repeat_node, end_node); + add_node(subargs.end, end_repeat_node); + add_node(end_repeat_node, subargs.start); + add_node(end_repeat_node, end_node); + args->end = end_node; + } + } + + return RE_ERROR_SUCCESS; +} + +/* Builds a STRING node. */ +Py_LOCAL_INLINE(int) build_STRING(RE_CompileArgs* args, BOOL is_charset) { + RE_CODE flags; + RE_CODE length; + RE_UINT8 op; + Py_ssize_t step; + RE_Node* node; + size_t i; + + /* codes: opcode, flags, length, characters. */ + flags = args->code[1]; + length = args->code[2]; + if (args->code + 3 + length > args->end_code) + return RE_ERROR_ILLEGAL; + + op = (RE_UINT8)args->code[0]; + + step = get_step(op); + + /* Create the node. */ + node = create_node(args->pattern, op, flags, step * (Py_ssize_t)length, + length); + if (!node) + return RE_ERROR_MEMORY; + if (!is_charset) + node->status |= RE_STATUS_STRING; + + for (i = 0; i < length; i++) + node->values[i] = args->code[3 + i]; + + args->code += 3 + length; + + /* Append the node. 
*/ + add_node(args->end, node); + args->end = node; + + /* Because of full case-folding, one character in the text could match + * multiple characters in the pattern. + */ + if (op == RE_OP_STRING_FLD || op == RE_OP_STRING_FLD_REV) + args->min_width += possible_unfolded_length((Py_ssize_t)length); + else + args->min_width += (Py_ssize_t)length; + + return RE_ERROR_SUCCESS; +} + +/* Builds a SET node. */ +Py_LOCAL_INLINE(int) build_SET(RE_CompileArgs* args) { + RE_UINT8 op; + RE_CODE flags; + Py_ssize_t step; + RE_Node* node; + Py_ssize_t min_width; + int status; + + /* codes: opcode, flags, members. */ + op = (RE_UINT8)args->code[0]; + flags = args->code[1]; + + step = get_step(op); + + if (flags & RE_ZEROWIDTH_OP) + step = 0; + + node = create_node(args->pattern, op, flags, step, 0); + if (!node) + return RE_ERROR_MEMORY; + + args->code += 2; + + /* Append the node. */ + add_node(args->end, node); + args->end = node; + + min_width = args->min_width; + + /* Compile the character set. */ + do { + switch (args->code[0]) { + case RE_OP_CHARACTER: + case RE_OP_PROPERTY: + status = build_CHARACTER_or_PROPERTY(args); + if (status != RE_ERROR_SUCCESS) + return status; + break; + case RE_OP_RANGE: + status = build_RANGE(args); + if (status != RE_ERROR_SUCCESS) + return status; + break; + case RE_OP_SET_DIFF: + case RE_OP_SET_INTER: + case RE_OP_SET_SYM_DIFF: + case RE_OP_SET_UNION: + status = build_SET(args); + if (status != RE_ERROR_SUCCESS) + return status; + break; + case RE_OP_STRING: + /* A set of characters. */ + if (!build_STRING(args, TRUE)) + return FALSE; + break; + default: + /* Illegal opcode for a character set. */ + return RE_ERROR_ILLEGAL; + } + } while (args->code < args->end_code && args->code[0] != RE_OP_END); + + /* Check that we've reached the end correctly. (The last opcode should be + * 'END'.) 
+ */ + if (args->code >= args->end_code || args->code[0] != RE_OP_END) + return RE_ERROR_ILLEGAL; + + ++args->code; + + /* At this point the set's members are in the main sequence. They need to + * be moved out-of-line. + */ + node->nonstring.next_2.node = node->next_1.node; + node->next_1.node = NULL; + args->end = node; + + args->min_width = min_width; + + if (step != 0) + ++args->min_width; + + return RE_ERROR_SUCCESS; +} + +/* Builds a STRING_SET node. */ +Py_LOCAL_INLINE(int) build_STRING_SET(RE_CompileArgs* args) { + RE_CODE index; + RE_CODE min_len; + RE_CODE max_len; + RE_Node* node; + + /* codes: opcode, index, min_len, max_len. */ + if (args->code + 3 > args->end_code) + return RE_ERROR_ILLEGAL; + + index = args->code[1]; + min_len = args->code[2]; + max_len = args->code[3]; + node = create_node(args->pattern, (RE_UINT8)args->code[0], 0, 0, 3); + if (!node) + return RE_ERROR_MEMORY; + + node->values[0] = index; + node->values[1] = min_len; + node->values[2] = max_len; + + args->code += 4; + + /* Append the reference. */ + add_node(args->end, node); + args->end = node; + + return RE_ERROR_SUCCESS; +} + +/* Builds a SUCCESS node . */ +Py_LOCAL_INLINE(int) build_SUCCESS(RE_CompileArgs* args) { + RE_Node* node; + /* code: opcode. */ + + /* Create the node. */ + node = create_node(args->pattern, (RE_UINT8)args->code[0], 0, 0, 0); + if (!node) + return RE_ERROR_MEMORY; + + ++args->code; + + /* Append the node. */ + add_node(args->end, node); + args->end = node; + + return RE_ERROR_SUCCESS; +} + +/* Builds a zero-width node. */ +Py_LOCAL_INLINE(int) build_zerowidth(RE_CompileArgs* args) { + RE_CODE flags; + RE_Node* node; + + /* codes: opcode, flags. */ + if (args->code + 1 > args->end_code) + return RE_ERROR_ILLEGAL; + + flags = args->code[1]; + + /* Create the node. */ + node = create_node(args->pattern, (RE_UINT8)args->code[0], flags, 0, 0); + if (!node) + return RE_ERROR_MEMORY; + + args->code += 2; + + /* Append the node. 
*/ + add_node(args->end, node); + args->end = node; + + return RE_ERROR_SUCCESS; +} + +/* Builds a sequence of nodes from regular expression code. */ +Py_LOCAL_INLINE(int) build_sequence(RE_CompileArgs* args) { + int status; + + /* Guarantee that there's something to attach to. */ + args->start = create_node(args->pattern, RE_OP_BRANCH, 0, 0, 0); + args->end = args->start; + + args->min_width = 0; + args->has_captures = FALSE; + args->is_fuzzy = FALSE; + args->has_groups = FALSE; + args->has_repeats = FALSE; + + /* The sequence should end with an opcode we don't understand. If it + * doesn't then the code is illegal. + */ + while (args->code < args->end_code) { + /* The following code groups opcodes by format, not function. */ + switch (args->code[0]) { + case RE_OP_ANY: + case RE_OP_ANY_ALL: + case RE_OP_ANY_ALL_REV: + case RE_OP_ANY_REV: + case RE_OP_ANY_U: + case RE_OP_ANY_U_REV: + /* A simple opcode with no trailing codewords and width of 1. */ + status = build_ANY(args); + if (status != RE_ERROR_SUCCESS) + return status; + break; + case RE_OP_ATOMIC: + /* An atomic sequence. */ + status = build_ATOMIC(args); + if (status != RE_ERROR_SUCCESS) + return status; + break; + case RE_OP_BOUNDARY: + case RE_OP_DEFAULT_BOUNDARY: + case RE_OP_DEFAULT_END_OF_WORD: + case RE_OP_DEFAULT_START_OF_WORD: + case RE_OP_END_OF_WORD: + case RE_OP_GRAPHEME_BOUNDARY: + case RE_OP_KEEP: + case RE_OP_SKIP: + case RE_OP_START_OF_WORD: + /* A word or grapheme boundary. */ + status = build_BOUNDARY(args); + if (status != RE_ERROR_SUCCESS) + return status; + break; + case RE_OP_BRANCH: + /* A 2-way branch. */ + status = build_BRANCH(args); + if (status != RE_ERROR_SUCCESS) + return status; + break; + case RE_OP_CALL_REF: + /* A group call ref. 
*/ + status = build_CALL_REF(args); + if (status != RE_ERROR_SUCCESS) + return status; + break; + case RE_OP_CHARACTER: + case RE_OP_CHARACTER_IGN: + case RE_OP_CHARACTER_IGN_REV: + case RE_OP_CHARACTER_REV: + case RE_OP_PROPERTY: + case RE_OP_PROPERTY_IGN: + case RE_OP_PROPERTY_IGN_REV: + case RE_OP_PROPERTY_REV: + /* A character literal or a property. */ + status = build_CHARACTER_or_PROPERTY(args); + if (status != RE_ERROR_SUCCESS) + return status; + break; + case RE_OP_CONDITIONAL: + /* A lookaround conditional. */ + status = build_CONDITIONAL(args); + if (status != RE_ERROR_SUCCESS) + return status; + break; + case RE_OP_END_OF_LINE: + case RE_OP_END_OF_LINE_U: + case RE_OP_END_OF_STRING: + case RE_OP_END_OF_STRING_LINE: + case RE_OP_END_OF_STRING_LINE_U: + case RE_OP_SEARCH_ANCHOR: + case RE_OP_START_OF_LINE: + case RE_OP_START_OF_LINE_U: + case RE_OP_START_OF_STRING: + /* A simple opcode with no trailing codewords and width of 0. */ + status = build_zerowidth(args); + if (status != RE_ERROR_SUCCESS) + return status; + break; + case RE_OP_FAILURE: + case RE_OP_PRUNE: + case RE_OP_SUCCESS: + status = build_SUCCESS(args); + if (status != RE_ERROR_SUCCESS) + return status; + break; + case RE_OP_FUZZY: + /* A fuzzy sequence. */ + status = build_FUZZY(args); + if (status != RE_ERROR_SUCCESS) + return status; + break; + case RE_OP_GREEDY_REPEAT: + case RE_OP_LAZY_REPEAT: + /* A repeated sequence. */ + status = build_REPEAT(args); + if (status != RE_ERROR_SUCCESS) + return status; + break; + case RE_OP_GROUP: + /* A capture group. */ + status = build_GROUP(args); + if (status != RE_ERROR_SUCCESS) + return status; + break; + case RE_OP_GROUP_CALL: + /* A group call. */ + status = build_GROUP_CALL(args); + if (status != RE_ERROR_SUCCESS) + return status; + break; + case RE_OP_GROUP_EXISTS: + /* A conditional sequence. */ + status = build_GROUP_EXISTS(args); + if (status != RE_ERROR_SUCCESS) + return status; + break; + case RE_OP_LOOKAROUND: + /* A lookaround. 
*/ + status = build_LOOKAROUND(args); + if (status != RE_ERROR_SUCCESS) + return status; + break; + case RE_OP_RANGE: + case RE_OP_RANGE_IGN: + case RE_OP_RANGE_IGN_REV: + case RE_OP_RANGE_REV: + /* A range. */ + status = build_RANGE(args); + if (status != RE_ERROR_SUCCESS) + return status; + break; + case RE_OP_REF_GROUP: + case RE_OP_REF_GROUP_FLD: + case RE_OP_REF_GROUP_FLD_REV: + case RE_OP_REF_GROUP_IGN: + case RE_OP_REF_GROUP_IGN_REV: + case RE_OP_REF_GROUP_REV: + /* A reference to a group. */ + status = build_REF_GROUP(args); + if (status != RE_ERROR_SUCCESS) + return status; + break; + case RE_OP_SET_DIFF: + case RE_OP_SET_DIFF_IGN: + case RE_OP_SET_DIFF_IGN_REV: + case RE_OP_SET_DIFF_REV: + case RE_OP_SET_INTER: + case RE_OP_SET_INTER_IGN: + case RE_OP_SET_INTER_IGN_REV: + case RE_OP_SET_INTER_REV: + case RE_OP_SET_SYM_DIFF: + case RE_OP_SET_SYM_DIFF_IGN: + case RE_OP_SET_SYM_DIFF_IGN_REV: + case RE_OP_SET_SYM_DIFF_REV: + case RE_OP_SET_UNION: + case RE_OP_SET_UNION_IGN: + case RE_OP_SET_UNION_IGN_REV: + case RE_OP_SET_UNION_REV: + /* A set. */ + status = build_SET(args); + if (status != RE_ERROR_SUCCESS) + return status; + break; + case RE_OP_STRING: + case RE_OP_STRING_FLD: + case RE_OP_STRING_FLD_REV: + case RE_OP_STRING_IGN: + case RE_OP_STRING_IGN_REV: + case RE_OP_STRING_REV: + /* A string literal. */ + if (!build_STRING(args, FALSE)) + return FALSE; + break; + case RE_OP_STRING_SET: + case RE_OP_STRING_SET_FLD: + case RE_OP_STRING_SET_FLD_REV: + case RE_OP_STRING_SET_IGN: + case RE_OP_STRING_SET_IGN_REV: + case RE_OP_STRING_SET_REV: + /* A reference to a list. */ + status = build_STRING_SET(args); + if (status != RE_ERROR_SUCCESS) + return status; + break; + default: + /* We've found an opcode which we don't recognise. We'll leave it + * for the caller. + */ + return RE_ERROR_SUCCESS; + } + } + + /* If we're here then we should be at the end of the code, otherwise we + * have an error. 
+ */ + return args->code == args->end_code; +} + +/* Compiles the regular expression code to 'nodes'. + * + * Various details about the regular expression are discovered during + * compilation and stored in the PatternObject. + */ +Py_LOCAL_INLINE(BOOL) compile_to_nodes(RE_CODE* code, RE_CODE* end_code, + PatternObject* pattern) { + RE_CompileArgs args; + int status; + + /* Compile a regex sequence and then check that we've reached the end + * correctly. (The last opcode should be 'SUCCESS'.) + * + * If successful, 'start' and 'end' will point to the start and end nodes + * of the compiled sequence. + */ + args.code = code; + args.end_code = end_code; + args.pattern = pattern; + args.forward = (pattern->flags & RE_FLAG_REVERSE) == 0; + args.visible_captures = FALSE; + args.has_captures = FALSE; + args.repeat_depth = 0; + args.is_fuzzy = FALSE; + args.within_fuzzy = FALSE; + status = build_sequence(&args); + if (status == RE_ERROR_ILLEGAL) + set_error(RE_ERROR_ILLEGAL, NULL); + + if (status != RE_ERROR_SUCCESS) + return FALSE; + + pattern->min_width = args.min_width; + pattern->is_fuzzy = args.is_fuzzy; + pattern->do_search_start = TRUE; + pattern->start_node = args.start; + + /* Optimise the pattern. */ + if (!optimise_pattern(pattern)) + return FALSE; + + pattern->start_test = locate_test_start(pattern->start_node); + + /* Get the call_ref for the entire pattern, if any. */ + if (pattern->start_node->op == RE_OP_CALL_REF) + pattern->pattern_call_ref = (Py_ssize_t)pattern->start_node->values[0]; + else + pattern->pattern_call_ref = -1; + + return TRUE; +} + +/* Gets the required characters for a regex. + * + * In the event of an error, it just pretends that there are no required + * characters. 
+ */ +Py_LOCAL_INLINE(void) get_required_chars(PyObject* required_chars, RE_CODE** + req_chars, size_t* req_length) { + Py_ssize_t len; + RE_CODE* chars; + Py_ssize_t i; + + *req_chars = NULL; + *req_length = 0; + + len = PyTuple_GET_SIZE(required_chars); + if (len < 1 || PyErr_Occurred()) { + PyErr_Clear(); + return; + } + + chars = (RE_CODE*)re_alloc((size_t)len * sizeof(RE_CODE)); + if (!chars) + goto error; + + for (i = 0; i < len; i++) { + PyObject* o; + size_t value; + + /* PyTuple_SET_ITEM borrows the reference. */ + o = PyTuple_GET_ITEM(required_chars, i); + + value = PyLong_AsUnsignedLong(o); + if ((Py_ssize_t)value == -1 && PyErr_Occurred()) + goto error; + + chars[i] = (RE_CODE)value; + if (chars[i] != value) + goto error; + } + + *req_chars = chars; + *req_length = (size_t)len; + + return; + +error: + PyErr_Clear(); + re_dealloc(chars); +} + +/* Makes a STRING node. */ +Py_LOCAL_INLINE(RE_Node*) make_STRING_node(PatternObject* pattern, RE_UINT8 op, + size_t length, RE_CODE* chars) { + Py_ssize_t step; + RE_Node* node; + size_t i; + + step = get_step(op); + + /* Create the node. */ + node = create_node(pattern, op, 0, step * (Py_ssize_t)length, length); + if (!node) + return NULL; + + node->status |= RE_STATUS_STRING; + + for (i = 0; i < length; i++) + node->values[i] = chars[i]; + + return node; +} + +/* Scans all of the characters in the current locale for their properties. 
*/ +Py_LOCAL_INLINE(void) scan_locale_chars(RE_LocaleInfo* locale_info) { + int c; + + for (c = 0; c < 0x100; c++) { + unsigned short props = 0; + + if (isalnum(c)) + props |= RE_LOCALE_ALNUM; + if (isalpha(c)) + props |= RE_LOCALE_ALPHA; + if (iscntrl(c)) + props |= RE_LOCALE_CNTRL; + if (isdigit(c)) + props |= RE_LOCALE_DIGIT; + if (isgraph(c)) + props |= RE_LOCALE_GRAPH; + if (islower(c)) + props |= RE_LOCALE_LOWER; + if (isprint(c)) + props |= RE_LOCALE_PRINT; + if (ispunct(c)) + props |= RE_LOCALE_PUNCT; + if (isspace(c)) + props |= RE_LOCALE_SPACE; + if (isupper(c)) + props |= RE_LOCALE_UPPER; + + locale_info->properties[c] = props; + locale_info->uppercase[c] = (unsigned char)toupper(c); + locale_info->lowercase[c] = (unsigned char)tolower(c); + } +} + +/* Compiles regular expression code to a PatternObject. + * + * The regular expression code is provided as a list and is then compiled to + * 'nodes'. Various details about the regular expression are discovered during + * compilation and stored in the PatternObject. + */ +static PyObject* re_compile(PyObject* self_, PyObject* args) { + PyObject* pattern; + Py_ssize_t flags = 0; + PyObject* code_list; + PyObject* groupindex; + PyObject* indexgroup; + PyObject* named_lists; + PyObject* named_list_indexes; + Py_ssize_t req_offset; + PyObject* required_chars; + Py_ssize_t req_flags; + size_t public_group_count; + Py_ssize_t code_len; + RE_CODE* code; + Py_ssize_t i; + RE_CODE* req_chars; + size_t req_length; + PatternObject* self; + BOOL unicode; + BOOL locale; + BOOL ascii; + BOOL ok; + + if (!PyArg_ParseTuple(args, "OnOOOOOnOnn:re_compile", &pattern, &flags, + &code_list, &groupindex, &indexgroup, &named_lists, &named_list_indexes, + &req_offset, &required_chars, &req_flags, &public_group_count)) + return NULL; + + /* Read the regex code. 
*/ + code_len = PyList_GET_SIZE(code_list); + code = (RE_CODE*)re_alloc((size_t)code_len * sizeof(RE_CODE)); + if (!code) + return NULL; + + for (i = 0; i < code_len; i++) { + PyObject* o; + size_t value; + + /* PyList_GET_ITEM borrows a reference. */ + o = PyList_GET_ITEM(code_list, i); + + value = PyLong_AsUnsignedLong(o); + if ((Py_ssize_t)value == -1 && PyErr_Occurred()) + goto error; + + code[i] = (RE_CODE)value; + if (code[i] != value) + goto error; + } + + /* Get the required characters. */ + get_required_chars(required_chars, &req_chars, &req_length); + + /* Create the PatternObject. */ + self = PyObject_NEW(PatternObject, &Pattern_Type); + if (!self) { + set_error(RE_ERROR_MEMORY, NULL); + re_dealloc(req_chars); + re_dealloc(code); + return NULL; + } + + /* Initialise the PatternObject. */ + self->pattern = pattern; + self->flags = flags; + self->weakreflist = NULL; + self->start_node = NULL; + self->repeat_count = 0; + self->true_group_count = 0; + self->public_group_count = public_group_count; + self->group_end_index = 0; + self->groupindex = groupindex; + self->indexgroup = indexgroup; + self->named_lists = named_lists; + self->named_lists_count = (size_t)PyDict_Size(named_lists); + self->partial_named_lists[0] = NULL; + self->partial_named_lists[1] = NULL; + self->named_list_indexes = named_list_indexes; + self->node_capacity = 0; + self->node_count = 0; + self->node_list = NULL; + self->group_info_capacity = 0; + self->group_info = NULL; + self->call_ref_info_capacity = 0; + self->call_ref_info_count = 0; + self->call_ref_info = NULL; + self->repeat_info_capacity = 0; + self->repeat_info = NULL; + self->groups_storage = NULL; + self->repeats_storage = NULL; + self->fuzzy_count = 0; + self->recursive = FALSE; + self->req_offset = req_offset; + self->req_string = NULL; + self->locale_info = NULL; + Py_INCREF(self->pattern); + Py_INCREF(self->groupindex); + Py_INCREF(self->indexgroup); + Py_INCREF(self->named_lists); + 
Py_INCREF(self->named_list_indexes); + + /* Initialise the character encoding. */ + unicode = (flags & RE_FLAG_UNICODE) != 0; + locale = (flags & RE_FLAG_LOCALE) != 0; + ascii = (flags & RE_FLAG_ASCII) != 0; + if (!unicode && !locale && !ascii) { + if (PyString_Check(self->pattern)) + ascii = RE_FLAG_ASCII; + else + unicode = RE_FLAG_UNICODE; + } + if (unicode) + self->encoding = &unicode_encoding; + else if (locale) + self->encoding = &locale_encoding; + else if (ascii) + self->encoding = &ascii_encoding; + + /* Compile the regular expression code to nodes. */ + ok = compile_to_nodes(code, code + code_len, self); + + /* We no longer need the regular expression code. */ + re_dealloc(code); + + if (!ok) { + Py_DECREF(self); + re_dealloc(req_chars); + return NULL; + } + + /* Make a node for the required string, if there's one. */ + if (req_chars) { + /* Remove the FULLCASE flag if it's not a Unicode pattern or not + * ignoring case. + */ + if (!(self->flags & RE_FLAG_UNICODE) || !(self->flags & + RE_FLAG_IGNORECASE)) + req_flags &= ~RE_FLAG_FULLCASE; + + if (self->flags & RE_FLAG_REVERSE) { + switch (req_flags) { + case 0: + self->req_string = make_STRING_node(self, RE_OP_STRING_REV, + req_length, req_chars); + break; + case RE_FLAG_IGNORECASE | RE_FLAG_FULLCASE: + self->req_string = make_STRING_node(self, RE_OP_STRING_FLD_REV, + req_length, req_chars); + break; + case RE_FLAG_IGNORECASE: + self->req_string = make_STRING_node(self, RE_OP_STRING_IGN_REV, + req_length, req_chars); + break; + } + } else { + switch (req_flags) { + case 0: + self->req_string = make_STRING_node(self, RE_OP_STRING, + req_length, req_chars); + break; + case RE_FLAG_IGNORECASE | RE_FLAG_FULLCASE: + self->req_string = make_STRING_node(self, RE_OP_STRING_FLD, + req_length, req_chars); + break; + case RE_FLAG_IGNORECASE: + self->req_string = make_STRING_node(self, RE_OP_STRING_IGN, + req_length, req_chars); + break; + } + } + + re_dealloc(req_chars); + } + + if (locale) { + /* Store info about 
the characters in the locale for locale-sensitive + * matching. + */ + self->locale_info = re_alloc(sizeof(RE_LocaleInfo)); + if (!self->locale_info) { + Py_DECREF(self); + return NULL; + } + + scan_locale_chars(self->locale_info); + } + + return (PyObject*)self; + +error: + re_dealloc(code); + set_error(RE_ERROR_ILLEGAL, NULL); + return NULL; +} + +/* Gets the size of the codewords. */ +static PyObject* get_code_size(PyObject* self, PyObject* unused) { + return Py_BuildValue("n", sizeof(RE_CODE)); +} + +/* Gets the property dict. */ +static PyObject* get_properties(PyObject* self_, PyObject* args) { + Py_INCREF(property_dict); + + return property_dict; +} + +/* Folds the case of a string. */ +static PyObject* fold_case(PyObject* self_, PyObject* args) { + RE_StringInfo str_info; + Py_UCS4 (*char_at)(void* text, Py_ssize_t pos); + RE_EncodingTable* encoding; + RE_LocaleInfo locale_info; + Py_ssize_t folded_charsize; + void (*set_char_at)(void* text, Py_ssize_t pos, Py_UCS4 ch); + Py_ssize_t buf_size; + void* folded; + Py_ssize_t folded_len; + PyObject* result; + + Py_ssize_t flags; + PyObject* string; + if (!PyArg_ParseTuple(args, "nO:fold_case", &flags, &string)) + return NULL; + + if (!(flags & RE_FLAG_IGNORECASE)) { + Py_INCREF(string); + return string; + } + + /* Get the string. */ + if (!get_string(string, &str_info)) + return NULL; + + /* Get the function for reading from the original string. */ + switch (str_info.charsize) { + case 1: + char_at = bytes1_char_at; + break; + case 2: + char_at = bytes2_char_at; + break; + case 4: + char_at = bytes4_char_at; + break; + default: +#if PY_VERSION_HEX >= 0x02060000 + release_buffer(&str_info); + +#endif + return NULL; + } + + /* What's the encoding? 
*/ + if (flags & RE_FLAG_UNICODE) + encoding = &unicode_encoding; + else if (flags & RE_FLAG_LOCALE) { + encoding = &locale_encoding; + scan_locale_chars(&locale_info); + } else if (flags & RE_FLAG_ASCII) + encoding = &ascii_encoding; + else + encoding = &unicode_encoding; + + /* The folded string will have the same width as the original string. */ + folded_charsize = str_info.charsize; + + /* Get the function for writing to the folded string. */ + switch (folded_charsize) { + case 1: + set_char_at = bytes1_set_char_at; + break; + case 2: + set_char_at = bytes2_set_char_at; + break; + case 4: + set_char_at = bytes4_set_char_at; + break; + default: +#if PY_VERSION_HEX >= 0x02060000 + release_buffer(&str_info); + +#endif + return NULL; + } + + /* Allocate a buffer for the folded string. */ + if (flags & RE_FLAG_FULLCASE) + /* When using full case-folding with Unicode, some single codepoints + * are mapped to multiple codepoints. + */ + buf_size = str_info.length * RE_MAX_FOLDED; + else + buf_size = str_info.length; + + folded = re_alloc((size_t)(buf_size * folded_charsize)); + if (!folded) { +#if PY_VERSION_HEX >= 0x02060000 + release_buffer(&str_info); + +#endif + return NULL; + } + + /* Fold the case of the string. */ + folded_len = 0; + + if (flags & RE_FLAG_FULLCASE) { + /* Full case-folding. */ + int (*full_case_fold)(RE_LocaleInfo* locale_info, Py_UCS4 ch, Py_UCS4* + folded); + Py_ssize_t i; + Py_UCS4 codepoints[RE_MAX_FOLDED]; + + full_case_fold = encoding->full_case_fold; + + for (i = 0; i < str_info.length; i++) { + int count; + int j; + + count = full_case_fold(&locale_info, char_at(str_info.characters, + i), codepoints); + for (j = 0; j < count; j++) + set_char_at(folded, folded_len + j, codepoints[j]); + + folded_len += count; + } + } else { + /* Simple case-folding. 
*/ + Py_UCS4 (*simple_case_fold)(RE_LocaleInfo* locale_info, Py_UCS4 ch); + Py_ssize_t i; + + simple_case_fold = encoding->simple_case_fold; + + for (i = 0; i < str_info.length; i++) { + Py_UCS4 ch; + + ch = simple_case_fold(&locale_info, char_at(str_info.characters, + i)); + set_char_at(folded, i, ch); + } + + folded_len = str_info.length; + } + + /* Build the result string. */ + if (str_info.is_unicode) + result = build_unicode_value(folded, folded_len, folded_charsize); + else + result = build_bytes_value(folded, folded_len, folded_charsize); + + re_dealloc(folded); + +#if PY_VERSION_HEX >= 0x02060000 + /* Release the original string's buffer. */ + release_buffer(&str_info); + +#endif + return result; +} + +/* Returns a tuple of the Unicode characters that expand on full case-folding. + */ +static PyObject* get_expand_on_folding(PyObject* self, PyObject* unused) { + int count; + PyObject* result; + int i; + + /* How many characters are there? */ + count = sizeof(re_expand_on_folding) / sizeof(re_expand_on_folding[0]); + + /* Put all the characters in a tuple. */ + result = PyTuple_New(count); + if (!result) + return NULL; + + for (i = 0; i < count; i++) { + Py_UNICODE codepoint; + PyObject* item; + + codepoint = re_expand_on_folding[i]; + + item = build_unicode_value(&codepoint, 1, sizeof(codepoint)); + if (!item) + goto error; + + /* PyTuple_SetItem borrows the reference. */ + PyTuple_SetItem(result, i, item); + } + + return result; + +error: + Py_DECREF(result); + return NULL; +} + +/* Returns whether a character has a given value for a Unicode property. */ +static PyObject* has_property_value(PyObject* self_, PyObject* args) { + BOOL v; + + Py_ssize_t property_value; + Py_ssize_t character; + if (!PyArg_ParseTuple(args, "nn:has_property_value", &property_value, + &character)) + return NULL; + + v = unicode_has_property((RE_CODE)property_value, (Py_UCS4)character) ? 
1 : + 0; + + return Py_BuildValue("n", v); +} + +/* Returns a list of all the simple cases of a character. + * + * If full case-folding is turned on and the character also expands on full + * case-folding, a None is appended to the list. + */ +static PyObject* get_all_cases(PyObject* self_, PyObject* args) { + RE_EncodingTable* encoding; + RE_LocaleInfo locale_info; + int count; + Py_UCS4 cases[RE_MAX_CASES]; + PyObject* result; + int i; + Py_UCS4 folded[RE_MAX_FOLDED]; + + Py_ssize_t flags; + Py_ssize_t character; + if (!PyArg_ParseTuple(args, "nn:get_all_cases", &flags, &character)) + return NULL; + + /* What's the encoding? */ + if (flags & RE_FLAG_UNICODE) + encoding = &unicode_encoding; + else if (flags & RE_FLAG_LOCALE) { + encoding = &locale_encoding; + scan_locale_chars(&locale_info); + } else if (flags & RE_FLAG_ASCII) + encoding = &ascii_encoding; + else + encoding = &ascii_encoding; + + /* Get all the simple cases. */ + count = encoding->all_cases(&locale_info, (Py_UCS4)character, cases); + + result = PyList_New(count); + if (!result) + return NULL; + + for (i = 0; i < count; i++) { + PyObject* item; + + item = Py_BuildValue("n", cases[i]); + if (!item) + goto error; + + /* PyList_SetItem borrows the reference. */ + PyList_SetItem(result, i, item); + } + + /* If the character also expands on full case-folding, append a None. */ + if ((flags & RE_FULL_CASE_FOLDING) == RE_FULL_CASE_FOLDING) { + count = encoding->full_case_fold(&locale_info, (Py_UCS4)character, + folded); + if (count > 1) + PyList_Append(result, Py_None); + } + + return result; + +error: + Py_DECREF(result); + return NULL; +} + +/* The table of the module's functions. 
*/ +static PyMethodDef _functions[] = { + {"compile", (PyCFunction)re_compile, METH_VARARGS}, + {"get_code_size", (PyCFunction)get_code_size, METH_NOARGS}, + {"get_properties", (PyCFunction)get_properties, METH_VARARGS}, + {"fold_case", (PyCFunction)fold_case, METH_VARARGS}, + {"get_expand_on_folding", (PyCFunction)get_expand_on_folding, METH_NOARGS}, + {"has_property_value", (PyCFunction)has_property_value, METH_VARARGS}, + {"get_all_cases", (PyCFunction)get_all_cases, METH_VARARGS}, + {NULL, NULL} +}; + +/* Initialises the property dictionary. */ +Py_LOCAL_INLINE(BOOL) init_property_dict(void) { + size_t value_set_count; + size_t i; + PyObject** value_dicts; + + property_dict = NULL; + + /* How many value sets are there? */ + value_set_count = 0; + + for (i = 0; i < sizeof(re_property_values) / sizeof(re_property_values[0]); + i++) { + RE_PropertyValue* value; + + value = &re_property_values[i]; + if (value->value_set >= value_set_count) + value_set_count = (size_t)value->value_set + 1; + } + + /* Quick references for the value sets. */ + value_dicts = (PyObject**)re_alloc(value_set_count * + sizeof(value_dicts[0])); + if (!value_dicts) + return FALSE; + + memset(value_dicts, 0, value_set_count * sizeof(value_dicts[0])); + + /* Build the property values dictionaries. */ + for (i = 0; i < sizeof(re_property_values) / sizeof(re_property_values[0]); + i++) { + RE_PropertyValue* value; + PyObject* v; + int status; + + value = &re_property_values[i]; + if (!value_dicts[value->value_set]) { + value_dicts[value->value_set] = PyDict_New(); + if (!value_dicts[value->value_set]) + goto error; + } + + v = Py_BuildValue("i", value->id); + if (!v) + goto error; + + status = PyDict_SetItemString(value_dicts[value->value_set], + re_strings[value->name], v); + Py_DECREF(v); + if (status < 0) + goto error; + } + + /* Build the property dictionary. 
*/ + property_dict = PyDict_New(); + if (!property_dict) + goto error; + + for (i = 0; i < sizeof(re_properties) / sizeof(re_properties[0]); i++) { + RE_Property* property; + PyObject* v; + int status; + + property = &re_properties[i]; + v = Py_BuildValue("iO", property->id, + value_dicts[property->value_set]); + if (!v) + goto error; + + status = PyDict_SetItemString(property_dict, + re_strings[property->name], v); + Py_DECREF(v); + if (status < 0) + goto error; + } + + /* DECREF the value sets. Any unused ones will be deallocated. */ + for (i = 0; i < value_set_count; i++) + Py_XDECREF(value_dicts[i]); + + re_dealloc(value_dicts); + + return TRUE; + +error: + Py_XDECREF(property_dict); + + /* DECREF the value sets. */ + for (i = 0; i < value_set_count; i++) + Py_XDECREF(value_dicts[i]); + + re_dealloc(value_dicts); + + return FALSE; +} + +/* Initialises the module. */ +PyMODINIT_FUNC init_regex(void) { + PyObject* m; + PyObject* d; + PyObject* x; + +#if defined(VERBOSE) + /* Unbuffered in case it crashes! */ + setvbuf(stdout, NULL, _IONBF, 0); + +#endif + /* Initialise Pattern_Type. */ + Pattern_Type.tp_dealloc = pattern_dealloc; + Pattern_Type.tp_repr = pattern_repr; + Pattern_Type.tp_flags = Py_TPFLAGS_HAVE_WEAKREFS; + Pattern_Type.tp_doc = pattern_doc; + Pattern_Type.tp_weaklistoffset = offsetof(PatternObject, weakreflist); + Pattern_Type.tp_methods = pattern_methods; + Pattern_Type.tp_members = pattern_members; + Pattern_Type.tp_getset = pattern_getset; + + /* Initialise Match_Type. */ + Match_Type.tp_dealloc = match_dealloc; + Match_Type.tp_repr = match_repr; + Match_Type.tp_as_mapping = &match_as_mapping; + Match_Type.tp_flags = Py_TPFLAGS_DEFAULT; + Match_Type.tp_doc = match_doc; + Match_Type.tp_methods = match_methods; + Match_Type.tp_members = match_members; + Match_Type.tp_getset = match_getset; + + /* Initialise Scanner_Type. 
*/ + Scanner_Type.tp_dealloc = scanner_dealloc; + Scanner_Type.tp_flags = Py_TPFLAGS_DEFAULT; + Scanner_Type.tp_doc = scanner_doc; + Scanner_Type.tp_iter = scanner_iter; + Scanner_Type.tp_iternext = scanner_iternext; + Scanner_Type.tp_methods = scanner_methods; + Scanner_Type.tp_members = scanner_members; + + /* Initialise Splitter_Type. */ + Splitter_Type.tp_dealloc = splitter_dealloc; + Splitter_Type.tp_flags = Py_TPFLAGS_DEFAULT; + Splitter_Type.tp_doc = splitter_doc; + Splitter_Type.tp_iter = splitter_iter; + Splitter_Type.tp_iternext = splitter_iternext; + Splitter_Type.tp_methods = splitter_methods; + Splitter_Type.tp_members = splitter_members; +#if PY_VERSION_HEX >= 0x02060000 + + /* Initialise Capture_Type. */ + Capture_Type.tp_dealloc = capture_dealloc; + Capture_Type.tp_str = capture_str; + Capture_Type.tp_as_mapping = &capture_as_mapping; + Capture_Type.tp_flags = Py_TPFLAGS_DEFAULT; + Capture_Type.tp_methods = capture_methods; +#endif + + /* Initialize object types */ + if (PyType_Ready(&Pattern_Type) < 0) + return; + if (PyType_Ready(&Match_Type) < 0) + return; + if (PyType_Ready(&Scanner_Type) < 0) + return; + if (PyType_Ready(&Splitter_Type) < 0) + return; +#if PY_VERSION_HEX >= 0x02060000 + if (PyType_Ready(&Capture_Type) < 0) + return; +#endif + + error_exception = NULL; + + m = Py_InitModule("_" RE_MODULE, _functions); + if (!m) + return; + + d = PyModule_GetDict(m); + + x = PyInt_FromLong(RE_MAGIC); + if (x) { + PyDict_SetItemString(d, "MAGIC", x); + Py_DECREF(x); + } + + x = PyInt_FromLong(sizeof(RE_CODE)); + if (x) { + PyDict_SetItemString(d, "CODE_SIZE", x); + Py_DECREF(x); + } + + x = PyString_FromString(copyright); + if (x) { + PyDict_SetItemString(d, "copyright", x); + Py_DECREF(x); + } + + /* Initialise the property dictionary. 
*/ + if (!init_property_dict()) + return; +} + +/* vim:ts=4:sw=4:et */ diff --git a/lib/regex/_regex.h b/lib/regex/_regex.h new file mode 100644 index 0000000000000000000000000000000000000000..37ab8a9c6e7a46043c16d107c2eb7aa4400d4ae2 --- /dev/null +++ b/lib/regex/_regex.h @@ -0,0 +1,243 @@ +/* + * Secret Labs' Regular Expression Engine + * + * regular expression matching engine + * + * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved. + * + * NOTE: This file is generated by regex.py. If you need + * to change anything in here, edit regex.py and run it. + * + * 2010-01-16 mrab Re-written + */ + +/* Supports Unicode version 8.0.0. */ + +#define RE_MAGIC 20100116 + +#include "_regex_unicode.h" + +/* Operators. */ +#define RE_OP_FAILURE 0 +#define RE_OP_SUCCESS 1 +#define RE_OP_ANY 2 +#define RE_OP_ANY_ALL 3 +#define RE_OP_ANY_ALL_REV 4 +#define RE_OP_ANY_REV 5 +#define RE_OP_ANY_U 6 +#define RE_OP_ANY_U_REV 7 +#define RE_OP_ATOMIC 8 +#define RE_OP_BOUNDARY 9 +#define RE_OP_BRANCH 10 +#define RE_OP_CALL_REF 11 +#define RE_OP_CHARACTER 12 +#define RE_OP_CHARACTER_IGN 13 +#define RE_OP_CHARACTER_IGN_REV 14 +#define RE_OP_CHARACTER_REV 15 +#define RE_OP_CONDITIONAL 16 +#define RE_OP_DEFAULT_BOUNDARY 17 +#define RE_OP_DEFAULT_END_OF_WORD 18 +#define RE_OP_DEFAULT_START_OF_WORD 19 +#define RE_OP_END 20 +#define RE_OP_END_OF_LINE 21 +#define RE_OP_END_OF_LINE_U 22 +#define RE_OP_END_OF_STRING 23 +#define RE_OP_END_OF_STRING_LINE 24 +#define RE_OP_END_OF_STRING_LINE_U 25 +#define RE_OP_END_OF_WORD 26 +#define RE_OP_FUZZY 27 +#define RE_OP_GRAPHEME_BOUNDARY 28 +#define RE_OP_GREEDY_REPEAT 29 +#define RE_OP_GROUP 30 +#define RE_OP_GROUP_CALL 31 +#define RE_OP_GROUP_EXISTS 32 +#define RE_OP_KEEP 33 +#define RE_OP_LAZY_REPEAT 34 +#define RE_OP_LOOKAROUND 35 +#define RE_OP_NEXT 36 +#define RE_OP_PROPERTY 37 +#define RE_OP_PROPERTY_IGN 38 +#define RE_OP_PROPERTY_IGN_REV 39 +#define RE_OP_PROPERTY_REV 40 +#define RE_OP_PRUNE 41 +#define RE_OP_RANGE 42 +#define 
RE_OP_RANGE_IGN 43 +#define RE_OP_RANGE_IGN_REV 44 +#define RE_OP_RANGE_REV 45 +#define RE_OP_REF_GROUP 46 +#define RE_OP_REF_GROUP_FLD 47 +#define RE_OP_REF_GROUP_FLD_REV 48 +#define RE_OP_REF_GROUP_IGN 49 +#define RE_OP_REF_GROUP_IGN_REV 50 +#define RE_OP_REF_GROUP_REV 51 +#define RE_OP_SEARCH_ANCHOR 52 +#define RE_OP_SET_DIFF 53 +#define RE_OP_SET_DIFF_IGN 54 +#define RE_OP_SET_DIFF_IGN_REV 55 +#define RE_OP_SET_DIFF_REV 56 +#define RE_OP_SET_INTER 57 +#define RE_OP_SET_INTER_IGN 58 +#define RE_OP_SET_INTER_IGN_REV 59 +#define RE_OP_SET_INTER_REV 60 +#define RE_OP_SET_SYM_DIFF 61 +#define RE_OP_SET_SYM_DIFF_IGN 62 +#define RE_OP_SET_SYM_DIFF_IGN_REV 63 +#define RE_OP_SET_SYM_DIFF_REV 64 +#define RE_OP_SET_UNION 65 +#define RE_OP_SET_UNION_IGN 66 +#define RE_OP_SET_UNION_IGN_REV 67 +#define RE_OP_SET_UNION_REV 68 +#define RE_OP_SKIP 69 +#define RE_OP_START_OF_LINE 70 +#define RE_OP_START_OF_LINE_U 71 +#define RE_OP_START_OF_STRING 72 +#define RE_OP_START_OF_WORD 73 +#define RE_OP_STRING 74 +#define RE_OP_STRING_FLD 75 +#define RE_OP_STRING_FLD_REV 76 +#define RE_OP_STRING_IGN 77 +#define RE_OP_STRING_IGN_REV 78 +#define RE_OP_STRING_REV 79 +#define RE_OP_STRING_SET 80 +#define RE_OP_STRING_SET_FLD 81 +#define RE_OP_STRING_SET_FLD_REV 82 +#define RE_OP_STRING_SET_IGN 83 +#define RE_OP_STRING_SET_IGN_REV 84 +#define RE_OP_STRING_SET_REV 85 +#define RE_OP_BODY_END 86 +#define RE_OP_BODY_START 87 +#define RE_OP_END_ATOMIC 88 +#define RE_OP_END_CONDITIONAL 89 +#define RE_OP_END_FUZZY 90 +#define RE_OP_END_GREEDY_REPEAT 91 +#define RE_OP_END_GROUP 92 +#define RE_OP_END_LAZY_REPEAT 93 +#define RE_OP_END_LOOKAROUND 94 +#define RE_OP_GREEDY_REPEAT_ONE 95 +#define RE_OP_GROUP_RETURN 96 +#define RE_OP_LAZY_REPEAT_ONE 97 +#define RE_OP_MATCH_BODY 98 +#define RE_OP_MATCH_TAIL 99 +#define RE_OP_START_GROUP 100 + +char* re_op_text[] = { + "RE_OP_FAILURE", + "RE_OP_SUCCESS", + "RE_OP_ANY", + "RE_OP_ANY_ALL", + "RE_OP_ANY_ALL_REV", + "RE_OP_ANY_REV", + "RE_OP_ANY_U", + 
"RE_OP_ANY_U_REV", + "RE_OP_ATOMIC", + "RE_OP_BOUNDARY", + "RE_OP_BRANCH", + "RE_OP_CALL_REF", + "RE_OP_CHARACTER", + "RE_OP_CHARACTER_IGN", + "RE_OP_CHARACTER_IGN_REV", + "RE_OP_CHARACTER_REV", + "RE_OP_CONDITIONAL", + "RE_OP_DEFAULT_BOUNDARY", + "RE_OP_DEFAULT_END_OF_WORD", + "RE_OP_DEFAULT_START_OF_WORD", + "RE_OP_END", + "RE_OP_END_OF_LINE", + "RE_OP_END_OF_LINE_U", + "RE_OP_END_OF_STRING", + "RE_OP_END_OF_STRING_LINE", + "RE_OP_END_OF_STRING_LINE_U", + "RE_OP_END_OF_WORD", + "RE_OP_FUZZY", + "RE_OP_GRAPHEME_BOUNDARY", + "RE_OP_GREEDY_REPEAT", + "RE_OP_GROUP", + "RE_OP_GROUP_CALL", + "RE_OP_GROUP_EXISTS", + "RE_OP_KEEP", + "RE_OP_LAZY_REPEAT", + "RE_OP_LOOKAROUND", + "RE_OP_NEXT", + "RE_OP_PROPERTY", + "RE_OP_PROPERTY_IGN", + "RE_OP_PROPERTY_IGN_REV", + "RE_OP_PROPERTY_REV", + "RE_OP_PRUNE", + "RE_OP_RANGE", + "RE_OP_RANGE_IGN", + "RE_OP_RANGE_IGN_REV", + "RE_OP_RANGE_REV", + "RE_OP_REF_GROUP", + "RE_OP_REF_GROUP_FLD", + "RE_OP_REF_GROUP_FLD_REV", + "RE_OP_REF_GROUP_IGN", + "RE_OP_REF_GROUP_IGN_REV", + "RE_OP_REF_GROUP_REV", + "RE_OP_SEARCH_ANCHOR", + "RE_OP_SET_DIFF", + "RE_OP_SET_DIFF_IGN", + "RE_OP_SET_DIFF_IGN_REV", + "RE_OP_SET_DIFF_REV", + "RE_OP_SET_INTER", + "RE_OP_SET_INTER_IGN", + "RE_OP_SET_INTER_IGN_REV", + "RE_OP_SET_INTER_REV", + "RE_OP_SET_SYM_DIFF", + "RE_OP_SET_SYM_DIFF_IGN", + "RE_OP_SET_SYM_DIFF_IGN_REV", + "RE_OP_SET_SYM_DIFF_REV", + "RE_OP_SET_UNION", + "RE_OP_SET_UNION_IGN", + "RE_OP_SET_UNION_IGN_REV", + "RE_OP_SET_UNION_REV", + "RE_OP_SKIP", + "RE_OP_START_OF_LINE", + "RE_OP_START_OF_LINE_U", + "RE_OP_START_OF_STRING", + "RE_OP_START_OF_WORD", + "RE_OP_STRING", + "RE_OP_STRING_FLD", + "RE_OP_STRING_FLD_REV", + "RE_OP_STRING_IGN", + "RE_OP_STRING_IGN_REV", + "RE_OP_STRING_REV", + "RE_OP_STRING_SET", + "RE_OP_STRING_SET_FLD", + "RE_OP_STRING_SET_FLD_REV", + "RE_OP_STRING_SET_IGN", + "RE_OP_STRING_SET_IGN_REV", + "RE_OP_STRING_SET_REV", + "RE_OP_BODY_END", + "RE_OP_BODY_START", + "RE_OP_END_ATOMIC", + "RE_OP_END_CONDITIONAL", + 
"RE_OP_END_FUZZY", + "RE_OP_END_GREEDY_REPEAT", + "RE_OP_END_GROUP", + "RE_OP_END_LAZY_REPEAT", + "RE_OP_END_LOOKAROUND", + "RE_OP_GREEDY_REPEAT_ONE", + "RE_OP_GROUP_RETURN", + "RE_OP_LAZY_REPEAT_ONE", + "RE_OP_MATCH_BODY", + "RE_OP_MATCH_TAIL", + "RE_OP_START_GROUP", +}; + +#define RE_FLAG_ASCII 0x80 +#define RE_FLAG_BESTMATCH 0x1000 +#define RE_FLAG_DEBUG 0x200 +#define RE_FLAG_DOTALL 0x10 +#define RE_FLAG_ENHANCEMATCH 0x8000 +#define RE_FLAG_FULLCASE 0x4000 +#define RE_FLAG_IGNORECASE 0x2 +#define RE_FLAG_LOCALE 0x4 +#define RE_FLAG_MULTILINE 0x8 +#define RE_FLAG_POSIX 0x10000 +#define RE_FLAG_REVERSE 0x400 +#define RE_FLAG_TEMPLATE 0x1 +#define RE_FLAG_UNICODE 0x20 +#define RE_FLAG_VERBOSE 0x40 +#define RE_FLAG_VERSION0 0x2000 +#define RE_FLAG_VERSION1 0x100 +#define RE_FLAG_WORD 0x800 diff --git a/lib/regex/_regex.so b/lib/regex/_regex.so new file mode 100644 index 0000000000000000000000000000000000000000..f0a87043e5c65d0996b2a8dbc501bdeeed7685dc Binary files /dev/null and b/lib/regex/_regex.so differ diff --git a/lib/regex/_regex_core.py b/lib/regex/_regex_core.py new file mode 100644 index 0000000000000000000000000000000000000000..25771b6fe8ce9f8cea9048b2167313db7319e74a --- /dev/null +++ b/lib/regex/_regex_core.py @@ -0,0 +1,4317 @@ +# +# Secret Labs' Regular Expression Engine core module +# +# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved. +# +# This version of the SRE library can be redistributed under CNRI's +# Python 1.6 license. For any other use, please contact Secret Labs +# AB (info@pythonware.com). +# +# Portions of this engine have been developed in cooperation with +# CNRI. Hewlett-Packard provided funding for 1.6 integration and +# other compatibility work. 
+# +# 2010-01-16 mrab Python front-end re-written and extended + +import string +import sys +import unicodedata +from collections import defaultdict + +import _regex + +__all__ = ["A", "ASCII", "B", "BESTMATCH", "D", "DEBUG", "E", "ENHANCEMATCH", + "F", "FULLCASE", "I", "IGNORECASE", "L", "LOCALE", "M", "MULTILINE", "P", + "POSIX", "R", "REVERSE", "S", "DOTALL", "T", "TEMPLATE", "U", "UNICODE", + "V0", "VERSION0", "V1", "VERSION1", "W", "WORD", "X", "VERBOSE", "error", + "Scanner"] + +# The regex exception. +class error(Exception): + def __init__(self, message, pattern=None, pos=None): + newline = u'\n' if isinstance(pattern, unicode) else '\n' + self.msg = message + self.pattern = pattern + self.pos = pos + if pattern is not None and pos is not None: + self.lineno = pattern.count(newline, 0, pos) + 1 + self.colno = pos - pattern.rfind(newline, 0, pos) + + message = "%s at position %d" % (message, pos) + + if newline in pattern: + message += " (line %d, column %d)" % (self.lineno, self.colno) + + Exception.__init__(self, message) + +# The exception for when a positional flag has been turned on in the old +# behaviour. +class _UnscopedFlagSet(Exception): + pass + +# The exception for when parsing fails and we want to try something else. +class ParseError(Exception): + pass + +# The exception for when there isn't a valid first set. +class _FirstSetError(Exception): + pass + +# Flags. +A = ASCII = 0x80 # Assume ASCII locale. +B = BESTMATCH = 0x1000 # Best fuzzy match. +D = DEBUG = 0x200 # Print parsed pattern. +E = ENHANCEMATCH = 0x8000 # Attempt to improve the fit after finding the first + # fuzzy match. +F = FULLCASE = 0x4000 # Unicode full case-folding. +I = IGNORECASE = 0x2 # Ignore case. +L = LOCALE = 0x4 # Assume current 8-bit locale. +M = MULTILINE = 0x8 # Make anchors look for newline. +P = POSIX = 0x10000 # POSIX-style matching (leftmost longest). +R = REVERSE = 0x400 # Search backwards. +S = DOTALL = 0x10 # Make dot match newline. 
+U = UNICODE = 0x20 # Assume Unicode locale. +V0 = VERSION0 = 0x2000 # Old legacy behaviour. +V1 = VERSION1 = 0x100 # New enhanced behaviour. +W = WORD = 0x800 # Default Unicode word breaks. +X = VERBOSE = 0x40 # Ignore whitespace and comments. +T = TEMPLATE = 0x1 # Template (present because re module has it). + +DEFAULT_VERSION = VERSION1 + +_ALL_VERSIONS = VERSION0 | VERSION1 +_ALL_ENCODINGS = ASCII | LOCALE | UNICODE + +# The default flags for the various versions. +DEFAULT_FLAGS = {VERSION0: 0, VERSION1: FULLCASE} + +# The mask for the flags. +GLOBAL_FLAGS = (_ALL_ENCODINGS | _ALL_VERSIONS | BESTMATCH | DEBUG | + ENHANCEMATCH | POSIX | REVERSE) +SCOPED_FLAGS = FULLCASE | IGNORECASE | MULTILINE | DOTALL | WORD | VERBOSE + +ALPHA = frozenset(string.ascii_letters) +DIGITS = frozenset(string.digits) +ALNUM = ALPHA | DIGITS +OCT_DIGITS = frozenset(string.octdigits) +HEX_DIGITS = frozenset(string.hexdigits) +SPECIAL_CHARS = frozenset("()|?*+{^$.[\\#") | frozenset([""]) +NAMED_CHAR_PART = ALNUM | frozenset(" -") +PROPERTY_NAME_PART = ALNUM | frozenset(" &_-.") +SET_OPS = ("||", "~~", "&&", "--") + +# The width of the code words inside the regex engine. +BYTES_PER_CODE = _regex.get_code_size() +BITS_PER_CODE = BYTES_PER_CODE * 8 + +# The repeat count which represents infinity. +UNLIMITED = (1 << BITS_PER_CODE) - 1 + +# The regular expression flags. +REGEX_FLAGS = {"a": ASCII, "b": BESTMATCH, "e": ENHANCEMATCH, "f": FULLCASE, + "i": IGNORECASE, "L": LOCALE, "m": MULTILINE, "p": POSIX, "r": REVERSE, + "s": DOTALL, "u": UNICODE, "V0": VERSION0, "V1": VERSION1, "w": WORD, "x": + VERBOSE} + +# The case flags. +CASE_FLAGS = FULLCASE | IGNORECASE +NOCASE = 0 +FULLIGNORECASE = FULLCASE | IGNORECASE + +FULL_CASE_FOLDING = UNICODE | FULLIGNORECASE + +CASE_FLAGS_COMBINATIONS = {0: 0, FULLCASE: 0, IGNORECASE: IGNORECASE, + FULLIGNORECASE: FULLIGNORECASE} + +# The number of digits in hexadecimal escapes. 
+HEX_ESCAPES = {"x": 2, "u": 4, "U": 8} + +# A singleton which indicates a comment within a pattern. +COMMENT = object() +FLAGS = object() + +# The names of the opcodes. +OPCODES = """ +FAILURE +SUCCESS +ANY +ANY_ALL +ANY_ALL_REV +ANY_REV +ANY_U +ANY_U_REV +ATOMIC +BOUNDARY +BRANCH +CALL_REF +CHARACTER +CHARACTER_IGN +CHARACTER_IGN_REV +CHARACTER_REV +CONDITIONAL +DEFAULT_BOUNDARY +DEFAULT_END_OF_WORD +DEFAULT_START_OF_WORD +END +END_OF_LINE +END_OF_LINE_U +END_OF_STRING +END_OF_STRING_LINE +END_OF_STRING_LINE_U +END_OF_WORD +FUZZY +GRAPHEME_BOUNDARY +GREEDY_REPEAT +GROUP +GROUP_CALL +GROUP_EXISTS +KEEP +LAZY_REPEAT +LOOKAROUND +NEXT +PROPERTY +PROPERTY_IGN +PROPERTY_IGN_REV +PROPERTY_REV +PRUNE +RANGE +RANGE_IGN +RANGE_IGN_REV +RANGE_REV +REF_GROUP +REF_GROUP_FLD +REF_GROUP_FLD_REV +REF_GROUP_IGN +REF_GROUP_IGN_REV +REF_GROUP_REV +SEARCH_ANCHOR +SET_DIFF +SET_DIFF_IGN +SET_DIFF_IGN_REV +SET_DIFF_REV +SET_INTER +SET_INTER_IGN +SET_INTER_IGN_REV +SET_INTER_REV +SET_SYM_DIFF +SET_SYM_DIFF_IGN +SET_SYM_DIFF_IGN_REV +SET_SYM_DIFF_REV +SET_UNION +SET_UNION_IGN +SET_UNION_IGN_REV +SET_UNION_REV +SKIP +START_OF_LINE +START_OF_LINE_U +START_OF_STRING +START_OF_WORD +STRING +STRING_FLD +STRING_FLD_REV +STRING_IGN +STRING_IGN_REV +STRING_REV +STRING_SET +STRING_SET_FLD +STRING_SET_FLD_REV +STRING_SET_IGN +STRING_SET_IGN_REV +STRING_SET_REV +""" + +# Define the opcodes in a namespace. +class Namespace(object): + pass + +OP = Namespace() +for i, op in enumerate(OPCODES.split()): + setattr(OP, op, i) + +def _shrink_cache(cache_dict, args_dict, locale_sensitive, max_length, divisor=5): + """Make room in the given cache. + + Args: + cache_dict: The cache dictionary to modify. + args_dict: The dictionary of named list args used by patterns. + max_length: Maximum # of entries in cache_dict before it is shrunk. + divisor: Cache will shrink to max_length - 1/divisor*max_length items. + """ + # Toss out a fraction of the entries at random to make room for new ones. 
+ # A random algorithm was chosen as opposed to simply cache_dict.popitem() + # as popitem could penalize the same regular expression repeatedly based + # on its internal hash value. Being random should spread the cache miss + # love around. + cache_keys = tuple(cache_dict.keys()) + overage = len(cache_keys) - max_length + if overage < 0: + # Cache is already within limits. Normally this should not happen + # but it could due to multithreading. + return + + number_to_toss = max_length // divisor + overage + + # The import is done here to avoid a circular dependency. + import random + if not hasattr(random, 'sample'): + # Do nothing while resolving the circular dependency: + # re->random->warnings->tokenize->string->re + return + + for doomed_key in random.sample(cache_keys, number_to_toss): + try: + del cache_dict[doomed_key] + except KeyError: + # Ignore problems if the cache changed from another thread. + pass + + # Rebuild the arguments and locale-sensitivity dictionaries. + args_dict.clear() + sensitivity_dict = {} + for pattern, pattern_type, flags, args, default_version, locale in tuple(cache_dict): + args_dict[pattern, pattern_type, flags, default_version, locale] = args + try: + sensitivity_dict[pattern_type, pattern] = locale_sensitive[pattern_type, pattern] + except KeyError: + pass + + locale_sensitive.clear() + locale_sensitive.update(sensitivity_dict) + +def _fold_case(info, string): + "Folds the case of a string." + flags = info.flags + if (flags & _ALL_ENCODINGS) == 0: + flags |= info.guess_encoding + + return _regex.fold_case(flags, string) + +def is_cased(info, char): + "Checks whether a character is cased." + return len(_regex.get_all_cases(info.flags, char)) > 1 + +def _compile_firstset(info, fs): + "Compiles the firstset for the pattern." + if not fs or None in fs: + return [] + + # If we ignore the case, for simplicity we won't build a firstset. 
+ members = set() + for i in fs: + if isinstance(i, Character) and not i.positive: + return [] + + if i.case_flags: + if isinstance(i, Character): + if is_cased(info, i.value): + return [] + elif isinstance(i, SetBase): + return [] + + members.add(i.with_flags(case_flags=NOCASE)) + + # Build the firstset. + fs = SetUnion(info, list(members), zerowidth=True) + fs = fs.optimise(info, in_set=True) + + # Compile the firstset. + return fs.compile(bool(info.flags & REVERSE)) + +def _flatten_code(code): + "Flattens the code from a list of tuples." + flat_code = [] + for c in code: + flat_code.extend(c) + + return flat_code + +def make_character(info, value, in_set=False): + "Makes a character literal." + if in_set: + # A character set is built case-sensitively. + return Character(value) + + return Character(value, case_flags=info.flags & CASE_FLAGS) + +def make_ref_group(info, name, position): + "Makes a group reference." + return RefGroup(info, name, position, case_flags=info.flags & CASE_FLAGS) + +def make_string_set(info, name): + "Makes a string set." + return StringSet(info, name, case_flags=info.flags & CASE_FLAGS) + +def make_property(info, prop, in_set): + "Makes a property." + if in_set: + return prop + + return prop.with_flags(case_flags=info.flags & CASE_FLAGS) + +def _parse_pattern(source, info): + "Parses a pattern, eg. 'a|b|c'." + branches = [parse_sequence(source, info)] + while source.match("|"): + branches.append(parse_sequence(source, info)) + + if len(branches) == 1: + return branches[0] + return Branch(branches) + +def parse_sequence(source, info): + "Parses a sequence, eg. 'abc'." + sequence = [] + applied = False + while True: + # Get literal characters followed by an element. + characters, case_flags, element = parse_literal_and_element(source, + info) + if not element: + # No element, just a literal. We've also reached the end of the + # sequence. 
+ append_literal(characters, case_flags, sequence) + break + + if element is COMMENT or element is FLAGS: + append_literal(characters, case_flags, sequence) + elif type(element) is tuple: + # It looks like we've found a quantifier. + ch, saved_pos = element + + counts = parse_quantifier(source, info, ch) + if counts: + # It _is_ a quantifier. + apply_quantifier(source, info, counts, characters, case_flags, + ch, saved_pos, applied, sequence) + applied = True + else: + # It's not a quantifier. Maybe it's a fuzzy constraint. + constraints = parse_fuzzy(source, ch) + if constraints: + # It _is_ a fuzzy constraint. + apply_constraint(source, info, constraints, characters, + case_flags, saved_pos, applied, sequence) + applied = True + else: + # The element was just a literal. + characters.append(ord(ch)) + append_literal(characters, case_flags, sequence) + applied = False + else: + # We have a literal followed by something else. + append_literal(characters, case_flags, sequence) + sequence.append(element) + applied = False + + return make_sequence(sequence) + +def apply_quantifier(source, info, counts, characters, case_flags, ch, + saved_pos, applied, sequence): + if characters: + # The quantifier applies to the last character. + append_literal(characters[ : -1], case_flags, sequence) + element = Character(characters[-1], case_flags=case_flags) + else: + # The quantifier applies to the last item in the sequence. + if applied: + raise error("multiple repeat", source.string, saved_pos) + + if not sequence: + raise error("nothing to repeat", source.string, saved_pos) + + element = sequence.pop() + + min_count, max_count = counts + saved_pos = source.pos + ch = source.get() + if ch == "?": + # The "?" suffix that means it's a lazy repeat. + repeated = LazyRepeat + elif ch == "+": + # The "+" suffix that means it's a possessive repeat. + repeated = PossessiveRepeat + else: + # No suffix means that it's a greedy repeat. 
+ source.pos = saved_pos + repeated = GreedyRepeat + + # Ignore the quantifier if it applies to a zero-width item or the number of + # repeats is fixed at 1. + if not element.is_empty() and (min_count != 1 or max_count != 1): + element = repeated(element, min_count, max_count) + + sequence.append(element) + +def apply_constraint(source, info, constraints, characters, case_flags, + saved_pos, applied, sequence): + if characters: + # The constraint applies to the last character. + append_literal(characters[ : -1], case_flags, sequence) + element = Character(characters[-1], case_flags=case_flags) + sequence.append(Fuzzy(element, constraints)) + else: + # The constraint applies to the last item in the sequence. + if applied or not sequence: + raise error("nothing for fuzzy constraint", source.string, + saved_pos) + + element = sequence.pop() + + # If a group is marked as fuzzy then put all of the fuzzy part in the + # group. + if isinstance(element, Group): + element.subpattern = Fuzzy(element.subpattern, constraints) + sequence.append(element) + else: + sequence.append(Fuzzy(element, constraints)) + +def append_literal(characters, case_flags, sequence): + if characters: + sequence.append(Literal(characters, case_flags=case_flags)) + +def PossessiveRepeat(element, min_count, max_count): + "Builds a possessive repeat." + return Atomic(GreedyRepeat(element, min_count, max_count)) + +_QUANTIFIERS = {"?": (0, 1), "*": (0, None), "+": (1, None)} + +def parse_quantifier(source, info, ch): + "Parses a quantifier." + q = _QUANTIFIERS.get(ch) + if q: + # It's a quantifier. + return q + + if ch == "{": + # Looks like a limited repeated element, eg. 'a{2,3}'. + counts = parse_limited_quantifier(source) + if counts: + return counts + + return None + +def is_above_limit(count): + "Checks whether a count is above the maximum." + return count is not None and count >= UNLIMITED + +def parse_limited_quantifier(source): + "Parses a limited quantifier." 
+ saved_pos = source.pos + min_count = parse_count(source) + if source.match(","): + max_count = parse_count(source) + + # No minimum means 0 and no maximum means unlimited. + min_count = int(min_count or 0) + max_count = int(max_count) if max_count else None + + if max_count is not None and min_count > max_count: + raise error("min repeat greater than max repeat", source.string, + saved_pos) + else: + if not min_count: + source.pos = saved_pos + return None + + min_count = max_count = int(min_count) + + if is_above_limit(min_count) or is_above_limit(max_count): + raise error("repeat count too big", source.string, saved_pos) + + if not source.match ("}"): + source.pos = saved_pos + return None + + return min_count, max_count + +def parse_fuzzy(source, ch): + "Parses a fuzzy setting, if present." + if ch != "{": + return None + + saved_pos = source.pos + + constraints = {} + try: + parse_fuzzy_item(source, constraints) + while source.match(","): + parse_fuzzy_item(source, constraints) + except ParseError: + source.pos = saved_pos + return None + + if not source.match("}"): + raise error("expected }", source.string, source.pos) + + return constraints + +def parse_fuzzy_item(source, constraints): + "Parses a fuzzy setting item." + saved_pos = source.pos + try: + parse_cost_constraint(source, constraints) + except ParseError: + source.pos = saved_pos + + parse_cost_equation(source, constraints) + +def parse_cost_constraint(source, constraints): + "Parses a cost constraint." + saved_pos = source.pos + ch = source.get() + if ch in ALPHA: + # Syntax: constraint [("<=" | "<") cost] + constraint = parse_constraint(source, constraints, ch) + + max_inc = parse_fuzzy_compare(source) + + if max_inc is None: + # No maximum cost. + constraints[constraint] = 0, None + else: + # There's a maximum cost. + cost_pos = source.pos + max_cost = int(parse_count(source)) + + # Inclusive or exclusive limit? 
+ if not max_inc: + max_cost -= 1 + + if max_cost < 0: + raise error("bad fuzzy cost limit", source.string, cost_pos) + + constraints[constraint] = 0, max_cost + elif ch in DIGITS: + # Syntax: cost ("<=" | "<") constraint ("<=" | "<") cost + source.pos = saved_pos + try: + # Minimum cost. + min_cost = int(parse_count(source)) + + min_inc = parse_fuzzy_compare(source) + if min_inc is None: + raise ParseError() + + constraint = parse_constraint(source, constraints, source.get()) + + max_inc = parse_fuzzy_compare(source) + if max_inc is None: + raise ParseError() + + # Maximum cost. + cost_pos = source.pos + max_cost = int(parse_count(source)) + + # Inclusive or exclusive limits? + if not min_inc: + min_cost += 1 + if not max_inc: + max_cost -= 1 + + if not 0 <= min_cost <= max_cost: + raise error("bad fuzzy cost limit", source.string, cost_pos) + + constraints[constraint] = min_cost, max_cost + except ValueError: + raise ParseError() + else: + raise ParseError() + +def parse_constraint(source, constraints, ch): + "Parses a constraint." + if ch not in "deis": + raise error("bad fuzzy constraint", source.string, source.pos) + + if ch in constraints: + raise error("repeated fuzzy constraint", source.string, source.pos) + + return ch + +def parse_fuzzy_compare(source): + "Parses a cost comparator." + if source.match("<="): + return True + elif source.match("<"): + return False + else: + return None + +def parse_cost_equation(source, constraints): + "Parses a cost equation." 
+ if "cost" in constraints: + raise error("more than one cost equation", source.string, source.pos) + + cost = {} + + parse_cost_term(source, cost) + while source.match("+"): + parse_cost_term(source, cost) + + max_inc = parse_fuzzy_compare(source) + if max_inc is None: + raise error("missing fuzzy cost limit", source.string, source.pos) + + max_cost = int(parse_count(source)) + + if not max_inc: + max_cost -= 1 + + if max_cost < 0: + raise error("bad fuzzy cost limit", source.string, source.pos) + + cost["max"] = max_cost + + constraints["cost"] = cost + +def parse_cost_term(source, cost): + "Parses a cost equation term." + coeff = parse_count(source) + ch = source.get() + if ch not in "dis": + raise ParseError() + + if ch in cost: + raise error("repeated fuzzy cost", source.string, source.pos) + + cost[ch] = int(coeff or 1) + +def parse_count(source): + "Parses a quantifier's count, which can be empty." + return source.get_while(DIGITS) + +def parse_literal_and_element(source, info): + """Parses a literal followed by an element. The element is FLAGS if it's an + inline flag or None if it has reached the end of a sequence. + """ + characters = [] + case_flags = info.flags & CASE_FLAGS + while True: + saved_pos = source.pos + ch = source.get() + if ch in SPECIAL_CHARS: + if ch in ")|": + # The end of a sequence. At the end of the pattern ch is "". + source.pos = saved_pos + return characters, case_flags, None + elif ch == "\\": + # An escape sequence outside a set. + element = parse_escape(source, info, False) + return characters, case_flags, element + elif ch == "(": + # A parenthesised subpattern or a flag. + element = parse_paren(source, info) + if element and element is not COMMENT: + return characters, case_flags, element + elif ch == ".": + # Any character. + if info.flags & DOTALL: + element = AnyAll() + elif info.flags & WORD: + element = AnyU() + else: + element = Any() + + return characters, case_flags, element + elif ch == "[": + # A character set. 
+ element = parse_set(source, info) + return characters, case_flags, element + elif ch == "^": + # The start of a line or the string. + if info.flags & MULTILINE: + if info.flags & WORD: + element = StartOfLineU() + else: + element = StartOfLine() + else: + element = StartOfString() + + return characters, case_flags, element + elif ch == "$": + # The end of a line or the string. + if info.flags & MULTILINE: + if info.flags & WORD: + element = EndOfLineU() + else: + element = EndOfLine() + else: + if info.flags & WORD: + element = EndOfStringLineU() + else: + element = EndOfStringLine() + + return characters, case_flags, element + elif ch in "?*+{": + # Looks like a quantifier. + return characters, case_flags, (ch, saved_pos) + else: + # A literal. + characters.append(ord(ch)) + else: + # A literal. + characters.append(ord(ch)) + +def parse_paren(source, info): + """Parses a parenthesised subpattern or a flag. Returns FLAGS if it's an + inline flag. + """ + saved_pos = source.pos + ch = source.get() + if ch == "?": + # (?... + saved_pos_2 = source.pos + ch = source.get() + if ch == "<": + # (?<... + saved_pos_3 = source.pos + ch = source.get() + if ch in ("=", "!"): + # (?<=... or (?<!...: lookbehind. + return parse_lookaround(source, info, True, ch == "=") + + # (?<...: a named capture group. + source.pos = saved_pos_3 + name = parse_name(source) + group = info.open_group(name) + source.expect(">") + saved_flags = info.flags + try: + subpattern = _parse_pattern(source, info) + source.expect(")") + finally: + info.flags = saved_flags + source.ignore_space = bool(info.flags & VERBOSE) + + info.close_group() + return Group(info, group, subpattern) + if ch in ("=", "!"): + # (?=... or (?!...: lookahead. + return parse_lookaround(source, info, False, ch == "=") + if ch == "P": + # (?P...: a Python extension. + return parse_extension(source, info) + if ch == "#": + # (?#...: a comment. + return parse_comment(source) + if ch == "(": + # (?(...: a conditional subpattern. 
+ return parse_conditional(source, info) + if ch == ">": + # (?>...: an atomic subpattern. + return parse_atomic(source, info) + if ch == "|": + # (?|...: a common/reset groups branch. + return parse_common(source, info) + if ch == "R" or "0" <= ch <= "9": + # (?R...: probably a call to a group. + return parse_call_group(source, info, ch, saved_pos_2) + if ch == "&": + # (?&...: a call to a named group. + return parse_call_named_group(source, info, saved_pos_2) + + # (?...: probably a flags subpattern. + source.pos = saved_pos_2 + return parse_flags_subpattern(source, info) + + if ch == "*": + # (*... + saved_pos_2 = source.pos + word = source.get_while(set(")>"), include=False) + if word[ : 1].isalpha(): + verb = VERBS.get(word) + if not verb: + raise error("unknown verb", source.string, saved_pos_2) + + source.expect(")") + + return verb + + # (...: an unnamed capture group. + source.pos = saved_pos + group = info.open_group() + saved_flags = info.flags + try: + subpattern = _parse_pattern(source, info) + source.expect(")") + finally: + info.flags = saved_flags + source.ignore_space = bool(info.flags & VERBOSE) + + info.close_group() + + return Group(info, group, subpattern) + +def parse_extension(source, info): + "Parses a Python extension." + saved_pos = source.pos + ch = source.get() + if ch == "<": + # (?P<...: a named capture group. + name = parse_name(source) + group = info.open_group(name) + source.expect(">") + saved_flags = info.flags + try: + subpattern = _parse_pattern(source, info) + source.expect(")") + finally: + info.flags = saved_flags + source.ignore_space = bool(info.flags & VERBOSE) + + info.close_group() + + return Group(info, group, subpattern) + if ch == "=": + # (?P=...: a named group reference. 
+ name = parse_name(source, allow_numeric=True) + source.expect(")") + if info.is_open_group(name): + raise error("cannot refer to an open group", source.string, + saved_pos) + + return make_ref_group(info, name, saved_pos) + if ch == ">" or ch == "&": + # (?P>...: a call to a group. + return parse_call_named_group(source, info, saved_pos) + + source.pos = saved_pos + raise error("unknown extension", source.string, saved_pos) + +def parse_comment(source): + "Parses a comment." + source.skip_while(set(")"), include=False) + source.expect(")") + + return COMMENT + +def parse_lookaround(source, info, behind, positive): + "Parses a lookaround." + saved_flags = info.flags + try: + subpattern = _parse_pattern(source, info) + source.expect(")") + finally: + info.flags = saved_flags + source.ignore_space = bool(info.flags & VERBOSE) + + return LookAround(behind, positive, subpattern) + +def parse_conditional(source, info): + "Parses a conditional subpattern." + saved_flags = info.flags + saved_pos = source.pos + ch = source.get() + if ch == "?": + # (?(?... + ch = source.get() + if ch in ("=", "!"): + # (?(?=... or (?(?!...: lookahead conditional. + return parse_lookaround_conditional(source, info, False, ch == "=") + if ch == "<": + # (?(?<... + ch = source.get() + if ch in ("=", "!"): + # (?(?<=... or (?(?<!...: lookbehind conditional. 
+ return parse_lookaround_conditional(source, info, True, ch == + "=") + + source.pos = saved_pos + raise error("expected lookaround conditional", source.string, + source.pos) + + source.pos = saved_pos + try: + group = parse_name(source, True) + source.expect(")") + yes_branch = parse_sequence(source, info) + if source.match("|"): + no_branch = parse_sequence(source, info) + else: + no_branch = Sequence() + + source.expect(")") + finally: + info.flags = saved_flags + source.ignore_space = bool(info.flags & VERBOSE) + + if yes_branch.is_empty() and no_branch.is_empty(): + return Sequence() + + return Conditional(info, group, yes_branch, no_branch, saved_pos) + +def parse_lookaround_conditional(source, info, behind, positive): + saved_flags = info.flags + try: + subpattern = _parse_pattern(source, info) + source.expect(")") + finally: + info.flags = saved_flags + source.ignore_space = bool(info.flags & VERBOSE) + + yes_branch = parse_sequence(source, info) + if source.match("|"): + no_branch = parse_sequence(source, info) + else: + no_branch = Sequence() + + source.expect(")") + + return LookAroundConditional(behind, positive, subpattern, yes_branch, + no_branch) + +def parse_atomic(source, info): + "Parses an atomic subpattern." + saved_flags = info.flags + try: + subpattern = _parse_pattern(source, info) + source.expect(")") + finally: + info.flags = saved_flags + source.ignore_space = bool(info.flags & VERBOSE) + + return Atomic(subpattern) + +def parse_common(source, info): + "Parses a common groups branch." + # Capture group numbers in different branches can reuse the group numbers. 
+ initial_group_count = info.group_count + branches = [parse_sequence(source, info)] + final_group_count = info.group_count + while source.match("|"): + info.group_count = initial_group_count + branches.append(parse_sequence(source, info)) + final_group_count = max(final_group_count, info.group_count) + + info.group_count = final_group_count + source.expect(")") + + if len(branches) == 1: + return branches[0] + return Branch(branches) + +def parse_call_group(source, info, ch, pos): + "Parses a call to a group." + if ch == "R": + group = "0" + else: + group = ch + source.get_while(DIGITS) + + source.expect(")") + + return CallGroup(info, group, pos) + +def parse_call_named_group(source, info, pos): + "Parses a call to a named group." + group = parse_name(source) + source.expect(")") + + return CallGroup(info, group, pos) + +def parse_flag_set(source): + "Parses a set of inline flags." + flags = 0 + + try: + while True: + saved_pos = source.pos + ch = source.get() + if ch == "V": + ch += source.get() + flags |= REGEX_FLAGS[ch] + except KeyError: + source.pos = saved_pos + + return flags + +def parse_flags(source, info): + "Parses flags being turned on/off." + flags_on = parse_flag_set(source) + if source.match("-"): + flags_off = parse_flag_set(source) + if not flags_off: + raise error("bad inline flags: no flags after '-'", source.string, + source.pos) + else: + flags_off = 0 + + if flags_on & LOCALE: + # Remember that this pattern as an inline locale flag. + info.inline_locale = True + + return flags_on, flags_off + +def parse_subpattern(source, info, flags_on, flags_off): + "Parses a subpattern with scoped flags." 
+ saved_flags = info.flags + info.flags = (info.flags | flags_on) & ~flags_off + source.ignore_space = bool(info.flags & VERBOSE) + try: + subpattern = _parse_pattern(source, info) + source.expect(")") + finally: + info.flags = saved_flags + source.ignore_space = bool(info.flags & VERBOSE) + + return subpattern + +def parse_flags_subpattern(source, info): + """Parses a flags subpattern. It could be inline flags or a subpattern + possibly with local flags. If it's a subpattern, then that's returned; + if it's a inline flags, then FLAGS is returned. + """ + flags_on, flags_off = parse_flags(source, info) + + if flags_off & GLOBAL_FLAGS: + raise error("bad inline flags: cannot turn off global flag", + source.string, source.pos) + + if flags_on & flags_off: + raise error("bad inline flags: flag turned on and off", source.string, + source.pos) + + # Handle flags which are global in all regex behaviours. + new_global_flags = (flags_on & ~info.global_flags) & GLOBAL_FLAGS + if new_global_flags: + info.global_flags |= new_global_flags + + # A global has been turned on, so reparse the pattern. + raise _UnscopedFlagSet(info.global_flags) + + # Ensure that from now on we have only scoped flags. + flags_on &= ~GLOBAL_FLAGS + + if source.match(":"): + return parse_subpattern(source, info, flags_on, flags_off) + + if source.match(")"): + parse_positional_flags(source, info, flags_on, flags_off) + return FLAGS + + raise error("unknown extension", source.string, source.pos) + +def parse_positional_flags(source, info, flags_on, flags_off): + "Parses positional flags." + version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION + if version == VERSION0: + # Positional flags are global and can only be turned on. + if flags_off: + raise error("bad inline flags: cannot turn flags off", + source.string, source.pos) + + new_global_flags = flags_on & ~info.global_flags + if new_global_flags: + info.global_flags |= new_global_flags + + # A global has been turned on, so reparse the pattern. 
+ raise _UnscopedFlagSet(info.global_flags) + else: + info.flags = (info.flags | flags_on) & ~flags_off + + source.ignore_space = bool(info.flags & VERBOSE) + +def parse_name(source, allow_numeric=False, allow_group_0=False): + "Parses a name." + name = source.get_while(set(")>"), include=False) + + if not name: + raise error("missing group name", source.string, source.pos) + + if name.isdigit(): + min_group = 0 if allow_group_0 else 1 + if not allow_numeric or int(name) < min_group: + raise error("bad character in group name", source.string, + source.pos) + else: + if not is_identifier(name): + raise error("bad character in group name", source.string, + source.pos) + + return name + +def is_identifier(name): + if not name: + return False + + if name[0] not in ALPHA and name[0] != "_": + return False + + name = name.replace("_", "") + + return not name or all(c in ALNUM for c in name) + +def is_octal(string): + "Checks whether a string is octal." + return all(ch in OCT_DIGITS for ch in string) + +def is_decimal(string): + "Checks whether a string is decimal." + return all(ch in DIGITS for ch in string) + +def is_hexadecimal(string): + "Checks whether a string is hexadecimal." + return all(ch in HEX_DIGITS for ch in string) + +def parse_escape(source, info, in_set): + "Parses an escape sequence." + saved_ignore = source.ignore_space + source.ignore_space = False + ch = source.get() + source.ignore_space = saved_ignore + if not ch: + # A backslash at the end of the pattern. + raise error("bad escape (end of pattern)", source.string, source.pos) + if ch in HEX_ESCAPES: + # A hexadecimal escape sequence. + return parse_hex_escape(source, info, HEX_ESCAPES[ch], in_set, ch) + elif ch == "g" and not in_set: + # A group reference. + saved_pos = source.pos + try: + return parse_group_ref(source, info) + except error: + # Invalid as a group reference, so assume it's a literal. 
+ source.pos = saved_pos + + return make_character(info, ord(ch), in_set) + elif ch == "G" and not in_set: + # A search anchor. + return SearchAnchor() + elif ch == "L" and not in_set: + # A string set. + return parse_string_set(source, info) + elif ch == "N": + # A named codepoint. + return parse_named_char(source, info, in_set) + elif ch in "pP": + # A Unicode property, positive or negative. + return parse_property(source, info, ch == "p", in_set) + elif ch == "X" and not in_set: + # A grapheme cluster. + return Grapheme() + elif ch in ALPHA: + # An alphabetic escape sequence. + # Positional escapes aren't allowed inside a character set. + if not in_set: + if info.flags & WORD: + value = WORD_POSITION_ESCAPES.get(ch) + else: + value = POSITION_ESCAPES.get(ch) + + if value: + return value + + value = CHARSET_ESCAPES.get(ch) + if value: + return value + + value = CHARACTER_ESCAPES.get(ch) + if value: + return Character(ord(value)) + + return make_character(info, ord(ch), in_set) + elif ch in DIGITS: + # A numeric escape sequence. + return parse_numeric_escape(source, info, ch, in_set) + else: + # A literal. + return make_character(info, ord(ch), in_set) + +def parse_numeric_escape(source, info, ch, in_set): + "Parses a numeric escape sequence." + if in_set or ch == "0": + # Octal escape sequence, max 3 digits. + return parse_octal_escape(source, info, [ch], in_set) + + # At least 1 digit, so either octal escape or group. + digits = ch + saved_pos = source.pos + ch = source.get() + if ch in DIGITS: + # At least 2 digits, so either octal escape or group. + digits += ch + saved_pos = source.pos + ch = source.get() + if is_octal(digits) and ch in OCT_DIGITS: + # 3 octal digits, so octal escape sequence. + encoding = info.flags & _ALL_ENCODINGS + if encoding == ASCII or encoding == LOCALE: + octal_mask = 0xFF + else: + octal_mask = 0x1FF + + value = int(digits + ch, 8) & octal_mask + return make_character(info, value) + + # Group reference. 
+ source.pos = saved_pos + if info.is_open_group(digits): + raise error("cannot refer to an open group", source.string, source.pos) + + return make_ref_group(info, digits, source.pos) + +def parse_octal_escape(source, info, digits, in_set): + "Parses an octal escape sequence." + saved_pos = source.pos + ch = source.get() + while len(digits) < 3 and ch in OCT_DIGITS: + digits.append(ch) + saved_pos = source.pos + ch = source.get() + + source.pos = saved_pos + try: + value = int("".join(digits), 8) + return make_character(info, value, in_set) + except ValueError: + if digits[0] in OCT_DIGITS: + raise error("incomplete escape \\%s" % ''.join(digits), + source.string, source.pos) + else: + raise error("bad escape \\%s" % digits[0], source.string, + source.pos) + +def parse_hex_escape(source, info, expected_len, in_set, type): + "Parses a hex escape sequence." + digits = [] + for i in range(expected_len): + ch = source.get() + if ch not in HEX_DIGITS: + raise error("incomplete escape \\%s%s" % (type, ''.join(digits)), + source.string, source.pos) + digits.append(ch) + + value = int("".join(digits), 16) + return make_character(info, value, in_set) + +def parse_group_ref(source, info): + "Parses a group reference." + source.expect("<") + saved_pos = source.pos + name = parse_name(source, True) + source.expect(">") + if info.is_open_group(name): + raise error("cannot refer to an open group", source.string, source.pos) + + return make_ref_group(info, name, saved_pos) + +def parse_string_set(source, info): + "Parses a string set reference." + source.expect("<") + name = parse_name(source, True) + source.expect(">") + if name is None or name not in info.kwargs: + raise error("undefined named list", source.string, source.pos) + + return make_string_set(info, name) + +def parse_named_char(source, info, in_set): + "Parses a named character." 
+ saved_pos = source.pos + if source.match("{"): + name = source.get_while(NAMED_CHAR_PART) + if source.match("}"): + try: + value = unicodedata.lookup(name) + return make_character(info, ord(value), in_set) + except KeyError: + raise error("undefined character name", source.string, + source.pos) + + source.pos = saved_pos + return make_character(info, ord("N"), in_set) + +def parse_property(source, info, positive, in_set): + "Parses a Unicode property." + saved_pos = source.pos + ch = source.get() + if ch == "{": + negate = source.match("^") + prop_name, name = parse_property_name(source) + if source.match("}"): + # It's correctly delimited. + prop = lookup_property(prop_name, name, positive != negate, source) + return make_property(info, prop, in_set) + elif ch and ch in "CLMNPSZ": + # An abbreviated property, eg \pL. + prop = lookup_property(None, ch, positive, source) + return make_property(info, prop, in_set) + + # Not a property, so treat as a literal "p" or "P". + source.pos = saved_pos + ch = "p" if positive else "P" + return make_character(info, ord(ch), in_set) + +def parse_property_name(source): + "Parses a property name, which may be qualified." + name = source.get_while(PROPERTY_NAME_PART) + saved_pos = source.pos + + ch = source.get() + if ch and ch in ":=": + prop_name = name + name = source.get_while(ALNUM | set(" &_-./")).strip() + + if name: + # Name after the ":" or "=", so it's a qualified name. + saved_pos = source.pos + else: + # No name after the ":" or "=", so assume it's an unqualified name. + prop_name, name = None, prop_name + else: + prop_name = None + + source.pos = saved_pos + return prop_name, name + +def parse_set(source, info): + "Parses a character set." + version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION + + saved_ignore = source.ignore_space + source.ignore_space = False + # Negative set? 
+ negate = source.match("^") + try: + if version == VERSION0: + item = parse_set_imp_union(source, info) + else: + item = parse_set_union(source, info) + + if not source.match("]"): + raise error("missing ]", source.string, source.pos) + finally: + source.ignore_space = saved_ignore + + if negate: + item = item.with_flags(positive=not item.positive) + + item = item.with_flags(case_flags=info.flags & CASE_FLAGS) + + return item + +def parse_set_union(source, info): + "Parses a set union ([x||y])." + items = [parse_set_symm_diff(source, info)] + while source.match("||"): + items.append(parse_set_symm_diff(source, info)) + + if len(items) == 1: + return items[0] + return SetUnion(info, items) + +def parse_set_symm_diff(source, info): + "Parses a set symmetric difference ([x~~y])." + items = [parse_set_inter(source, info)] + while source.match("~~"): + items.append(parse_set_inter(source, info)) + + if len(items) == 1: + return items[0] + return SetSymDiff(info, items) + +def parse_set_inter(source, info): + "Parses a set intersection ([x&&y])." + items = [parse_set_diff(source, info)] + while source.match("&&"): + items.append(parse_set_diff(source, info)) + + if len(items) == 1: + return items[0] + return SetInter(info, items) + +def parse_set_diff(source, info): + "Parses a set difference ([x--y])." + items = [parse_set_imp_union(source, info)] + while source.match("--"): + items.append(parse_set_imp_union(source, info)) + + if len(items) == 1: + return items[0] + return SetDiff(info, items) + +def parse_set_imp_union(source, info): + "Parses a set implicit union ([xy])." + version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION + + items = [parse_set_member(source, info)] + while True: + saved_pos = source.pos + if source.match("]"): + # End of the set. + source.pos = saved_pos + break + + if version == VERSION1 and any(source.match(op) for op in SET_OPS): + # The new behaviour has set operators. 
+ source.pos = saved_pos + break + + items.append(parse_set_member(source, info)) + + if len(items) == 1: + return items[0] + return SetUnion(info, items) + +def parse_set_member(source, info): + "Parses a member in a character set." + # Parse a set item. + start = parse_set_item(source, info) + saved_pos1 = source.pos + if (not isinstance(start, Character) or not start.positive or not + source.match("-")): + # It's not the start of a range. + return start + + version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION + + # It looks like the start of a range of characters. + saved_pos2 = source.pos + if version == VERSION1 and source.match("-"): + # It's actually the set difference operator '--', so return the + # character. + source.pos = saved_pos1 + return start + + if source.match("]"): + # We've reached the end of the set, so return both the character and + # hyphen. + source.pos = saved_pos2 + return SetUnion(info, [start, Character(ord("-"))]) + + # Parse a set item. + end = parse_set_item(source, info) + if not isinstance(end, Character) or not end.positive: + # It's not a range, so return the character, hyphen and property. + return SetUnion(info, [start, Character(ord("-")), end]) + + # It _is_ a range. + if start.value > end.value: + raise error("bad character range", source.string, source.pos) + + if start.value == end.value: + return start + + return Range(start.value, end.value) + +def parse_set_item(source, info): + "Parses an item in a character set." + version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION + + if source.match("\\"): + # An escape sequence in a set. + return parse_escape(source, info, True) + + saved_pos = source.pos + if source.match("[:"): + # Looks like a POSIX character class. + try: + return parse_posix_class(source, info) + except ParseError: + # Not a POSIX character class. + source.pos = saved_pos + + if version == VERSION1 and source.match("["): + # It's the start of a nested set. + + # Negative set? 
+ negate = source.match("^") + item = parse_set_union(source, info) + + if not source.match("]"): + raise error("missing ]", source.string, source.pos) + + if negate: + item = item.with_flags(positive=not item.positive) + + return item + + ch = source.get() + if not ch: + raise error("unterminated character set", source.string, source.pos) + + return Character(ord(ch)) + +def parse_posix_class(source, info): + "Parses a POSIX character class." + negate = source.match("^") + prop_name, name = parse_property_name(source) + if not source.match(":]"): + raise ParseError() + + return lookup_property(prop_name, name, not negate, source, posix=True) + +def float_to_rational(flt): + "Converts a float to a rational pair." + int_part = int(flt) + error = flt - int_part + if abs(error) < 0.0001: + return int_part, 1 + + den, num = float_to_rational(1.0 / error) + + return int_part * den + num, den + +def numeric_to_rational(numeric): + "Converts a numeric string to a rational string, if possible." + if numeric[ : 1] == "-": + sign, numeric = numeric[0], numeric[1 : ] + else: + sign = "" + + parts = numeric.split("/") + if len(parts) == 2: + num, den = float_to_rational(float(parts[0]) / float(parts[1])) + elif len(parts) == 1: + num, den = float_to_rational(float(parts[0])) + else: + raise ValueError() + + result = "%s%s/%s" % (sign, num, den) + if result.endswith("/1"): + return result[ : -2] + + return result + +def standardise_name(name): + "Standardises a property or value name." + try: + return numeric_to_rational("".join(name)) + except (ValueError, ZeroDivisionError): + return "".join(ch for ch in name if ch not in "_- ").upper() + +_posix_classes = set('ALNUM DIGIT PUNCT XDIGIT'.split()) + +def lookup_property(property, value, positive, source=None, posix=False): + "Looks up a property." + # Normalise the names (which may still be lists). 
+ property = standardise_name(property) if property else None + value = standardise_name(value) + + if (property, value) == ("GENERALCATEGORY", "ASSIGNED"): + property, value, positive = "GENERALCATEGORY", "UNASSIGNED", not positive + + if posix and not property and value.upper() in _posix_classes: + value = 'POSIX' + value + + if property: + # Both the property and the value are provided. + prop = PROPERTIES.get(property) + if not prop: + if not source: + raise error("unknown property") + + raise error("unknown property", source.string, source.pos) + + prop_id, value_dict = prop + val_id = value_dict.get(value) + if val_id is None: + if not source: + raise error("unknown property value") + + raise error("unknown property value", source.string, source.pos) + + if "YES" in value_dict and val_id == 0: + positive, val_id = not positive, 1 + + return Property((prop_id << 16) | val_id, positive) + + # Only the value is provided. + # It might be the name of a GC, script or block value. + for property in ("GC", "SCRIPT", "BLOCK"): + prop_id, value_dict = PROPERTIES.get(property) + val_id = value_dict.get(value) + if val_id is not None: + return Property((prop_id << 16) | val_id, positive) + + # It might be the name of a binary property. + prop = PROPERTIES.get(value) + if prop: + prop_id, value_dict = prop + + if "YES" in value_dict: + return Property((prop_id << 16) | 1, positive) + + # It might be the name of a binary property starting with a prefix. + if value.startswith("IS"): + prop = PROPERTIES.get(value[2 : ]) + if prop: + prop_id, value_dict = prop + if "YES" in value_dict: + return Property((prop_id << 16) | 1, positive) + + # It might be the name of a script or block starting with a prefix. 
+ for prefix, property in (("IS", "SCRIPT"), ("IN", "BLOCK")): + if value.startswith(prefix): + prop_id, value_dict = PROPERTIES.get(property) + val_id = value_dict.get(value[2 : ]) + if val_id is not None: + return Property((prop_id << 16) | val_id, positive) + + # Unknown property. + if not source: + raise error("unknown property") + + raise error("unknown property", source.string, source.pos) + +def _compile_replacement(source, pattern, is_unicode): + "Compiles a replacement template escape sequence." + ch = source.get() + if ch in ALPHA: + # An alphabetic escape sequence. + value = CHARACTER_ESCAPES.get(ch) + if value: + return False, [ord(value)] + + if ch in HEX_ESCAPES and (ch == "x" or is_unicode): + # A hexadecimal escape sequence. + return False, [parse_repl_hex_escape(source, HEX_ESCAPES[ch], ch)] + + if ch == "g": + # A group preference. + return True, [compile_repl_group(source, pattern)] + + if ch == "N" and is_unicode: + # A named character. + value = parse_repl_named_char(source) + if value is not None: + return False, [value] + + return False, [ord("\\"), ord(ch)] + + if isinstance(source.sep, str): + octal_mask = 0xFF + else: + octal_mask = 0x1FF + + if ch == "0": + # An octal escape sequence. + digits = ch + while len(digits) < 3: + saved_pos = source.pos + ch = source.get() + if ch not in OCT_DIGITS: + source.pos = saved_pos + break + digits += ch + + return False, [int(digits, 8) & octal_mask] + + if ch in DIGITS: + # Either an octal escape sequence (3 digits) or a group reference (max + # 2 digits). + digits = ch + saved_pos = source.pos + ch = source.get() + if ch in DIGITS: + digits += ch + saved_pos = source.pos + ch = source.get() + if ch and is_octal(digits + ch): + # An octal escape sequence. + return False, [int(digits + ch, 8) & octal_mask] + + # A group reference. + source.pos = saved_pos + return True, [int(digits)] + + if ch == "\\": + # An escaped backslash is a backslash. 
+ return False, [ord("\\")] + + if not ch: + # A trailing backslash. + raise error("bad escape (end of pattern)", source.string, source.pos) + + # An escaped non-backslash is a backslash followed by the literal. + return False, [ord("\\"), ord(ch)] + +def parse_repl_hex_escape(source, expected_len, type): + "Parses a hex escape sequence in a replacement string." + digits = [] + for i in range(expected_len): + ch = source.get() + if ch not in HEX_DIGITS: + raise error("incomplete escape \\%s%s" % (type, ''.join(digits)), + source.string, source.pos) + digits.append(ch) + + return int("".join(digits), 16) + +def parse_repl_named_char(source): + "Parses a named character in a replacement string." + saved_pos = source.pos + if source.match("{"): + name = source.get_while(ALPHA | set(" ")) + + if source.match("}"): + try: + value = unicodedata.lookup(name) + return ord(value) + except KeyError: + raise error("undefined character name", source.string, + source.pos) + + source.pos = saved_pos + return None + +def compile_repl_group(source, pattern): + "Compiles a replacement template group reference." + source.expect("<") + name = parse_name(source, True, True) + + source.expect(">") + if name.isdigit(): + index = int(name) + if not 0 <= index <= pattern.groups: + raise error("invalid group reference", source.string, source.pos) + + return index + + try: + return pattern.groupindex[name] + except KeyError: + raise IndexError("unknown group") + +# The regular expression is parsed into a syntax tree. The different types of +# node are defined below. + +INDENT = " " +POSITIVE_OP = 0x1 +ZEROWIDTH_OP = 0x2 +FUZZY_OP = 0x4 +REVERSE_OP = 0x8 +REQUIRED_OP = 0x10 + +POS_TEXT = {False: "NON-MATCH", True: "MATCH"} +CASE_TEXT = {NOCASE: "", IGNORECASE: " SIMPLE_IGNORE_CASE", FULLCASE: "", + FULLIGNORECASE: " FULL_IGNORE_CASE"} + +def make_sequence(items): + if len(items) == 1: + return items[0] + return Sequence(items) + +# Common base class for all nodes. 
class RegexBase(object):
    """Root of the syntax-tree node hierarchy.

    Supplies default answers for the hooks that the node classes in this
    module override (optimisation, packing, capture removal, first-set
    computation, compilation, dumping) and value-style equality based on
    ``_key``.

    Several methods rely on members that are not defined in this class
    (``positive``, ``case_flags``, ``zerowidth``, ``rebuild``, ``_compile``,
    ``_dump``, ``max_width``); presumably each concrete subclass provides
    the ones it needs.
    """

    def __init__(self):
        # The key identifies the node for hashing and comparison; at this
        # level it is just the concrete class.
        self._key = self.__class__

    def with_flags(self, positive=None, case_flags=None, zerowidth=None):
        """Returns a node carrying the requested flags.

        An argument left as None keeps the node's current setting. If
        nothing would change, the node itself is returned; otherwise the
        result of ``self.rebuild(positive, case_flags, zerowidth)`` is.
        """
        new_positive = self.positive if positive is None else bool(positive)

        if case_flags is None:
            new_case_flags = self.case_flags
        else:
            # Mask to the case bits, then map to the normalised combination.
            new_case_flags = CASE_FLAGS_COMBINATIONS[case_flags & CASE_FLAGS]

        new_zerowidth = self.zerowidth if zerowidth is None else bool(zerowidth)

        unchanged = (new_positive == self.positive and
          new_case_flags == self.case_flags and
          new_zerowidth == self.zerowidth)
        if not unchanged:
            return self.rebuild(new_positive, new_case_flags, new_zerowidth)

        return self

    def fix_groups(self, pattern, reverse, fuzzy):
        "Does nothing at this level."
        pass

    def optimise(self, info):
        "Returns the node unchanged."
        return self

    def pack_characters(self, info):
        "Returns the node unchanged."
        return self

    def remove_captures(self):
        "Returns the node unchanged."
        return self

    def is_atomic(self):
        "Default: True."
        return True

    def can_be_affix(self):
        "Default: True."
        return True

    def contains_group(self):
        "Default: False."
        return False

    def get_firstset(self, reverse):
        "Default: no first set is available, so _FirstSetError is raised."
        raise _FirstSetError()

    def has_simple_start(self):
        "Default: False."
        return False

    def compile(self, reverse=False, fuzzy=False):
        "Returns whatever self._compile(reverse, fuzzy) produces."
        return self._compile(reverse, fuzzy)

    def dump(self, indent, reverse):
        "Hands over to self._dump(indent, reverse)."
        self._dump(indent, reverse)

    def is_empty(self):
        "Default: False."
        return False

    def __hash__(self):
        return hash(self._key)

    def __eq__(self, other):
        # Nodes are equal only when they are of exactly the same class and
        # carry equal keys.
        if type(self) is not type(other):
            return False
        return self._key == other._key

    def __ne__(self, other):
        return not self.__eq__(other)

    def get_required_string(self, reverse):
        "Returns the pair (self.max_width(), None)."
        width = self.max_width()
        return width, None

# Base class for zero-width nodes.
+class ZeroWidthBase(RegexBase): + def __init__(self, positive=True): + RegexBase.__init__(self) + self.positive = bool(positive) + + self._key = self.__class__, self.positive + + def get_firstset(self, reverse): + return set([None]) + + def _compile(self, reverse, fuzzy): + flags = 0 + if self.positive: + flags |= POSITIVE_OP + if fuzzy: + flags |= FUZZY_OP + if reverse: + flags |= REVERSE_OP + return [(self._opcode, flags)] + + def _dump(self, indent, reverse): + print "%s%s %s" % (INDENT * indent, self._op_name, + POS_TEXT[self.positive]) + + def max_width(self): + return 0 + +class Any(RegexBase): + _opcode = {False: OP.ANY, True: OP.ANY_REV} + _op_name = "ANY" + + def has_simple_start(self): + return True + + def _compile(self, reverse, fuzzy): + flags = 0 + if fuzzy: + flags |= FUZZY_OP + return [(self._opcode[reverse], flags)] + + def _dump(self, indent, reverse): + print "%s%s" % (INDENT * indent, self._op_name) + + def max_width(self): + return 1 + +class AnyAll(Any): + _opcode = {False: OP.ANY_ALL, True: OP.ANY_ALL_REV} + _op_name = "ANY_ALL" + +class AnyU(Any): + _opcode = {False: OP.ANY_U, True: OP.ANY_U_REV} + _op_name = "ANY_U" + +class Atomic(RegexBase): + def __init__(self, subpattern): + RegexBase.__init__(self) + self.subpattern = subpattern + + def fix_groups(self, pattern, reverse, fuzzy): + self.subpattern.fix_groups(pattern, reverse, fuzzy) + + def optimise(self, info): + self.subpattern = self.subpattern.optimise(info) + + if self.subpattern.is_empty(): + return self.subpattern + return self + + def pack_characters(self, info): + self.subpattern = self.subpattern.pack_characters(info) + return self + + def remove_captures(self): + self.subpattern = self.subpattern.remove_captures() + return self + + def can_be_affix(self): + return self.subpattern.can_be_affix() + + def contains_group(self): + return self.subpattern.contains_group() + + def get_firstset(self, reverse): + return self.subpattern.get_firstset(reverse) + + def 
has_simple_start(self): + return self.subpattern.has_simple_start() + + def _compile(self, reverse, fuzzy): + return ([(OP.ATOMIC, )] + self.subpattern.compile(reverse, fuzzy) + + [(OP.END, )]) + + def _dump(self, indent, reverse): + print "%sATOMIC" % (INDENT * indent) + self.subpattern.dump(indent + 1, reverse) + + def is_empty(self): + return self.subpattern.is_empty() + + def __eq__(self, other): + return (type(self) is type(other) and self.subpattern == + other.subpattern) + + def max_width(self): + return self.subpattern.max_width() + + def get_required_string(self, reverse): + return self.subpattern.get_required_string(reverse) + +class Boundary(ZeroWidthBase): + _opcode = OP.BOUNDARY + _op_name = "BOUNDARY" + +class Branch(RegexBase): + def __init__(self, branches): + RegexBase.__init__(self) + self.branches = branches + + def fix_groups(self, pattern, reverse, fuzzy): + for b in self.branches: + b.fix_groups(pattern, reverse, fuzzy) + + def optimise(self, info): + # Flatten branches within branches. + branches = Branch._flatten_branches(info, self.branches) + + # Move any common prefix or suffix out of the branches. + prefix, branches = Branch._split_common_prefix(info, branches) + + # Try to reduce adjacent single-character branches to sets. 
+ branches = Branch._reduce_to_set(info, branches) + + if len(branches) > 1: + sequence = [Branch(branches)] + else: + sequence = branches + + return make_sequence(prefix + sequence) + + def pack_characters(self, info): + self.branches = [b.pack_characters(info) for b in self.branches] + return self + + def remove_captures(self): + self.branches = [b.remove_captures() for b in self.branches] + return self + + def is_atomic(self): + return all(b.is_atomic() for b in self.branches) + + def can_be_affix(self): + return all(b.can_be_affix() for b in self.branches) + + def contains_group(self): + return any(b.contains_group() for b in self.branches) + + def get_firstset(self, reverse): + fs = set() + for b in self.branches: + fs |= b.get_firstset(reverse) + + return fs or set([None]) + + def _compile(self, reverse, fuzzy): + code = [(OP.BRANCH, )] + for b in self.branches: + code.extend(b.compile(reverse, fuzzy)) + code.append((OP.NEXT, )) + + code[-1] = (OP.END, ) + + return code + + def _dump(self, indent, reverse): + print "%sBRANCH" % (INDENT * indent) + self.branches[0].dump(indent + 1, reverse) + for b in self.branches[1 : ]: + print "%sOR" % (INDENT * indent) + b.dump(indent + 1, reverse) + + @staticmethod + def _flatten_branches(info, branches): + # Flatten the branches so that there aren't branches of branches. + new_branches = [] + for b in branches: + b = b.optimise(info) + if isinstance(b, Branch): + new_branches.extend(b.branches) + else: + new_branches.append(b) + + return new_branches + + @staticmethod + def _split_common_prefix(info, branches): + # Common leading items can be moved out of the branches. + # Get the items in the branches. + alternatives = [] + for b in branches: + if isinstance(b, Sequence): + alternatives.append(b.items) + else: + alternatives.append([b]) + + # What is the maximum possible length of the prefix? + max_count = min(len(a) for a in alternatives) + + # What is the longest common prefix? 
+ prefix = alternatives[0] + pos = 0 + end_pos = max_count + while pos < end_pos and prefix[pos].can_be_affix() and all(a[pos] == + prefix[pos] for a in alternatives): + pos += 1 + count = pos + + if info.flags & UNICODE: + # We need to check that we're not splitting a sequence of + # characters which could form part of full case-folding. + count = pos + while count > 0 and not all(Branch._can_split(a, count) for a in + alternatives): + count -= 1 + + # No common prefix is possible. + if count == 0: + return [], branches + + # Rebuild the branches. + new_branches = [] + for a in alternatives: + new_branches.append(make_sequence(a[count : ])) + + return prefix[ : count], new_branches + + @staticmethod + def _split_common_suffix(info, branches): + # Common trailing items can be moved out of the branches. + # Get the items in the branches. + alternatives = [] + for b in branches: + if isinstance(b, Sequence): + alternatives.append(b.items) + else: + alternatives.append([b]) + + # What is the maximum possible length of the suffix? + max_count = min(len(a) for a in alternatives) + + # What is the longest common suffix? + suffix = alternatives[0] + pos = -1 + end_pos = -1 - max_count + while pos > end_pos and suffix[pos].can_be_affix() and all(a[pos] == + suffix[pos] for a in alternatives): + pos -= 1 + count = -1 - pos + + if info.flags & UNICODE: + # We need to check that we're not splitting a sequence of + # characters which could form part of full case-folding. + while count > 0 and not all(Branch._can_split_rev(a, count) for a + in alternatives): + count -= 1 + + # No common suffix is possible. + if count == 0: + return [], branches + + # Rebuild the branches. + new_branches = [] + for a in alternatives: + new_branches.append(make_sequence(a[ : -count])) + + return suffix[-count : ], new_branches + + @staticmethod + def _can_split(items, count): + # Check the characters either side of the proposed split. 
+ if not Branch._is_full_case(items, count - 1): + return True + + if not Branch._is_full_case(items, count): + return True + + # Check whether a 1-1 split would be OK. + if Branch._is_folded(items[count - 1 : count + 1]): + return False + + # Check whether a 1-2 split would be OK. + if (Branch._is_full_case(items, count + 2) and + Branch._is_folded(items[count - 1 : count + 2])): + return False + + # Check whether a 2-1 split would be OK. + if (Branch._is_full_case(items, count - 2) and + Branch._is_folded(items[count - 2 : count + 1])): + return False + + return True + + @staticmethod + def _can_split_rev(items, count): + end = len(items) + + # Check the characters either side of the proposed split. + if not Branch._is_full_case(items, end - count): + return True + + if not Branch._is_full_case(items, end - count - 1): + return True + + # Check whether a 1-1 split would be OK. + if Branch._is_folded(items[end - count - 1 : end - count + 1]): + return False + + # Check whether a 1-2 split would be OK. + if (Branch._is_full_case(items, end - count + 2) and + Branch._is_folded(items[end - count - 1 : end - count + 2])): + return False + + # Check whether a 2-1 split would be OK. + if (Branch._is_full_case(items, end - count - 2) and + Branch._is_folded(items[end - count - 2 : end - count + 1])): + return False + + return True + + @staticmethod + def _merge_common_prefixes(info, branches): + # Branches with the same case-sensitive character prefix can be grouped + # together if they are separated only by other branches with a + # character prefix. + prefixed = defaultdict(list) + order = {} + new_branches = [] + for b in branches: + if Branch._is_simple_character(b): + # Branch starts with a simple character. + prefixed[b.value].append([b]) + order.setdefault(b.value, len(order)) + elif (isinstance(b, Sequence) and b.items and + Branch._is_simple_character(b.items[0])): + # Branch starts with a simple character. 
+ prefixed[b.items[0].value].append(b.items) + order.setdefault(b.items[0].value, len(order)) + else: + Branch._flush_char_prefix(info, prefixed, order, new_branches) + + new_branches.append(b) + + Branch._flush_char_prefix(info, prefixed, order, new_branches) + + return new_branches + + @staticmethod + def _is_simple_character(c): + return isinstance(c, Character) and c.positive and not c.case_flags + + @staticmethod + def _reduce_to_set(info, branches): + # Can the branches be reduced to a set? + new_branches = [] + items = set() + case_flags = NOCASE + for b in branches: + if isinstance(b, (Character, Property, SetBase)): + # Branch starts with a single character. + if b.case_flags != case_flags: + # Different case sensitivity, so flush. + Branch._flush_set_members(info, items, case_flags, + new_branches) + + case_flags = b.case_flags + + items.add(b.with_flags(case_flags=NOCASE)) + else: + Branch._flush_set_members(info, items, case_flags, + new_branches) + + new_branches.append(b) + + Branch._flush_set_members(info, items, case_flags, new_branches) + + return new_branches + + @staticmethod + def _flush_char_prefix(info, prefixed, order, new_branches): + # Flush the prefixed branches. + if not prefixed: + return + + for value, branches in sorted(prefixed.items(), key=lambda pair: + order[pair[0]]): + if len(branches) == 1: + new_branches.append(make_sequence(branches[0])) + else: + subbranches = [] + optional = False + for b in branches: + if len(b) > 1: + subbranches.append(make_sequence(b[1 : ])) + elif not optional: + subbranches.append(Sequence()) + optional = True + + sequence = Sequence([Character(value), Branch(subbranches)]) + new_branches.append(sequence.optimise(info)) + + prefixed.clear() + order.clear() + + @staticmethod + def _flush_set_members(info, items, case_flags, new_branches): + # Flush the set members. 
class CallGroup(RegexBase):
    """Node for a call to a group, given by number or by name.

    The reference is kept as supplied until fix_groups() resolves it to a
    group number.
    """

    def __init__(self, info, group, position):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.position = position
        # The pattern text is only handed to us in fix_groups(); remember it
        # so that remove_captures() can report its error against it.
        self._pattern = None

        self._key = self.__class__, self.group

    def fix_groups(self, pattern, reverse, fuzzy):
        """Resolve the group reference and register this call with info.

        Raises error if the name is not in info.group_index, if the number
        is outside 0..info.group_count, or if more than one group with that
        number is open.
        """
        self._pattern = pattern

        try:
            self.group = int(self.group)
        except ValueError:
            # Not a number, so it has to be a group name.
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                raise error("invalid group reference", pattern, self.position)

        if not 0 <= self.group <= self.info.group_count:
            raise error("unknown group", pattern, self.position)

        if self.group > 0 and self.info.open_group_count[self.group] > 1:
            raise error("ambiguous group reference", pattern, self.position)

        self.info.group_calls.append((self, reverse, fuzzy))

        # The key depends on the group, which may just have changed.
        self._key = self.__class__, self.group

    def remove_captures(self):
        """Always raises error: a group call cannot have captures removed."""
        # The original referenced a bare name `pattern` that is not in scope
        # here, which produced a NameError instead of the intended error.
        # NOTE(review): self._pattern is None if fix_groups() has not run;
        # this assumes `error` tolerates a None pattern - confirm.
        raise error("group reference not allowed", self._pattern,
          self.position)

    def _compile(self, reverse, fuzzy):
        # call_ref is not assigned anywhere in this class; presumably it is
        # set externally before compilation - verify against the caller.
        return [(OP.GROUP_CALL, self.call_ref)]

    def _dump(self, indent, reverse):
        print("%sGROUP_CALL %s" % (INDENT * indent, self.group))

    def __eq__(self, other):
        return type(self) is type(other) and self.group == other.group

    def max_width(self):
        return UNLIMITED
class Conditional(RegexBase):
    """Node choosing between two items according to a group reference.

    Compiles to OP.GROUP_EXISTS, the code for yes_item and, when no_item
    produces any code, OP.NEXT followed by that code.
    """

    def __init__(self, info, group, yes_item, no_item, position):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.yes_item = yes_item
        self.no_item = no_item
        self.position = position

    def fix_groups(self, pattern, reverse, fuzzy):
        """Resolve the group reference, then fix the groups in both items.

        Raises error if the name is unknown (other than 'DEFINE') or the
        number is outside 0..info.group_count.
        """
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                if self.group == 'DEFINE':
                    # 'DEFINE' is a special name unless there's a group with
                    # that name.
                    self.group = 0
                else:
                    raise error("unknown group", pattern, self.position)

        if not 0 <= self.group <= self.info.group_count:
            raise error("invalid group reference", pattern, self.position)

        self.yes_item.fix_groups(pattern, reverse, fuzzy)
        self.no_item.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info):
        """Return a new Conditional built from the optimised items."""
        yes_item = self.yes_item.optimise(info)
        no_item = self.no_item.optimise(info)

        return Conditional(info, self.group, yes_item, no_item, self.position)

    def pack_characters(self, info):
        self.yes_item = self.yes_item.pack_characters(info)
        self.no_item = self.no_item.pack_characters(info)
        return self

    def remove_captures(self):
        """Remove the captures from both items and return this node."""
        self.yes_item = self.yes_item.remove_captures()
        self.no_item = self.no_item.remove_captures()
        # Containers such as Sequence and Branch store whatever this method
        # returns, so falling off the end (returning None) would replace
        # this node with None.
        return self

    def is_atomic(self):
        return self.yes_item.is_atomic() and self.no_item.is_atomic()

    def can_be_affix(self):
        return self.yes_item.can_be_affix() and self.no_item.can_be_affix()

    def contains_group(self):
        return self.yes_item.contains_group() or self.no_item.contains_group()

    def get_firstset(self, reverse):
        return (self.yes_item.get_firstset(reverse) |
          self.no_item.get_firstset(reverse))

    def _compile(self, reverse, fuzzy):
        code = [(OP.GROUP_EXISTS, self.group)]
        code.extend(self.yes_item.compile(reverse, fuzzy))
        add_code = self.no_item.compile(reverse, fuzzy)
        if add_code:
            # Only emit the separator when there is an alternative.
            code.append((OP.NEXT, ))
            code.extend(add_code)

        code.append((OP.END, ))

        return code

    def _dump(self, indent, reverse):
        print("%sGROUP_EXISTS %s" % (INDENT * indent, self.group))
        self.yes_item.dump(indent + 1, reverse)
        if not self.no_item.is_empty():
            print("%sOR" % (INDENT * indent))
            self.no_item.dump(indent + 1, reverse)

    def is_empty(self):
        return self.yes_item.is_empty() and self.no_item.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.group, self.yes_item,
          self.no_item) == (other.group, other.yes_item, other.no_item)

    def max_width(self):
        return max(self.yes_item.max_width(), self.no_item.max_width())
Also, the cost of any + # error type not mentioned in the cost equation defaults to 0. + if "cost" in constraints: + for e in "dis": + constraints["cost"].setdefault(e, 0) + else: + constraints["cost"] = {"d": 1, "i": 1, "s": 1, "max": + constraints["e"][1]} + + def fix_groups(self, pattern, reverse, fuzzy): + self.subpattern.fix_groups(pattern, reverse, True) + + def pack_characters(self, info): + self.subpattern = self.subpattern.pack_characters(info) + return self + + def remove_captures(self): + self.subpattern = self.subpattern.remove_captures() + return self + + def is_atomic(self): + return self.subpattern.is_atomic() + + def contains_group(self): + return self.subpattern.contains_group() + + def _compile(self, reverse, fuzzy): + # The individual limits. + arguments = [] + for e in "dise": + v = self.constraints[e] + arguments.append(v[0]) + arguments.append(UNLIMITED if v[1] is None else v[1]) + + # The coeffs of the cost equation. + for e in "dis": + arguments.append(self.constraints["cost"][e]) + + # The maximum of the cost equation. 
class Grapheme(RegexBase):
    """Node whose code is an atomic lazy repeat ending at a grapheme boundary."""

    def _compile(self, reverse, fuzzy):
        # Lazily take one or more characters, stopping at the first grapheme
        # boundary. The same matcher is built for either direction; only the
        # compile arguments differ.
        one_or_more = LazyRepeat(AnyAll(), 1, None)
        body = Sequence([one_or_more, GraphemeBoundary()])
        return Atomic(body).compile(reverse, fuzzy)

    def _dump(self, indent, reverse):
        margin = INDENT * indent
        print("%sGRAPHEME" % margin)

    def max_width(self):
        return UNLIMITED
class Group(RegexBase):
    """Node for a capture group wrapping a subpattern.

    A negative group number denotes a private group; its public number is
    looked up in info.private_groups.
    """

    def __init__(self, info, group, subpattern):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.subpattern = subpattern

        self.call_ref = None

    def fix_groups(self, pattern, reverse, fuzzy):
        """Record this group in info.defined_groups and fix the subpattern."""
        self.info.defined_groups[self.group] = (self, reverse, fuzzy)
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info):
        """Return a new Group around the optimised subpattern."""
        subpattern = self.subpattern.optimise(info)

        return Group(self.info, self.group, subpattern)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        # Removing the capture means dropping the group itself, so the
        # (capture-free) subpattern takes its place.
        return self.subpattern.remove_captures()

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def can_be_affix(self):
        return False

    def contains_group(self):
        return True

    def get_firstset(self, reverse):
        return self.subpattern.get_firstset(reverse)

    def has_simple_start(self):
        return self.subpattern.has_simple_start()

    def _compile(self, reverse, fuzzy):
        code = []

        # If info.call_refs has an entry for this group in this direction and
        # fuzziness, wrap the group code in CALL_REF ... END.
        key = self.group, reverse, fuzzy
        ref = self.info.call_refs.get(key)
        if ref is not None:
            code += [(OP.CALL_REF, ref)]

        public_group = private_group = self.group
        if private_group < 0:
            # Private group: map it to its public number and give it an
            # index above the ordinary groups.
            public_group = self.info.private_groups[private_group]
            private_group = self.info.group_count - private_group

        code += ([(OP.GROUP, private_group, public_group)] +
          self.subpattern.compile(reverse, fuzzy) + [(OP.END, )])

        if ref is not None:
            code += [(OP.END, )]

        return code

    def _dump(self, indent, reverse):
        group = self.group
        if group < 0:
            # The mapping lives on the info object (as used by _compile);
            # the original read an undefined global `private_groups` here
            # and failed with NameError when dumping a private group.
            group = self.info.private_groups[group]
        print("%sGROUP %s" % (INDENT * indent, group))
        self.subpattern.dump(indent + 1, reverse)

    def __eq__(self, other):
        return (type(self) is type(other) and (self.group, self.subpattern) ==
          (other.group, other.subpattern))

    def max_width(self):
        return self.subpattern.max_width()

    def get_required_string(self, reverse):
        return self.subpattern.get_required_string(reverse)
class LookAroundConditional(RegexBase):
    """Node choosing between two items according to a lookaround test.

    Compiles to OP.CONDITIONAL, the test subpattern, OP.NEXT, the code for
    yes_item and, when no_item produces any code, OP.NEXT and that code.
    """

    _dir_text = {False: "AHEAD", True: "BEHIND"}

    def __init__(self, behind, positive, subpattern, yes_item, no_item):
        RegexBase.__init__(self)
        self.behind = bool(behind)
        self.positive = bool(positive)
        self.subpattern = subpattern
        self.yes_item = yes_item
        self.no_item = no_item

    def fix_groups(self, pattern, reverse, fuzzy):
        # NOTE(review): the test subpattern is compiled with self.behind as
        # its direction (see _compile) but is fixed here with `reverse`;
        # confirm that this difference is intended.
        self.subpattern.fix_groups(pattern, reverse, fuzzy)
        self.yes_item.fix_groups(pattern, reverse, fuzzy)
        self.no_item.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info):
        """Return a new node built from the optimised parts."""
        subpattern = self.subpattern.optimise(info)
        yes_item = self.yes_item.optimise(info)
        no_item = self.no_item.optimise(info)

        return LookAroundConditional(self.behind, self.positive, subpattern,
          yes_item, no_item)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        self.yes_item = self.yes_item.pack_characters(info)
        self.no_item = self.no_item.pack_characters(info)
        return self

    def remove_captures(self):
        """Remove the captures from all three parts and return this node."""
        self.subpattern = self.subpattern.remove_captures()
        self.yes_item = self.yes_item.remove_captures()
        self.no_item = self.no_item.remove_captures()
        # Containers such as Sequence and Branch store whatever this method
        # returns, so returning None would replace this node with None.
        return self

    def is_atomic(self):
        return (self.subpattern.is_atomic() and self.yes_item.is_atomic() and
          self.no_item.is_atomic())

    def can_be_affix(self):
        return (self.subpattern.can_be_affix() and self.yes_item.can_be_affix()
          and self.no_item.can_be_affix())

    def contains_group(self):
        return (self.subpattern.contains_group() or
          self.yes_item.contains_group() or self.no_item.contains_group())

    def get_firstset(self, reverse):
        return (self.subpattern.get_firstset(reverse) |
          self.no_item.get_firstset(reverse))

    def _compile(self, reverse, fuzzy):
        code = [(OP.CONDITIONAL, int(self.positive), int(not self.behind))]
        # The test always runs in its own direction, whatever the direction
        # of the enclosing match.
        code.extend(self.subpattern.compile(self.behind, fuzzy))
        code.append((OP.NEXT, ))
        code.extend(self.yes_item.compile(reverse, fuzzy))
        add_code = self.no_item.compile(reverse, fuzzy)
        if add_code:
            code.append((OP.NEXT, ))
            code.extend(add_code)

        code.append((OP.END, ))

        return code

    def _dump(self, indent, reverse):
        print("%sCONDITIONAL %s %s" % (INDENT * indent,
          self._dir_text[self.behind], POS_TEXT[self.positive]))
        self.subpattern.dump(indent + 1, self.behind)
        print("%sEITHER" % (INDENT * indent))
        self.yes_item.dump(indent + 1, reverse)
        if not self.no_item.is_empty():
            # This is a %-style template; str.format() would leave the "%s"
            # in the output and drop the indentation.
            print("%sOR" % (INDENT * indent))
            self.no_item.dump(indent + 1, reverse)

    def is_empty(self):
        # All three parts have to be empty. The original expression was
        # `a and b or c`, which reported every conditional lacking an
        # else-branch as empty, whatever its test and yes_item contained.
        return (self.subpattern.is_empty() and self.yes_item.is_empty() and
          self.no_item.is_empty())

    def __eq__(self, other):
        # The direction and the sense of the test are part of the node's
        # identity, as they are in LookAround.__eq__.
        return type(self) is type(other) and (self.behind, self.positive,
          self.subpattern, self.yes_item, self.no_item) == (other.behind,
          other.positive, other.subpattern, other.yes_item, other.no_item)

    def max_width(self):
        return max(self.yes_item.max_width(), self.no_item.max_width())

    def get_required_string(self, reverse):
        return self.max_width(), None
OP.PROPERTY_IGN_REV} + + def __init__(self, value, positive=True, case_flags=NOCASE, + zerowidth=False): + RegexBase.__init__(self) + self.value = value + self.positive = bool(positive) + self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags] + self.zerowidth = bool(zerowidth) + + self._key = (self.__class__, self.value, self.positive, + self.case_flags, self.zerowidth) + + def rebuild(self, positive, case_flags, zerowidth): + return Property(self.value, positive, case_flags, zerowidth) + + def optimise(self, info, in_set=False): + return self + + def get_firstset(self, reverse): + return set([self]) + + def has_simple_start(self): + return True + + def _compile(self, reverse, fuzzy): + flags = 0 + if self.positive: + flags |= POSITIVE_OP + if self.zerowidth: + flags |= ZEROWIDTH_OP + if fuzzy: + flags |= FUZZY_OP + return [(self._opcode[self.case_flags, reverse], flags, self.value)] + + def _dump(self, indent, reverse): + prop = PROPERTY_NAMES[self.value >> 16] + name, value = prop[0], prop[1][self.value & 0xFFFF] + print "%sPROPERTY %s %s:%s%s" % (INDENT * indent, + POS_TEXT[self.positive], name, value, CASE_TEXT[self.case_flags]) + + def matches(self, ch): + return _regex.has_property_value(self.value, ch) == self.positive + + def max_width(self): + return 1 + +class Prune(ZeroWidthBase): + _op_name = "PRUNE" + + def _compile(self, reverse, fuzzy): + return [(OP.PRUNE, )] + +class Range(RegexBase): + _opcode = {(NOCASE, False): OP.RANGE, (IGNORECASE, False): OP.RANGE_IGN, + (FULLCASE, False): OP.RANGE, (FULLIGNORECASE, False): OP.RANGE_IGN, + (NOCASE, True): OP.RANGE_REV, (IGNORECASE, True): OP.RANGE_IGN_REV, + (FULLCASE, True): OP.RANGE_REV, (FULLIGNORECASE, True): OP.RANGE_IGN_REV} + _op_name = "RANGE" + + def __init__(self, lower, upper, positive=True, case_flags=NOCASE, + zerowidth=False): + RegexBase.__init__(self) + self.lower = lower + self.upper = upper + self.positive = bool(positive) + self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags] + 
self.zerowidth = bool(zerowidth) + + self._key = (self.__class__, self.lower, self.upper, self.positive, + self.case_flags, self.zerowidth) + + def rebuild(self, positive, case_flags, zerowidth): + return Range(self.lower, self.upper, positive, case_flags, zerowidth) + + def optimise(self, info, in_set=False): + # Is the range case-sensitive? + if not self.positive or not (self.case_flags & IGNORECASE) or in_set: + return self + + # Is full case-folding possible? + if (not (info.flags & UNICODE) or (self.case_flags & FULLIGNORECASE) != + FULLIGNORECASE): + return self + + # Get the characters which expand to multiple codepoints on folding. + expanding_chars = _regex.get_expand_on_folding() + + # Get the folded characters in the range. + items = [] + for ch in expanding_chars: + if self.lower <= ord(ch) <= self.upper: + folded = _regex.fold_case(FULL_CASE_FOLDING, ch) + items.append(String([ord(c) for c in folded], + case_flags=self.case_flags)) + + if not items: + # We can fall back to simple case-folding. + return self + + if len(items) < self.upper - self.lower + 1: + # Not all the characters are covered by the full case-folding. 
class RefGroup(RegexBase):
    """Node for a reference to a group, given by number or by name.

    The reference is kept as supplied until fix_groups() resolves it to a
    group number.
    """

    # Opcode lookup keyed by (case_flags, reverse).
    _opcode = {
        (NOCASE, False): OP.REF_GROUP,
        (IGNORECASE, False): OP.REF_GROUP_IGN,
        (FULLCASE, False): OP.REF_GROUP,
        (FULLIGNORECASE, False): OP.REF_GROUP_FLD,
        (NOCASE, True): OP.REF_GROUP_REV,
        (IGNORECASE, True): OP.REF_GROUP_IGN_REV,
        (FULLCASE, True): OP.REF_GROUP_REV,
        (FULLIGNORECASE, True): OP.REF_GROUP_FLD_REV,
    }

    def __init__(self, info, group, position, case_flags=NOCASE):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.position = position
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        # The pattern text is only handed to us in fix_groups(); remember it
        # so that remove_captures() can report its error against it.
        self._pattern = None

        self._key = self.__class__, self.group, self.case_flags

    def fix_groups(self, pattern, reverse, fuzzy):
        """Resolve the group reference to a group number.

        Raises error if the name is not in info.group_index or the number
        is outside 1..info.group_count.
        """
        self._pattern = pattern

        try:
            self.group = int(self.group)
        except ValueError:
            # Not a number, so it has to be a group name.
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                raise error("unknown group", pattern, self.position)

        if not 1 <= self.group <= self.info.group_count:
            raise error("invalid group reference", pattern, self.position)

        # The key depends on the group, which may just have changed.
        self._key = self.__class__, self.group, self.case_flags

    def remove_captures(self):
        """Always raises error: a group reference needs its capture."""
        # The original referenced a bare name `pattern` that is not in scope
        # here, which produced a NameError instead of the intended error.
        # NOTE(review): self._pattern is None if fix_groups() has not run;
        # this assumes `error` tolerates a None pattern - confirm.
        raise error("group reference not allowed", self._pattern,
          self.position)

    def _compile(self, reverse, fuzzy):
        flags = 0
        if fuzzy:
            flags |= FUZZY_OP
        return [(self._opcode[self.case_flags, reverse], flags, self.group)]

    def _dump(self, indent, reverse):
        print("%sREF_GROUP %s%s" % (INDENT * indent, self.group,
          CASE_TEXT[self.case_flags]))

    def max_width(self):
        return UNLIMITED
+ if s.case_flags or any(is_cased(info, c) for c in + characters): + Sequence._flush_characters(info, characters, + case_flags, items) + + case_flags = s.case_flags + + characters.extend(s.characters) + else: + Sequence._flush_characters(info, characters, case_flags, items) + + items.append(s.pack_characters(info)) + + Sequence._flush_characters(info, characters, case_flags, items) + + return make_sequence(items) + + def remove_captures(self): + self.items = [s.remove_captures() for s in self.items] + return self + + def is_atomic(self): + return all(s.is_atomic() for s in self.items) + + def can_be_affix(self): + return False + + def contains_group(self): + return any(s.contains_group() for s in self.items) + + def get_firstset(self, reverse): + fs = set() + items = self.items + if reverse: + items.reverse() + for s in items: + fs |= s.get_firstset(reverse) + if None not in fs: + return fs + fs.discard(None) + + return fs | set([None]) + + def has_simple_start(self): + return bool(self.items) and self.items[0].has_simple_start() + + def _compile(self, reverse, fuzzy): + seq = self.items + if reverse: + seq = seq[::-1] + + code = [] + for s in seq: + code.extend(s.compile(reverse, fuzzy)) + + return code + + def _dump(self, indent, reverse): + for s in self.items: + s.dump(indent, reverse) + + @staticmethod + def _flush_characters(info, characters, case_flags, items): + if not characters: + return + + # Disregard case_flags if all of the characters are case-less. 
+ if case_flags & IGNORECASE: + if not any(is_cased(info, c) for c in characters): + case_flags = NOCASE + + if len(characters) == 1: + items.append(Character(characters[0], case_flags=case_flags)) + else: + items.append(String(characters, case_flags=case_flags)) + + characters[:] = [] + + def is_empty(self): + return all(i.is_empty() for i in self.items) + + def __eq__(self, other): + return type(self) is type(other) and self.items == other.items + + def max_width(self): + return sum(s.max_width() for s in self.items) + + def get_required_string(self, reverse): + seq = self.items + if reverse: + seq = seq[::-1] + + offset = 0 + + for s in seq: + ofs, req = s.get_required_string(reverse) + offset += ofs + if req: + return offset, req + + return offset, None + +class SetBase(RegexBase): + def __init__(self, info, items, positive=True, case_flags=NOCASE, + zerowidth=False): + RegexBase.__init__(self) + self.info = info + self.items = tuple(items) + self.positive = bool(positive) + self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags] + self.zerowidth = bool(zerowidth) + + self.char_width = 1 + + self._key = (self.__class__, self.items, self.positive, + self.case_flags, self.zerowidth) + + def rebuild(self, positive, case_flags, zerowidth): + return type(self)(self.info, self.items, positive, case_flags, + zerowidth).optimise(self.info) + + def get_firstset(self, reverse): + return set([self]) + + def has_simple_start(self): + return True + + def _compile(self, reverse, fuzzy): + flags = 0 + if self.positive: + flags |= POSITIVE_OP + if self.zerowidth: + flags |= ZEROWIDTH_OP + if fuzzy: + flags |= FUZZY_OP + code = [(self._opcode[self.case_flags, reverse], flags)] + for m in self.items: + code.extend(m.compile()) + + code.append((OP.END, )) + + return code + + def _dump(self, indent, reverse): + print "%s%s %s%s" % (INDENT * indent, self._op_name, + POS_TEXT[self.positive], CASE_TEXT[self.case_flags]) + for i in self.items: + i.dump(indent + 1, reverse) + + def 
_handle_case_folding(self, info, in_set): + # Is the set case-sensitive? + if not self.positive or not (self.case_flags & IGNORECASE) or in_set: + return self + + # Is full case-folding possible? + if (not (self.info.flags & UNICODE) or (self.case_flags & + FULLIGNORECASE) != FULLIGNORECASE): + return self + + # Get the characters which expand to multiple codepoints on folding. + expanding_chars = _regex.get_expand_on_folding() + + # Get the folded characters in the set. + items = [] + seen = set() + for ch in expanding_chars: + if self.matches(ord(ch)): + folded = _regex.fold_case(FULL_CASE_FOLDING, ch) + if folded not in seen: + items.append(String([ord(c) for c in folded], + case_flags=self.case_flags)) + seen.add(folded) + + if not items: + # We can fall back to simple case-folding. + return self + + return Branch([self] + items) + + def max_width(self): + # Is the set case-sensitive? + if not self.positive or not (self.case_flags & IGNORECASE): + return 1 + + # Is full case-folding possible? + if (not (self.info.flags & UNICODE) or (self.case_flags & + FULLIGNORECASE) != FULLIGNORECASE): + return 1 + + # Get the characters which expand to multiple codepoints on folding. + expanding_chars = _regex.get_expand_on_folding() + + # Get the folded characters in the set. 
+ seen = set() + for ch in expanding_chars: + if self.matches(ord(ch)): + folded = _regex.fold_case(FULL_CASE_FOLDING, ch) + seen.add(folded) + + if not seen: + return 1 + + return max(len(folded) for folded in seen) + +class SetDiff(SetBase): + _opcode = {(NOCASE, False): OP.SET_DIFF, (IGNORECASE, False): + OP.SET_DIFF_IGN, (FULLCASE, False): OP.SET_DIFF, (FULLIGNORECASE, False): + OP.SET_DIFF_IGN, (NOCASE, True): OP.SET_DIFF_REV, (IGNORECASE, True): + OP.SET_DIFF_IGN_REV, (FULLCASE, True): OP.SET_DIFF_REV, (FULLIGNORECASE, + True): OP.SET_DIFF_IGN_REV} + _op_name = "SET_DIFF" + + def optimise(self, info, in_set=False): + items = self.items + if len(items) > 2: + items = [items[0], SetUnion(info, items[1 : ])] + + if len(items) == 1: + return items[0].with_flags(case_flags=self.case_flags, + zerowidth=self.zerowidth).optimise(info, in_set) + + self.items = tuple(m.optimise(info, in_set=True) for m in items) + + return self._handle_case_folding(info, in_set) + + def matches(self, ch): + m = self.items[0].matches(ch) and not self.items[1].matches(ch) + return m == self.positive + +class SetInter(SetBase): + _opcode = {(NOCASE, False): OP.SET_INTER, (IGNORECASE, False): + OP.SET_INTER_IGN, (FULLCASE, False): OP.SET_INTER, (FULLIGNORECASE, + False): OP.SET_INTER_IGN, (NOCASE, True): OP.SET_INTER_REV, (IGNORECASE, + True): OP.SET_INTER_IGN_REV, (FULLCASE, True): OP.SET_INTER_REV, + (FULLIGNORECASE, True): OP.SET_INTER_IGN_REV} + _op_name = "SET_INTER" + + def optimise(self, info, in_set=False): + items = [] + for m in self.items: + m = m.optimise(info, in_set=True) + if isinstance(m, SetInter) and m.positive: + # Intersection in intersection. 
+ items.extend(m.items) + else: + items.append(m) + + if len(items) == 1: + return items[0].with_flags(case_flags=self.case_flags, + zerowidth=self.zerowidth).optimise(info, in_set) + + self.items = tuple(items) + + return self._handle_case_folding(info, in_set) + + def matches(self, ch): + m = all(i.matches(ch) for i in self.items) + return m == self.positive + +class SetSymDiff(SetBase): + _opcode = {(NOCASE, False): OP.SET_SYM_DIFF, (IGNORECASE, False): + OP.SET_SYM_DIFF_IGN, (FULLCASE, False): OP.SET_SYM_DIFF, (FULLIGNORECASE, + False): OP.SET_SYM_DIFF_IGN, (NOCASE, True): OP.SET_SYM_DIFF_REV, + (IGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV, (FULLCASE, True): + OP.SET_SYM_DIFF_REV, (FULLIGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV} + _op_name = "SET_SYM_DIFF" + + def optimise(self, info, in_set=False): + items = [] + for m in self.items: + m = m.optimise(info, in_set=True) + if isinstance(m, SetSymDiff) and m.positive: + # Symmetric difference in symmetric difference. + items.extend(m.items) + else: + items.append(m) + + if len(items) == 1: + return items[0].with_flags(case_flags=self.case_flags, + zerowidth=self.zerowidth).optimise(info, in_set) + + self.items = tuple(items) + + return self._handle_case_folding(info, in_set) + + def matches(self, ch): + m = False + for i in self.items: + m = m != i.matches(ch) + + return m == self.positive + +class SetUnion(SetBase): + _opcode = {(NOCASE, False): OP.SET_UNION, (IGNORECASE, False): + OP.SET_UNION_IGN, (FULLCASE, False): OP.SET_UNION, (FULLIGNORECASE, + False): OP.SET_UNION_IGN, (NOCASE, True): OP.SET_UNION_REV, (IGNORECASE, + True): OP.SET_UNION_IGN_REV, (FULLCASE, True): OP.SET_UNION_REV, + (FULLIGNORECASE, True): OP.SET_UNION_IGN_REV} + _op_name = "SET_UNION" + + def optimise(self, info, in_set=False): + items = [] + for m in self.items: + m = m.optimise(info, in_set=True) + if isinstance(m, SetUnion) and m.positive: + # Union in union. 
+ items.extend(m.items) + else: + items.append(m) + + if len(items) == 1: + i = items[0] + return i.with_flags(positive=i.positive == self.positive, + case_flags=self.case_flags, + zerowidth=self.zerowidth).optimise(info, in_set) + + self.items = tuple(items) + + return self._handle_case_folding(info, in_set) + + def _compile(self, reverse, fuzzy): + flags = 0 + if self.positive: + flags |= POSITIVE_OP + if self.zerowidth: + flags |= ZEROWIDTH_OP + if fuzzy: + flags |= FUZZY_OP + + characters, others = defaultdict(list), [] + for m in self.items: + if isinstance(m, Character): + characters[m.positive].append(m.value) + else: + others.append(m) + + code = [(self._opcode[self.case_flags, reverse], flags)] + + for positive, values in characters.items(): + flags = 0 + if positive: + flags |= POSITIVE_OP + if len(values) == 1: + code.append((OP.CHARACTER, flags, values[0])) + else: + code.append((OP.STRING, flags, len(values)) + tuple(values)) + + for m in others: + code.extend(m.compile()) + + code.append((OP.END, )) + + return code + + def matches(self, ch): + m = any(i.matches(ch) for i in self.items) + return m == self.positive + +class Skip(ZeroWidthBase): + _op_name = "SKIP" + _opcode = OP.SKIP + +class StartOfLine(ZeroWidthBase): + _opcode = OP.START_OF_LINE + _op_name = "START_OF_LINE" + +class StartOfLineU(StartOfLine): + _opcode = OP.START_OF_LINE_U + _op_name = "START_OF_LINE_U" + +class StartOfString(ZeroWidthBase): + _opcode = OP.START_OF_STRING + _op_name = "START_OF_STRING" + +class StartOfWord(ZeroWidthBase): + _opcode = OP.START_OF_WORD + _op_name = "START_OF_WORD" + +class String(RegexBase): + _opcode = {(NOCASE, False): OP.STRING, (IGNORECASE, False): OP.STRING_IGN, + (FULLCASE, False): OP.STRING, (FULLIGNORECASE, False): OP.STRING_FLD, + (NOCASE, True): OP.STRING_REV, (IGNORECASE, True): OP.STRING_IGN_REV, + (FULLCASE, True): OP.STRING_REV, (FULLIGNORECASE, True): + OP.STRING_FLD_REV} + + def __init__(self, characters, case_flags=NOCASE): + 
self.characters = tuple(characters) + self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags] + + if (self.case_flags & FULLIGNORECASE) == FULLIGNORECASE: + folded_characters = [] + for char in self.characters: + folded = _regex.fold_case(FULL_CASE_FOLDING, unichr(char)) + folded_characters.extend(ord(c) for c in folded) + else: + folded_characters = self.characters + + self.folded_characters = tuple(folded_characters) + self.required = False + + self._key = self.__class__, self.characters, self.case_flags + + def get_firstset(self, reverse): + if reverse: + pos = -1 + else: + pos = 0 + return set([Character(self.characters[pos], + case_flags=self.case_flags)]) + + def has_simple_start(self): + return True + + def _compile(self, reverse, fuzzy): + flags = 0 + if fuzzy: + flags |= FUZZY_OP + if self.required: + flags |= REQUIRED_OP + return [(self._opcode[self.case_flags, reverse], flags, + len(self.folded_characters)) + self.folded_characters] + + def _dump(self, indent, reverse): + display = repr("".join(unichr(c) for c in self.characters)).lstrip("bu") + print "%sSTRING %s%s" % (INDENT * indent, display, + CASE_TEXT[self.case_flags]) + + def max_width(self): + return len(self.folded_characters) + + def get_required_string(self, reverse): + return 0, self + +class Literal(String): + def _dump(self, indent, reverse): + for c in self.characters: + display = repr(unichr(c)).lstrip("bu") + print "%sCHARACTER MATCH %s%s" % (INDENT * indent, display, + CASE_TEXT[self.case_flags]) + +class StringSet(RegexBase): + _opcode = {(NOCASE, False): OP.STRING_SET, (IGNORECASE, False): + OP.STRING_SET_IGN, (FULLCASE, False): OP.STRING_SET, (FULLIGNORECASE, + False): OP.STRING_SET_FLD, (NOCASE, True): OP.STRING_SET_REV, + (IGNORECASE, True): OP.STRING_SET_IGN_REV, (FULLCASE, True): + OP.STRING_SET_REV, (FULLIGNORECASE, True): OP.STRING_SET_FLD_REV} + + def __init__(self, info, name, case_flags=NOCASE): + self.info = info + self.name = name + self.case_flags = 
CASE_FLAGS_COMBINATIONS[case_flags] + + self._key = self.__class__, self.name, self.case_flags + + self.set_key = (name, self.case_flags) + if self.set_key not in info.named_lists_used: + info.named_lists_used[self.set_key] = len(info.named_lists_used) + + def _compile(self, reverse, fuzzy): + index = self.info.named_lists_used[self.set_key] + items = self.info.kwargs[self.name] + + case_flags = self.case_flags + + if not items: + return [] + + encoding = self.info.flags & _ALL_ENCODINGS + fold_flags = encoding | case_flags + + if fuzzy: + choices = [self._folded(fold_flags, i) for i in items] + + # Sort from longest to shortest. + choices.sort(key=lambda s: (-len(s), s)) + + branches = [] + for string in choices: + branches.append(Sequence([Character(c, case_flags=case_flags) + for c in string])) + + if len(branches) > 1: + branch = Branch(branches) + else: + branch = branches[0] + branch = branch.optimise(self.info).pack_characters(self.info) + + return branch.compile(reverse, fuzzy) + else: + min_len = min(len(i) for i in items) + max_len = max(len(self._folded(fold_flags, i)) for i in items) + return [(self._opcode[case_flags, reverse], index, min_len, + max_len)] + + def _dump(self, indent, reverse): + print "%sSTRING_SET %s%s" % (INDENT * indent, self.name, + CASE_TEXT[self.case_flags]) + + def _folded(self, fold_flags, item): + if isinstance(item, unicode): + return [ord(c) for c in _regex.fold_case(fold_flags, item)] + else: + return [ord(c) for c in item] + + def _flatten(self, s): + # Flattens the branches. 
+ if isinstance(s, Branch): + for b in s.branches: + self._flatten(b) + elif isinstance(s, Sequence) and s.items: + seq = s.items + + while isinstance(seq[-1], Sequence): + seq[-1 : ] = seq[-1].items + + n = 0 + while n < len(seq) and isinstance(seq[n], Character): + n += 1 + + if n > 1: + seq[ : n] = [String([c.value for c in seq[ : n]], + case_flags=self.case_flags)] + + self._flatten(seq[-1]) + + def max_width(self): + if not self.info.kwargs[self.name]: + return 0 + + if self.case_flags & IGNORECASE: + fold_flags = (self.info.flags & _ALL_ENCODINGS) | self.case_flags + return max(len(_regex.fold_case(fold_flags, i)) for i in + self.info.kwargs[self.name]) + else: + return max(len(i) for i in self.info.kwargs[self.name]) + +class Source(object): + "Scanner for the regular expression source string." + def __init__(self, string): + if isinstance(string, unicode): + self.string = string + self.char_type = unichr + else: + self.string = string + self.char_type = chr + + self.pos = 0 + self.ignore_space = False + self.sep = string[ : 0] + + def get(self): + string = self.string + pos = self.pos + + try: + if self.ignore_space: + while True: + if string[pos].isspace(): + # Skip over the whitespace. + pos += 1 + elif string[pos] == "#": + # Skip over the comment to the end of the line. + pos = string.index("\n", pos) + else: + break + + ch = string[pos] + self.pos = pos + 1 + return ch + except IndexError: + # We've reached the end of the string. + self.pos = pos + return string[ : 0] + except ValueError: + # The comment extended to the end of the string. + self.pos = len(string) + return string[ : 0] + + def get_many(self, count=1): + string = self.string + pos = self.pos + + try: + if self.ignore_space: + substring = [] + + while len(substring) < count: + while True: + if string[pos].isspace(): + # Skip over the whitespace. + pos += 1 + elif string[pos] == "#": + # Skip over the comment to the end of the line. 
+ pos = string.index("\n", pos) + else: + break + + substring.append(string[pos]) + pos += 1 + + substring = "".join(substring) + else: + substring = string[pos : pos + count] + pos += len(substring) + + self.pos = pos + return substring + except IndexError: + # We've reached the end of the string. + self.pos = len(string) + return "".join(substring) + except ValueError: + # The comment extended to the end of the string. + self.pos = len(string) + return "".join(substring) + + def get_while(self, test_set, include=True): + string = self.string + pos = self.pos + + if self.ignore_space: + try: + substring = [] + + while True: + if string[pos].isspace(): + # Skip over the whitespace. + pos += 1 + elif string[pos] == "#": + # Skip over the comment to the end of the line. + pos = string.index("\n", pos) + elif (string[pos] in test_set) == include: + substring.append(string[pos]) + pos += 1 + else: + break + + self.pos = pos + except IndexError: + # We've reached the end of the string. + self.pos = len(string) + except ValueError: + # The comment extended to the end of the string. + self.pos = len(string) + + return "".join(substring) + else: + try: + while (string[pos] in test_set) == include: + pos += 1 + + substring = string[self.pos : pos] + + self.pos = pos + + return substring + except IndexError: + # We've reached the end of the string. + substring = string[self.pos : pos] + + self.pos = pos + + return substring + + def skip_while(self, test_set, include=True): + string = self.string + pos = self.pos + + try: + if self.ignore_space: + while True: + if string[pos].isspace(): + # Skip over the whitespace. + pos += 1 + elif string[pos] == "#": + # Skip over the comment to the end of the line. + pos = string.index("\n", pos) + elif (string[pos] in test_set) == include: + pos += 1 + else: + break + else: + while (string[pos] in test_set) == include: + pos += 1 + + self.pos = pos + except IndexError: + # We've reached the end of the string. 
+ self.pos = len(string) + except ValueError: + # The comment extended to the end of the string. + self.pos = len(string) + + def match(self, substring): + string = self.string + pos = self.pos + + if self.ignore_space: + try: + for c in substring: + while True: + if string[pos].isspace(): + # Skip over the whitespace. + pos += 1 + elif string[pos] == "#": + # Skip over the comment to the end of the line. + pos = string.index("\n", pos) + else: + break + + if string[pos] != c: + return False + + pos += 1 + + self.pos = pos + + return True + except IndexError: + # We've reached the end of the string. + return False + except ValueError: + # The comment extended to the end of the string. + return False + else: + if not string.startswith(substring, pos): + return False + + self.pos = pos + len(substring) + + return True + + def expect(self, substring): + if not self.match(substring): + raise error("missing %s" % substring, self.string, self.pos) + + def at_end(self): + string = self.string + pos = self.pos + + try: + if self.ignore_space: + while True: + if string[pos].isspace(): + pos += 1 + elif string[pos] == "#": + pos = string.index("\n", pos) + else: + break + + return pos >= len(string) + except IndexError: + # We've reached the end of the string. + return True + except ValueError: + # The comment extended to the end of the string. + return True + +class Info(object): + "Info about the regular expression." 
+ + def __init__(self, flags=0, char_type=None, kwargs={}): + flags |= DEFAULT_FLAGS[(flags & _ALL_VERSIONS) or DEFAULT_VERSION] + self.flags = flags + self.global_flags = flags + self.inline_locale = False + + self.kwargs = kwargs + + self.group_count = 0 + self.group_index = {} + self.group_name = {} + self.char_type = char_type + self.named_lists_used = {} + self.open_groups = [] + self.open_group_count = {} + self.defined_groups = {} + self.group_calls = [] + self.private_groups = {} + + def open_group(self, name=None): + group = self.group_index.get(name) + if group is None: + while True: + self.group_count += 1 + if name is None or self.group_count not in self.group_name: + break + + group = self.group_count + if name: + self.group_index[name] = group + self.group_name[group] = name + + if group in self.open_groups: + # We have a nested named group. We'll assign it a private group + # number, initially negative until we can assign a proper + # (positive) number. + group_alias = -(len(self.private_groups) + 1) + self.private_groups[group_alias] = group + group = group_alias + + self.open_groups.append(group) + self.open_group_count[group] = self.open_group_count.get(group, 0) + 1 + + return group + + def close_group(self): + self.open_groups.pop() + + def is_open_group(self, name): + # In version 1, a group reference can refer to an open group. We'll + # just pretend the group isn't open. + version = (self.flags & _ALL_VERSIONS) or DEFAULT_VERSION + if version == VERSION1: + return False + + if name.isdigit(): + group = int(name) + else: + group = self.group_index.get(name) + + return group in self.open_groups + +def _check_group_features(info, parsed): + """Checks whether the reverse and fuzzy features of the group calls match + the groups which they call. + """ + call_refs = {} + additional_groups = [] + for call, reverse, fuzzy in info.group_calls: + # Look up the reference of this group call. 
+ key = (call.group, reverse, fuzzy) + ref = call_refs.get(key) + if ref is None: + # This group doesn't have a reference yet, so look up its features. + if call.group == 0: + # Calling the pattern as a whole. + rev = bool(info.flags & REVERSE) + fuz = isinstance(parsed, Fuzzy) + if (rev, fuz) != (reverse, fuzzy): + # The pattern as a whole doesn't have the features we want, + # so we'll need to make a copy of it with the desired + # features. + additional_groups.append((parsed, reverse, fuzzy)) + else: + # Calling a capture group. + def_info = info.defined_groups[call.group] + group = def_info[0] + if def_info[1 : ] != (reverse, fuzzy): + # The group doesn't have the features we want, so we'll + # need to make a copy of it with the desired features. + additional_groups.append((group, reverse, fuzzy)) + + ref = len(call_refs) + call_refs[key] = ref + + call.call_ref = ref + + info.call_refs = call_refs + info.additional_groups = additional_groups + +def _get_required_string(parsed, flags): + "Gets the required string and related info of a parsed pattern." + + req_offset, required = parsed.get_required_string(bool(flags & REVERSE)) + if required: + required.required = True + if req_offset >= UNLIMITED: + req_offset = -1 + + req_flags = required.case_flags + if not (flags & UNICODE): + req_flags &= ~UNICODE + + req_chars = required.folded_characters + else: + req_offset = 0 + req_chars = () + req_flags = 0 + + return req_offset, req_chars, req_flags + +class Scanner: + def __init__(self, lexicon, flags=0): + self.lexicon = lexicon + + # Combine phrases into a compound pattern. + patterns = [] + for phrase, action in lexicon: + # Parse the regular expression. + source = Source(phrase) + info = Info(flags, source.char_type) + source.ignore_space = bool(info.flags & VERBOSE) + parsed = _parse_pattern(source, info) + if not source.at_end(): + raise error("unbalanced parenthesis", source.string, source.pos) + + # We want to forbid capture groups within each phrase. 
+ patterns.append(parsed.remove_captures()) + + # Combine all the subpatterns into one pattern. + info = Info(flags) + patterns = [Group(info, g + 1, p) for g, p in enumerate(patterns)] + parsed = Branch(patterns) + + # Optimise the compound pattern. + parsed = parsed.optimise(info) + parsed = parsed.pack_characters(info) + + # Get the required string. + req_offset, req_chars, req_flags = _get_required_string(parsed, + info.flags) + + # Check the features of the groups. + _check_group_features(info, parsed) + + # Complain if there are any group calls. They are not supported by the + # Scanner class. + if info.call_refs: + raise error("recursive regex not supported by Scanner", + source.string, source.pos) + + reverse = bool(info.flags & REVERSE) + + # Compile the compound pattern. The result is a list of tuples. + code = parsed.compile(reverse) + [(OP.SUCCESS, )] + + # Flatten the code into a list of ints. + code = _flatten_code(code) + + if not parsed.has_simple_start(): + # Get the first set, if possible. + try: + fs_code = _compile_firstset(info, parsed.get_firstset(reverse)) + fs_code = _flatten_code(fs_code) + code = fs_code + code + except _FirstSetError: + pass + + # Check the global flags for conflicts. + version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION + if version not in (0, VERSION0, VERSION1): + raise ValueError("VERSION0 and VERSION1 flags are mutually incompatible") + + # Create the PatternObject. + # + # Local flags like IGNORECASE affect the code generation, but aren't + # needed by the PatternObject itself. Conversely, global flags like + # LOCALE _don't_ affect the code generation but _are_ needed by the + # PatternObject. 
+ self.scanner = _regex.compile(None, (flags & GLOBAL_FLAGS) | version, + code, {}, {}, {}, [], req_offset, req_chars, req_flags, + len(patterns)) + + def scan(self, string): + result = [] + append = result.append + match = self.scanner.scanner(string).match + i = 0 + while True: + m = match() + if not m: + break + j = m.end() + if i == j: + break + action = self.lexicon[m.lastindex - 1][1] + if hasattr(action, '__call__'): + self.match = m + action = action(self, m.group()) + if action is not None: + append(action) + i = j + + return result, string[i : ] + +# Get the known properties dict. +PROPERTIES = _regex.get_properties() + +# Build the inverse of the properties dict. +PROPERTY_NAMES = {} +for prop_name, (prop_id, values) in PROPERTIES.items(): + name, prop_values = PROPERTY_NAMES.get(prop_id, ("", {})) + name = max(name, prop_name, key=len) + PROPERTY_NAMES[prop_id] = name, prop_values + + for val_name, val_id in values.items(): + prop_values[val_id] = max(prop_values.get(val_id, ""), val_name, + key=len) + +# Character escape sequences. +CHARACTER_ESCAPES = { + "a": "\a", + "b": "\b", + "f": "\f", + "n": "\n", + "r": "\r", + "t": "\t", + "v": "\v", +} + +# Predefined character set escape sequences. +CHARSET_ESCAPES = { + "d": lookup_property(None, "Digit", True), + "D": lookup_property(None, "Digit", False), + "s": lookup_property(None, "Space", True), + "S": lookup_property(None, "Space", False), + "w": lookup_property(None, "Word", True), + "W": lookup_property(None, "Word", False), +} + +# Positional escape sequences. +POSITION_ESCAPES = { + "A": StartOfString(), + "b": Boundary(), + "B": Boundary(False), + "K": Keep(), + "m": StartOfWord(), + "M": EndOfWord(), + "Z": EndOfString(), +} + +# Positional escape sequences when WORD flag set. 
+WORD_POSITION_ESCAPES = dict(POSITION_ESCAPES) +WORD_POSITION_ESCAPES.update({ + "b": DefaultBoundary(), + "B": DefaultBoundary(False), + "m": DefaultStartOfWord(), + "M": DefaultEndOfWord(), +}) + +# Regex control verbs. +VERBS = { + "FAIL": Failure(), + "F": Failure(), + "PRUNE": Prune(), + "SKIP": Skip(), +} diff --git a/lib/regex/_regex_unicode.c b/lib/regex/_regex_unicode.c new file mode 100644 index 0000000000000000000000000000000000000000..47c896e680e30764224fedce3ff022d5ad2fb82b --- /dev/null +++ b/lib/regex/_regex_unicode.c @@ -0,0 +1,14258 @@ +/* For Unicode version 8.0.0 */ + +#include "_regex_unicode.h" + +#define RE_BLANK_MASK ((1 << RE_PROP_ZL) | (1 << RE_PROP_ZP)) +#define RE_GRAPH_MASK ((1 << RE_PROP_CC) | (1 << RE_PROP_CS) | (1 << RE_PROP_CN)) +#define RE_WORD_MASK (RE_PROP_M_MASK | (1 << RE_PROP_ND) | (1 << RE_PROP_PC)) + +typedef struct RE_AllCases { + RE_INT32 diffs[RE_MAX_CASES - 1]; +} RE_AllCases; + +typedef struct RE_FullCaseFolding { + RE_INT32 diff; + RE_UINT16 codepoints[RE_MAX_FOLDED - 1]; +} RE_FullCaseFolding; + +/* strings. 
*/ + +char* re_strings[] = { + "-1/2", + "0", + "1", + "1/10", + "1/12", + "1/16", + "1/2", + "1/3", + "1/4", + "1/5", + "1/6", + "1/7", + "1/8", + "1/9", + "10", + "100", + "1000", + "10000", + "100000", + "1000000", + "100000000", + "10000000000", + "1000000000000", + "103", + "107", + "11", + "11/12", + "11/2", + "118", + "12", + "122", + "129", + "13", + "13/2", + "130", + "132", + "133", + "14", + "15", + "15/2", + "16", + "17", + "17/2", + "18", + "19", + "2", + "2/3", + "2/5", + "20", + "200", + "2000", + "20000", + "200000", + "202", + "21", + "214", + "216", + "216000", + "218", + "22", + "220", + "222", + "224", + "226", + "228", + "23", + "230", + "232", + "233", + "234", + "24", + "240", + "25", + "26", + "27", + "28", + "29", + "3", + "3/16", + "3/2", + "3/4", + "3/5", + "3/8", + "30", + "300", + "3000", + "30000", + "300000", + "31", + "32", + "33", + "34", + "35", + "36", + "37", + "38", + "39", + "4", + "4/5", + "40", + "400", + "4000", + "40000", + "400000", + "41", + "42", + "43", + "432000", + "44", + "45", + "46", + "47", + "48", + "49", + "5", + "5/12", + "5/2", + "5/6", + "5/8", + "50", + "500", + "5000", + "50000", + "500000", + "6", + "60", + "600", + "6000", + "60000", + "600000", + "7", + "7/12", + "7/2", + "7/8", + "70", + "700", + "7000", + "70000", + "700000", + "8", + "80", + "800", + "8000", + "80000", + "800000", + "84", + "9", + "9/2", + "90", + "900", + "9000", + "90000", + "900000", + "91", + "A", + "ABOVE", + "ABOVELEFT", + "ABOVERIGHT", + "AEGEANNUMBERS", + "AGHB", + "AHEX", + "AHOM", + "AI", + "AIN", + "AL", + "ALAPH", + "ALCHEMICAL", + "ALCHEMICALSYMBOLS", + "ALEF", + "ALETTER", + "ALNUM", + "ALPHA", + "ALPHABETIC", + "ALPHABETICPF", + "ALPHABETICPRESENTATIONFORMS", + "ALPHANUMERIC", + "AMBIGUOUS", + "AN", + "ANATOLIANHIEROGLYPHS", + "ANCIENTGREEKMUSIC", + "ANCIENTGREEKMUSICALNOTATION", + "ANCIENTGREEKNUMBERS", + "ANCIENTSYMBOLS", + "ANY", + "AR", + "ARAB", + "ARABIC", + "ARABICEXTA", + "ARABICEXTENDEDA", + "ARABICLETTER", + 
"ARABICMATH", + "ARABICMATHEMATICALALPHABETICSYMBOLS", + "ARABICNUMBER", + "ARABICPFA", + "ARABICPFB", + "ARABICPRESENTATIONFORMSA", + "ARABICPRESENTATIONFORMSB", + "ARABICSUP", + "ARABICSUPPLEMENT", + "ARMENIAN", + "ARMI", + "ARMN", + "ARROWS", + "ASCII", + "ASCIIHEXDIGIT", + "ASSIGNED", + "AT", + "ATA", + "ATAR", + "ATB", + "ATBL", + "ATERM", + "ATTACHEDABOVE", + "ATTACHEDABOVERIGHT", + "ATTACHEDBELOW", + "ATTACHEDBELOWLEFT", + "AVAGRAHA", + "AVESTAN", + "AVST", + "B", + "B2", + "BA", + "BALI", + "BALINESE", + "BAMU", + "BAMUM", + "BAMUMSUP", + "BAMUMSUPPLEMENT", + "BASICLATIN", + "BASS", + "BASSAVAH", + "BATAK", + "BATK", + "BB", + "BC", + "BEH", + "BELOW", + "BELOWLEFT", + "BELOWRIGHT", + "BENG", + "BENGALI", + "BETH", + "BIDIC", + "BIDICLASS", + "BIDICONTROL", + "BIDIM", + "BIDIMIRRORED", + "BINDU", + "BK", + "BL", + "BLANK", + "BLK", + "BLOCK", + "BLOCKELEMENTS", + "BN", + "BOPO", + "BOPOMOFO", + "BOPOMOFOEXT", + "BOPOMOFOEXTENDED", + "BOTTOM", + "BOTTOMANDRIGHT", + "BOUNDARYNEUTRAL", + "BOXDRAWING", + "BR", + "BRAH", + "BRAHMI", + "BRAHMIJOININGNUMBER", + "BRAI", + "BRAILLE", + "BRAILLEPATTERNS", + "BREAKAFTER", + "BREAKBEFORE", + "BREAKBOTH", + "BREAKSYMBOLS", + "BUGI", + "BUGINESE", + "BUHD", + "BUHID", + "BURUSHASKIYEHBARREE", + "BYZANTINEMUSIC", + "BYZANTINEMUSICALSYMBOLS", + "C", + "C&", + "CAKM", + "CAN", + "CANADIANABORIGINAL", + "CANADIANSYLLABICS", + "CANONICAL", + "CANONICALCOMBININGCLASS", + "CANS", + "CANTILLATIONMARK", + "CARI", + "CARIAN", + "CARRIAGERETURN", + "CASED", + "CASEDLETTER", + "CASEIGNORABLE", + "CAUCASIANALBANIAN", + "CB", + "CC", + "CCC", + "CCC10", + "CCC103", + "CCC107", + "CCC11", + "CCC118", + "CCC12", + "CCC122", + "CCC129", + "CCC13", + "CCC130", + "CCC132", + "CCC133", + "CCC14", + "CCC15", + "CCC16", + "CCC17", + "CCC18", + "CCC19", + "CCC20", + "CCC21", + "CCC22", + "CCC23", + "CCC24", + "CCC25", + "CCC26", + "CCC27", + "CCC28", + "CCC29", + "CCC30", + "CCC31", + "CCC32", + "CCC33", + "CCC34", + "CCC35", + "CCC36", + 
"CCC84", + "CCC91", + "CF", + "CHAKMA", + "CHAM", + "CHANGESWHENCASEFOLDED", + "CHANGESWHENCASEMAPPED", + "CHANGESWHENLOWERCASED", + "CHANGESWHENTITLECASED", + "CHANGESWHENUPPERCASED", + "CHER", + "CHEROKEE", + "CHEROKEESUP", + "CHEROKEESUPPLEMENT", + "CI", + "CIRCLE", + "CJ", + "CJK", + "CJKCOMPAT", + "CJKCOMPATFORMS", + "CJKCOMPATIBILITY", + "CJKCOMPATIBILITYFORMS", + "CJKCOMPATIBILITYIDEOGRAPHS", + "CJKCOMPATIBILITYIDEOGRAPHSSUPPLEMENT", + "CJKCOMPATIDEOGRAPHS", + "CJKCOMPATIDEOGRAPHSSUP", + "CJKEXTA", + "CJKEXTB", + "CJKEXTC", + "CJKEXTD", + "CJKEXTE", + "CJKRADICALSSUP", + "CJKRADICALSSUPPLEMENT", + "CJKSTROKES", + "CJKSYMBOLS", + "CJKSYMBOLSANDPUNCTUATION", + "CJKUNIFIEDIDEOGRAPHS", + "CJKUNIFIEDIDEOGRAPHSEXTENSIONA", + "CJKUNIFIEDIDEOGRAPHSEXTENSIONB", + "CJKUNIFIEDIDEOGRAPHSEXTENSIONC", + "CJKUNIFIEDIDEOGRAPHSEXTENSIOND", + "CJKUNIFIEDIDEOGRAPHSEXTENSIONE", + "CL", + "CLOSE", + "CLOSEPARENTHESIS", + "CLOSEPUNCTUATION", + "CM", + "CN", + "CNTRL", + "CO", + "COM", + "COMBININGDIACRITICALMARKS", + "COMBININGDIACRITICALMARKSEXTENDED", + "COMBININGDIACRITICALMARKSFORSYMBOLS", + "COMBININGDIACRITICALMARKSSUPPLEMENT", + "COMBININGHALFMARKS", + "COMBININGMARK", + "COMBININGMARKSFORSYMBOLS", + "COMMON", + "COMMONINDICNUMBERFORMS", + "COMMONSEPARATOR", + "COMPAT", + "COMPATJAMO", + "COMPLEXCONTEXT", + "CONDITIONALJAPANESESTARTER", + "CONNECTORPUNCTUATION", + "CONSONANT", + "CONSONANTDEAD", + "CONSONANTFINAL", + "CONSONANTHEADLETTER", + "CONSONANTKILLER", + "CONSONANTMEDIAL", + "CONSONANTPLACEHOLDER", + "CONSONANTPRECEDINGREPHA", + "CONSONANTPREFIXED", + "CONSONANTSUBJOINED", + "CONSONANTSUCCEEDINGREPHA", + "CONSONANTWITHSTACKER", + "CONTINGENTBREAK", + "CONTROL", + "CONTROLPICTURES", + "COPT", + "COPTIC", + "COPTICEPACTNUMBERS", + "COUNTINGROD", + "COUNTINGRODNUMERALS", + "CP", + "CPRT", + "CR", + "CS", + "CUNEIFORM", + "CUNEIFORMNUMBERS", + "CUNEIFORMNUMBERSANDPUNCTUATION", + "CURRENCYSYMBOL", + "CURRENCYSYMBOLS", + "CWCF", + "CWCM", + "CWL", + "CWT", + "CWU", + 
"CYPRIOT", + "CYPRIOTSYLLABARY", + "CYRILLIC", + "CYRILLICEXTA", + "CYRILLICEXTB", + "CYRILLICEXTENDEDA", + "CYRILLICEXTENDEDB", + "CYRILLICSUP", + "CYRILLICSUPPLEMENT", + "CYRILLICSUPPLEMENTARY", + "CYRL", + "D", + "DA", + "DAL", + "DALATHRISH", + "DASH", + "DASHPUNCTUATION", + "DB", + "DE", + "DECIMAL", + "DECIMALNUMBER", + "DECOMPOSITIONTYPE", + "DEFAULTIGNORABLECODEPOINT", + "DEP", + "DEPRECATED", + "DESERET", + "DEVA", + "DEVANAGARI", + "DEVANAGARIEXT", + "DEVANAGARIEXTENDED", + "DI", + "DIA", + "DIACRITIC", + "DIACRITICALS", + "DIACRITICALSEXT", + "DIACRITICALSFORSYMBOLS", + "DIACRITICALSSUP", + "DIGIT", + "DINGBATS", + "DOMINO", + "DOMINOTILES", + "DOUBLEABOVE", + "DOUBLEBELOW", + "DOUBLEQUOTE", + "DQ", + "DSRT", + "DT", + "DUALJOINING", + "DUPL", + "DUPLOYAN", + "E", + "EA", + "EARLYDYNASTICCUNEIFORM", + "EASTASIANWIDTH", + "EGYP", + "EGYPTIANHIEROGLYPHS", + "ELBA", + "ELBASAN", + "EMOTICONS", + "EN", + "ENC", + "ENCLOSEDALPHANUM", + "ENCLOSEDALPHANUMERICS", + "ENCLOSEDALPHANUMERICSUPPLEMENT", + "ENCLOSEDALPHANUMSUP", + "ENCLOSEDCJK", + "ENCLOSEDCJKLETTERSANDMONTHS", + "ENCLOSEDIDEOGRAPHICSUP", + "ENCLOSEDIDEOGRAPHICSUPPLEMENT", + "ENCLOSINGMARK", + "ES", + "ET", + "ETHI", + "ETHIOPIC", + "ETHIOPICEXT", + "ETHIOPICEXTA", + "ETHIOPICEXTENDED", + "ETHIOPICEXTENDEDA", + "ETHIOPICSUP", + "ETHIOPICSUPPLEMENT", + "EUROPEANNUMBER", + "EUROPEANSEPARATOR", + "EUROPEANTERMINATOR", + "EX", + "EXCLAMATION", + "EXT", + "EXTEND", + "EXTENDER", + "EXTENDNUMLET", + "F", + "FALSE", + "FARSIYEH", + "FE", + "FEH", + "FIN", + "FINAL", + "FINALPUNCTUATION", + "FINALSEMKATH", + "FIRSTSTRONGISOLATE", + "FO", + "FONT", + "FORMAT", + "FRA", + "FRACTION", + "FSI", + "FULLWIDTH", + "GAF", + "GAMAL", + "GC", + "GCB", + "GEMINATIONMARK", + "GENERALCATEGORY", + "GENERALPUNCTUATION", + "GEOMETRICSHAPES", + "GEOMETRICSHAPESEXT", + "GEOMETRICSHAPESEXTENDED", + "GEOR", + "GEORGIAN", + "GEORGIANSUP", + "GEORGIANSUPPLEMENT", + "GL", + "GLAG", + "GLAGOLITIC", + "GLUE", + "GOTH", + "GOTHIC", + 
"GRAN", + "GRANTHA", + "GRAPH", + "GRAPHEMEBASE", + "GRAPHEMECLUSTERBREAK", + "GRAPHEMEEXTEND", + "GRAPHEMELINK", + "GRBASE", + "GREEK", + "GREEKANDCOPTIC", + "GREEKEXT", + "GREEKEXTENDED", + "GREK", + "GREXT", + "GRLINK", + "GUJARATI", + "GUJR", + "GURMUKHI", + "GURU", + "H", + "H2", + "H3", + "HAH", + "HALFANDFULLFORMS", + "HALFMARKS", + "HALFWIDTH", + "HALFWIDTHANDFULLWIDTHFORMS", + "HAMZAONHEHGOAL", + "HAN", + "HANG", + "HANGUL", + "HANGULCOMPATIBILITYJAMO", + "HANGULJAMO", + "HANGULJAMOEXTENDEDA", + "HANGULJAMOEXTENDEDB", + "HANGULSYLLABLES", + "HANGULSYLLABLETYPE", + "HANI", + "HANO", + "HANUNOO", + "HATR", + "HATRAN", + "HE", + "HEBR", + "HEBREW", + "HEBREWLETTER", + "HEH", + "HEHGOAL", + "HETH", + "HEX", + "HEXDIGIT", + "HIGHPRIVATEUSESURROGATES", + "HIGHPUSURROGATES", + "HIGHSURROGATES", + "HIRA", + "HIRAGANA", + "HL", + "HLUW", + "HMNG", + "HRKT", + "HST", + "HUNG", + "HY", + "HYPHEN", + "ID", + "IDC", + "IDCONTINUE", + "IDEO", + "IDEOGRAPHIC", + "IDEOGRAPHICDESCRIPTIONCHARACTERS", + "IDS", + "IDSB", + "IDSBINARYOPERATOR", + "IDST", + "IDSTART", + "IDSTRINARYOPERATOR", + "IMPERIALARAMAIC", + "IN", + "INDICNUMBERFORMS", + "INDICPOSITIONALCATEGORY", + "INDICSYLLABICCATEGORY", + "INFIXNUMERIC", + "INHERITED", + "INIT", + "INITIAL", + "INITIALPUNCTUATION", + "INPC", + "INSC", + "INSCRIPTIONALPAHLAVI", + "INSCRIPTIONALPARTHIAN", + "INSEPARABLE", + "INSEPERABLE", + "INVISIBLESTACKER", + "IOTASUBSCRIPT", + "IPAEXT", + "IPAEXTENSIONS", + "IS", + "ISO", + "ISOLATED", + "ITAL", + "JAMO", + "JAMOEXTA", + "JAMOEXTB", + "JAVA", + "JAVANESE", + "JG", + "JL", + "JOINC", + "JOINCAUSING", + "JOINCONTROL", + "JOINER", + "JOININGGROUP", + "JOININGTYPE", + "JT", + "JV", + "KA", + "KAF", + "KAITHI", + "KALI", + "KANA", + "KANASUP", + "KANASUPPLEMENT", + "KANAVOICING", + "KANBUN", + "KANGXI", + "KANGXIRADICALS", + "KANNADA", + "KAPH", + "KATAKANA", + "KATAKANAEXT", + "KATAKANAORHIRAGANA", + "KATAKANAPHONETICEXTENSIONS", + "KAYAHLI", + "KHAPH", + "KHAR", + "KHAROSHTHI", + 
"KHMER", + "KHMERSYMBOLS", + "KHMR", + "KHOJ", + "KHOJKI", + "KHUDAWADI", + "KNDA", + "KNOTTEDHEH", + "KTHI", + "KV", + "L", + "L&", + "LAM", + "LAMADH", + "LANA", + "LAO", + "LAOO", + "LATIN", + "LATIN1", + "LATIN1SUP", + "LATIN1SUPPLEMENT", + "LATINEXTA", + "LATINEXTADDITIONAL", + "LATINEXTB", + "LATINEXTC", + "LATINEXTD", + "LATINEXTE", + "LATINEXTENDEDA", + "LATINEXTENDEDADDITIONAL", + "LATINEXTENDEDB", + "LATINEXTENDEDC", + "LATINEXTENDEDD", + "LATINEXTENDEDE", + "LATN", + "LB", + "LC", + "LE", + "LEADINGJAMO", + "LEFT", + "LEFTANDRIGHT", + "LEFTJOINING", + "LEFTTORIGHT", + "LEFTTORIGHTEMBEDDING", + "LEFTTORIGHTISOLATE", + "LEFTTORIGHTOVERRIDE", + "LEPC", + "LEPCHA", + "LETTER", + "LETTERLIKESYMBOLS", + "LETTERNUMBER", + "LF", + "LIMB", + "LIMBU", + "LINA", + "LINB", + "LINEARA", + "LINEARB", + "LINEARBIDEOGRAMS", + "LINEARBSYLLABARY", + "LINEBREAK", + "LINEFEED", + "LINESEPARATOR", + "LISU", + "LL", + "LM", + "LO", + "LOE", + "LOGICALORDEREXCEPTION", + "LOWER", + "LOWERCASE", + "LOWERCASELETTER", + "LOWSURROGATES", + "LRE", + "LRI", + "LRO", + "LT", + "LU", + "LV", + "LVSYLLABLE", + "LVT", + "LVTSYLLABLE", + "LYCI", + "LYCIAN", + "LYDI", + "LYDIAN", + "M", + "M&", + "MAHAJANI", + "MAHJ", + "MAHJONG", + "MAHJONGTILES", + "MALAYALAM", + "MAND", + "MANDAIC", + "MANDATORYBREAK", + "MANI", + "MANICHAEAN", + "MANICHAEANALEPH", + "MANICHAEANAYIN", + "MANICHAEANBETH", + "MANICHAEANDALETH", + "MANICHAEANDHAMEDH", + "MANICHAEANFIVE", + "MANICHAEANGIMEL", + "MANICHAEANHETH", + "MANICHAEANHUNDRED", + "MANICHAEANKAPH", + "MANICHAEANLAMEDH", + "MANICHAEANMEM", + "MANICHAEANNUN", + "MANICHAEANONE", + "MANICHAEANPE", + "MANICHAEANQOPH", + "MANICHAEANRESH", + "MANICHAEANSADHE", + "MANICHAEANSAMEKH", + "MANICHAEANTAW", + "MANICHAEANTEN", + "MANICHAEANTETH", + "MANICHAEANTHAMEDH", + "MANICHAEANTWENTY", + "MANICHAEANWAW", + "MANICHAEANYODH", + "MANICHAEANZAYIN", + "MARK", + "MATH", + "MATHALPHANUM", + "MATHEMATICALALPHANUMERICSYMBOLS", + "MATHEMATICALOPERATORS", + 
"MATHOPERATORS", + "MATHSYMBOL", + "MB", + "MC", + "ME", + "MED", + "MEDIAL", + "MEEM", + "MEETEIMAYEK", + "MEETEIMAYEKEXT", + "MEETEIMAYEKEXTENSIONS", + "MEND", + "MENDEKIKAKUI", + "MERC", + "MERO", + "MEROITICCURSIVE", + "MEROITICHIEROGLYPHS", + "MIAO", + "MIDLETTER", + "MIDNUM", + "MIDNUMLET", + "MIM", + "MISCARROWS", + "MISCELLANEOUSMATHEMATICALSYMBOLSA", + "MISCELLANEOUSMATHEMATICALSYMBOLSB", + "MISCELLANEOUSSYMBOLS", + "MISCELLANEOUSSYMBOLSANDARROWS", + "MISCELLANEOUSSYMBOLSANDPICTOGRAPHS", + "MISCELLANEOUSTECHNICAL", + "MISCMATHSYMBOLSA", + "MISCMATHSYMBOLSB", + "MISCPICTOGRAPHS", + "MISCSYMBOLS", + "MISCTECHNICAL", + "ML", + "MLYM", + "MN", + "MODI", + "MODIFIERLETTER", + "MODIFIERLETTERS", + "MODIFIERSYMBOL", + "MODIFIERTONELETTERS", + "MODIFYINGLETTER", + "MONG", + "MONGOLIAN", + "MRO", + "MROO", + "MTEI", + "MULT", + "MULTANI", + "MUSIC", + "MUSICALSYMBOLS", + "MYANMAR", + "MYANMAREXTA", + "MYANMAREXTB", + "MYANMAREXTENDEDA", + "MYANMAREXTENDEDB", + "MYMR", + "N", + "N&", + "NA", + "NABATAEAN", + "NAN", + "NAR", + "NARB", + "NARROW", + "NB", + "NBAT", + "NCHAR", + "ND", + "NEUTRAL", + "NEWLINE", + "NEWTAILUE", + "NEXTLINE", + "NK", + "NKO", + "NKOO", + "NL", + "NO", + "NOBLOCK", + "NOBREAK", + "NOJOININGGROUP", + "NONCHARACTERCODEPOINT", + "NONE", + "NONJOINER", + "NONJOINING", + "NONSPACINGMARK", + "NONSTARTER", + "NOON", + "NOTAPPLICABLE", + "NOTREORDERED", + "NR", + "NS", + "NSM", + "NT", + "NU", + "NUKTA", + "NUMBER", + "NUMBERFORMS", + "NUMBERJOINER", + "NUMERIC", + "NUMERICTYPE", + "NUMERICVALUE", + "NUN", + "NV", + "NYA", + "OALPHA", + "OCR", + "ODI", + "OGAM", + "OGHAM", + "OGREXT", + "OIDC", + "OIDS", + "OLCHIKI", + "OLCK", + "OLDHUNGARIAN", + "OLDITALIC", + "OLDNORTHARABIAN", + "OLDPERMIC", + "OLDPERSIAN", + "OLDSOUTHARABIAN", + "OLDTURKIC", + "OLETTER", + "OLOWER", + "OMATH", + "ON", + "OP", + "OPENPUNCTUATION", + "OPTICALCHARACTERRECOGNITION", + "ORIYA", + "ORKH", + "ORNAMENTALDINGBATS", + "ORYA", + "OSMA", + "OSMANYA", + "OTHER", + 
"OTHERALPHABETIC", + "OTHERDEFAULTIGNORABLECODEPOINT", + "OTHERGRAPHEMEEXTEND", + "OTHERIDCONTINUE", + "OTHERIDSTART", + "OTHERLETTER", + "OTHERLOWERCASE", + "OTHERMATH", + "OTHERNEUTRAL", + "OTHERNUMBER", + "OTHERPUNCTUATION", + "OTHERSYMBOL", + "OTHERUPPERCASE", + "OUPPER", + "OV", + "OVERLAY", + "OVERSTRUCK", + "P", + "P&", + "PAHAWHHMONG", + "PALM", + "PALMYRENE", + "PARAGRAPHSEPARATOR", + "PATSYN", + "PATTERNSYNTAX", + "PATTERNWHITESPACE", + "PATWS", + "PAUC", + "PAUCINHAU", + "PC", + "PD", + "PDF", + "PDI", + "PE", + "PERM", + "PF", + "PHAG", + "PHAGSPA", + "PHAISTOS", + "PHAISTOSDISC", + "PHLI", + "PHLP", + "PHNX", + "PHOENICIAN", + "PHONETICEXT", + "PHONETICEXTENSIONS", + "PHONETICEXTENSIONSSUPPLEMENT", + "PHONETICEXTSUP", + "PI", + "PLAYINGCARDS", + "PLRD", + "PO", + "POPDIRECTIONALFORMAT", + "POPDIRECTIONALISOLATE", + "POSIXALNUM", + "POSIXDIGIT", + "POSIXPUNCT", + "POSIXXDIGIT", + "POSTFIXNUMERIC", + "PP", + "PR", + "PREFIXNUMERIC", + "PREPEND", + "PRINT", + "PRIVATEUSE", + "PRIVATEUSEAREA", + "PRTI", + "PS", + "PSALTERPAHLAVI", + "PUA", + "PUNCT", + "PUNCTUATION", + "PUREKILLER", + "QAAC", + "QAAI", + "QAF", + "QAPH", + "QMARK", + "QU", + "QUOTATION", + "QUOTATIONMARK", + "R", + "RADICAL", + "REGIONALINDICATOR", + "REGISTERSHIFTER", + "REH", + "REJANG", + "REVERSEDPE", + "RI", + "RIGHT", + "RIGHTJOINING", + "RIGHTTOLEFT", + "RIGHTTOLEFTEMBEDDING", + "RIGHTTOLEFTISOLATE", + "RIGHTTOLEFTOVERRIDE", + "RJNG", + "RLE", + "RLI", + "RLO", + "ROHINGYAYEH", + "RUMI", + "RUMINUMERALSYMBOLS", + "RUNIC", + "RUNR", + "S", + "S&", + "SA", + "SAD", + "SADHE", + "SAMARITAN", + "SAMR", + "SARB", + "SAUR", + "SAURASHTRA", + "SB", + "SC", + "SCONTINUE", + "SCRIPT", + "SD", + "SE", + "SEEN", + "SEGMENTSEPARATOR", + "SEMKATH", + "SENTENCEBREAK", + "SEP", + "SEPARATOR", + "SG", + "SGNW", + "SHARADA", + "SHAVIAN", + "SHAW", + "SHIN", + "SHORTHANDFORMATCONTROLS", + "SHRD", + "SIDD", + "SIDDHAM", + "SIGNWRITING", + "SIND", + "SINGLEQUOTE", + "SINH", + "SINHALA", + 
"SINHALAARCHAICNUMBERS", + "SK", + "SM", + "SMALL", + "SMALLFORMS", + "SMALLFORMVARIANTS", + "SML", + "SO", + "SOFTDOTTED", + "SORA", + "SORASOMPENG", + "SP", + "SPACE", + "SPACESEPARATOR", + "SPACINGMARK", + "SPACINGMODIFIERLETTERS", + "SPECIALS", + "SQ", + "SQR", + "SQUARE", + "ST", + "STERM", + "STRAIGHTWAW", + "SUB", + "SUND", + "SUNDANESE", + "SUNDANESESUP", + "SUNDANESESUPPLEMENT", + "SUP", + "SUPARROWSA", + "SUPARROWSB", + "SUPARROWSC", + "SUPER", + "SUPERANDSUB", + "SUPERSCRIPTSANDSUBSCRIPTS", + "SUPMATHOPERATORS", + "SUPPLEMENTALARROWSA", + "SUPPLEMENTALARROWSB", + "SUPPLEMENTALARROWSC", + "SUPPLEMENTALMATHEMATICALOPERATORS", + "SUPPLEMENTALPUNCTUATION", + "SUPPLEMENTALSYMBOLSANDPICTOGRAPHS", + "SUPPLEMENTARYPRIVATEUSEAREAA", + "SUPPLEMENTARYPRIVATEUSEAREAB", + "SUPPUAA", + "SUPPUAB", + "SUPPUNCTUATION", + "SUPSYMBOLSANDPICTOGRAPHS", + "SURROGATE", + "SUTTONSIGNWRITING", + "SWASHKAF", + "SY", + "SYLLABLEMODIFIER", + "SYLO", + "SYLOTINAGRI", + "SYMBOL", + "SYRC", + "SYRIAC", + "SYRIACWAW", + "T", + "TAGALOG", + "TAGB", + "TAGBANWA", + "TAGS", + "TAH", + "TAILE", + "TAITHAM", + "TAIVIET", + "TAIXUANJING", + "TAIXUANJINGSYMBOLS", + "TAKR", + "TAKRI", + "TALE", + "TALU", + "TAMIL", + "TAML", + "TAVT", + "TAW", + "TEHMARBUTA", + "TEHMARBUTAGOAL", + "TELU", + "TELUGU", + "TERM", + "TERMINALPUNCTUATION", + "TETH", + "TFNG", + "TGLG", + "THAA", + "THAANA", + "THAI", + "TIBETAN", + "TIBT", + "TIFINAGH", + "TIRH", + "TIRHUTA", + "TITLECASELETTER", + "TONELETTER", + "TONEMARK", + "TOP", + "TOPANDBOTTOM", + "TOPANDBOTTOMANDRIGHT", + "TOPANDLEFT", + "TOPANDLEFTANDRIGHT", + "TOPANDRIGHT", + "TRAILINGJAMO", + "TRANSPARENT", + "TRANSPORTANDMAP", + "TRANSPORTANDMAPSYMBOLS", + "TRUE", + "U", + "UCAS", + "UCASEXT", + "UGAR", + "UGARITIC", + "UIDEO", + "UNASSIGNED", + "UNIFIEDCANADIANABORIGINALSYLLABICS", + "UNIFIEDCANADIANABORIGINALSYLLABICSEXTENDED", + "UNIFIEDIDEOGRAPH", + "UNKNOWN", + "UP", + "UPPER", + "UPPERCASE", + "UPPERCASELETTER", + "V", + "VAI", + "VAII", + 
"VARIATIONSELECTOR", + "VARIATIONSELECTORS", + "VARIATIONSELECTORSSUPPLEMENT", + "VEDICEXT", + "VEDICEXTENSIONS", + "VERT", + "VERTICAL", + "VERTICALFORMS", + "VIRAMA", + "VISARGA", + "VISUALORDERLEFT", + "VOWEL", + "VOWELDEPENDENT", + "VOWELINDEPENDENT", + "VOWELJAMO", + "VR", + "VS", + "VSSUP", + "W", + "WARA", + "WARANGCITI", + "WAW", + "WB", + "WHITESPACE", + "WIDE", + "WJ", + "WORD", + "WORDBREAK", + "WORDJOINER", + "WS", + "WSPACE", + "XDIGIT", + "XIDC", + "XIDCONTINUE", + "XIDS", + "XIDSTART", + "XPEO", + "XSUX", + "XX", + "Y", + "YEH", + "YEHBARREE", + "YEHWITHTAIL", + "YES", + "YI", + "YIII", + "YIJING", + "YIJINGHEXAGRAMSYMBOLS", + "YIRADICALS", + "YISYLLABLES", + "YUDH", + "YUDHHE", + "Z", + "Z&", + "ZAIN", + "ZHAIN", + "ZINH", + "ZL", + "ZP", + "ZS", + "ZW", + "ZWSPACE", + "ZYYY", + "ZZZZ", +}; + +/* strings: 12240 bytes. */ + +/* properties. */ + +RE_Property re_properties[] = { + { 547, 0, 0}, + { 544, 0, 0}, + { 252, 1, 1}, + { 251, 1, 1}, + {1081, 2, 2}, + {1079, 2, 2}, + {1259, 3, 3}, + {1254, 3, 3}, + { 566, 4, 4}, + { 545, 4, 4}, + {1087, 5, 5}, + {1078, 5, 5}, + { 823, 6, 6}, + { 172, 7, 6}, + { 171, 7, 6}, + { 767, 8, 6}, + { 766, 8, 6}, + {1227, 9, 6}, + {1226, 9, 6}, + { 294, 10, 6}, + { 296, 11, 6}, + { 350, 11, 6}, + { 343, 12, 6}, + { 433, 12, 6}, + { 345, 13, 6}, + { 435, 13, 6}, + { 344, 14, 6}, + { 434, 14, 6}, + { 341, 15, 6}, + { 431, 15, 6}, + { 342, 16, 6}, + { 432, 16, 6}, + { 636, 17, 6}, + { 632, 17, 6}, + { 628, 18, 6}, + { 627, 18, 6}, + {1267, 19, 6}, + {1266, 19, 6}, + {1265, 20, 6}, + {1264, 20, 6}, + { 458, 21, 6}, + { 466, 21, 6}, + { 567, 22, 6}, + { 575, 22, 6}, + { 565, 23, 6}, + { 569, 23, 6}, + { 568, 24, 6}, + { 576, 24, 6}, + {1255, 25, 6}, + {1262, 25, 6}, + {1117, 25, 6}, + { 244, 26, 6}, + { 242, 26, 6}, + { 671, 27, 6}, + { 669, 27, 6}, + { 451, 28, 6}, + { 625, 29, 6}, + {1044, 30, 6}, + {1041, 30, 6}, + {1188, 31, 6}, + {1187, 31, 6}, + { 971, 32, 6}, + { 952, 32, 6}, + { 612, 33, 6}, + { 611, 33, 6}, + { 204, 
34, 6}, + { 160, 34, 6}, + { 964, 35, 6}, + { 933, 35, 6}, + { 630, 36, 6}, + { 629, 36, 6}, + { 468, 37, 6}, + { 467, 37, 6}, + { 523, 38, 6}, + { 521, 38, 6}, + { 970, 39, 6}, + { 951, 39, 6}, + { 976, 40, 6}, + { 977, 40, 6}, + { 909, 41, 6}, + { 895, 41, 6}, + { 966, 42, 6}, + { 938, 42, 6}, + { 634, 43, 6}, + { 633, 43, 6}, + { 637, 44, 6}, + { 635, 44, 6}, + {1046, 45, 6}, + {1223, 46, 6}, + {1219, 46, 6}, + { 965, 47, 6}, + { 935, 47, 6}, + { 460, 48, 6}, + { 459, 48, 6}, + {1113, 49, 6}, + {1082, 49, 6}, + { 765, 50, 6}, + { 764, 50, 6}, + { 968, 51, 6}, + { 940, 51, 6}, + { 967, 52, 6}, + { 939, 52, 6}, + {1126, 53, 6}, + {1232, 54, 6}, + {1248, 54, 6}, + { 989, 55, 6}, + { 990, 55, 6}, + { 988, 56, 6}, + { 987, 56, 6}, + { 598, 57, 7}, + { 622, 57, 7}, + { 243, 58, 8}, + { 234, 58, 8}, + { 288, 59, 9}, + { 300, 59, 9}, + { 457, 60, 10}, + { 482, 60, 10}, + { 489, 61, 11}, + { 487, 61, 11}, + { 673, 62, 12}, + { 667, 62, 12}, + { 674, 63, 13}, + { 675, 63, 13}, + { 757, 64, 14}, + { 732, 64, 14}, + { 928, 65, 15}, + { 921, 65, 15}, + { 929, 66, 16}, + { 931, 66, 16}, + { 246, 67, 6}, + { 245, 67, 6}, + { 641, 68, 17}, + { 648, 68, 17}, + { 642, 69, 18}, + { 649, 69, 18}, + { 175, 70, 6}, + { 170, 70, 6}, + { 183, 71, 6}, + { 250, 72, 6}, + { 564, 73, 6}, + {1027, 74, 6}, + {1258, 75, 6}, + {1263, 76, 6}, + {1019, 77, 6}, + {1018, 78, 6}, + {1020, 79, 6}, + {1021, 80, 6}, +}; + +/* properties: 588 bytes. */ + +/* property values. 
*/ + +RE_PropertyValue re_property_values[] = { + {1220, 0, 0}, + { 383, 0, 0}, + {1228, 0, 1}, + { 774, 0, 1}, + { 768, 0, 2}, + { 761, 0, 2}, + {1200, 0, 3}, + { 773, 0, 3}, + { 865, 0, 4}, + { 762, 0, 4}, + { 969, 0, 5}, + { 763, 0, 5}, + { 913, 0, 6}, + { 863, 0, 6}, + { 505, 0, 7}, + { 831, 0, 7}, + {1119, 0, 8}, + { 830, 0, 8}, + { 456, 0, 9}, + { 896, 0, 9}, + { 473, 0, 9}, + { 747, 0, 10}, + { 904, 0, 10}, + { 973, 0, 11}, + { 905, 0, 11}, + {1118, 0, 12}, + {1291, 0, 12}, + { 759, 0, 13}, + {1289, 0, 13}, + { 986, 0, 14}, + {1290, 0, 14}, + { 415, 0, 15}, + { 299, 0, 15}, + { 384, 0, 15}, + { 537, 0, 16}, + { 338, 0, 16}, + {1028, 0, 17}, + { 385, 0, 17}, + {1153, 0, 18}, + { 425, 0, 18}, + { 452, 0, 19}, + { 994, 0, 19}, + { 955, 0, 20}, + {1031, 0, 20}, + { 381, 0, 21}, + { 997, 0, 21}, + { 401, 0, 22}, + { 993, 0, 22}, + { 974, 0, 23}, + {1015, 0, 23}, + { 828, 0, 24}, + {1107, 0, 24}, + { 429, 0, 25}, + {1079, 0, 25}, + { 867, 0, 26}, + {1106, 0, 26}, + { 975, 0, 27}, + {1112, 0, 27}, + { 647, 0, 28}, + {1012, 0, 28}, + { 532, 0, 29}, + { 999, 0, 29}, + { 963, 0, 30}, + { 281, 0, 30}, + { 282, 0, 30}, + { 745, 0, 31}, + { 708, 0, 31}, + { 709, 0, 31}, + { 822, 0, 32}, + { 783, 0, 32}, + { 392, 0, 32}, + { 784, 0, 32}, + { 924, 0, 33}, + { 885, 0, 33}, + { 886, 0, 33}, + {1035, 0, 34}, + { 981, 0, 34}, + {1034, 0, 34}, + { 982, 0, 34}, + {1160, 0, 35}, + {1068, 0, 35}, + {1069, 0, 35}, + {1089, 0, 36}, + {1284, 0, 36}, + {1285, 0, 36}, + { 295, 0, 37}, + { 733, 0, 37}, + { 205, 0, 38}, + { 906, 1, 0}, + { 893, 1, 0}, + { 228, 1, 1}, + { 203, 1, 1}, + { 718, 1, 2}, + { 717, 1, 2}, + { 716, 1, 2}, + { 725, 1, 3}, + { 719, 1, 3}, + { 727, 1, 4}, + { 721, 1, 4}, + { 657, 1, 5}, + { 656, 1, 5}, + {1120, 1, 6}, + { 866, 1, 6}, + { 387, 1, 7}, + { 469, 1, 7}, + { 571, 1, 8}, + { 570, 1, 8}, + { 438, 1, 9}, + { 444, 1, 10}, + { 443, 1, 10}, + { 445, 1, 10}, + { 199, 1, 11}, + { 606, 1, 12}, + { 186, 1, 13}, + {1162, 1, 14}, + { 198, 1, 15}, + { 197, 1, 15}, + 
{1193, 1, 16}, + { 902, 1, 17}, + {1073, 1, 18}, + { 791, 1, 19}, + { 188, 1, 20}, + { 187, 1, 20}, + { 463, 1, 21}, + { 240, 1, 22}, + { 579, 1, 23}, + { 577, 1, 24}, + { 957, 1, 25}, + {1179, 1, 26}, + {1186, 1, 27}, + { 688, 1, 28}, + { 789, 1, 29}, + {1104, 1, 30}, + {1194, 1, 31}, + { 713, 1, 32}, + {1195, 1, 33}, + { 879, 1, 34}, + { 553, 1, 35}, + { 594, 1, 36}, + { 662, 1, 36}, + { 509, 1, 37}, + { 515, 1, 38}, + { 514, 1, 38}, + { 347, 1, 39}, + {1221, 1, 40}, + {1215, 1, 40}, + { 286, 1, 40}, + { 937, 1, 41}, + {1066, 1, 42}, + {1165, 1, 43}, + { 601, 1, 44}, + { 277, 1, 45}, + {1167, 1, 46}, + { 698, 1, 47}, + { 871, 1, 48}, + {1222, 1, 49}, + {1216, 1, 49}, + { 750, 1, 50}, + {1170, 1, 51}, + { 899, 1, 52}, + { 699, 1, 53}, + { 275, 1, 54}, + {1171, 1, 55}, + { 388, 1, 56}, + { 470, 1, 56}, + { 223, 1, 57}, + {1130, 1, 58}, + { 231, 1, 59}, + { 744, 1, 60}, + { 941, 1, 61}, + {1132, 1, 62}, + {1131, 1, 62}, + {1236, 1, 63}, + {1235, 1, 63}, + {1009, 1, 64}, + {1008, 1, 64}, + {1010, 1, 65}, + {1011, 1, 65}, + { 390, 1, 66}, + { 472, 1, 66}, + { 726, 1, 67}, + { 720, 1, 67}, + { 573, 1, 68}, + { 572, 1, 68}, + { 548, 1, 69}, + {1035, 1, 69}, + {1139, 1, 70}, + {1138, 1, 70}, + { 430, 1, 71}, + { 389, 1, 72}, + { 471, 1, 72}, + { 393, 1, 72}, + { 746, 1, 73}, + { 925, 1, 74}, + { 202, 1, 75}, + { 826, 1, 76}, + { 827, 1, 76}, + { 855, 1, 77}, + { 860, 1, 77}, + { 416, 1, 78}, + { 956, 1, 79}, + { 934, 1, 79}, + { 498, 1, 80}, + { 497, 1, 80}, + { 262, 1, 81}, + { 253, 1, 82}, + { 549, 1, 83}, + { 852, 1, 84}, + { 859, 1, 84}, + { 474, 1, 85}, + { 850, 1, 86}, + { 856, 1, 86}, + {1141, 1, 87}, + {1134, 1, 87}, + { 269, 1, 88}, + { 268, 1, 88}, + {1142, 1, 89}, + {1135, 1, 89}, + { 851, 1, 90}, + { 857, 1, 90}, + {1144, 1, 91}, + {1140, 1, 91}, + { 853, 1, 92}, + { 849, 1, 92}, + { 558, 1, 93}, + { 728, 1, 94}, + { 722, 1, 94}, + { 418, 1, 95}, + { 555, 1, 96}, + { 554, 1, 96}, + {1197, 1, 97}, + { 512, 1, 98}, + { 510, 1, 98}, + { 441, 1, 99}, + { 439, 1, 
99}, + {1145, 1, 100}, + {1151, 1, 100}, + { 368, 1, 101}, + { 367, 1, 101}, + { 687, 1, 102}, + { 686, 1, 102}, + { 631, 1, 103}, + { 627, 1, 103}, + { 371, 1, 104}, + { 370, 1, 104}, + { 617, 1, 105}, + { 690, 1, 106}, + { 256, 1, 107}, + { 593, 1, 108}, + { 398, 1, 108}, + { 685, 1, 109}, + { 258, 1, 110}, + { 257, 1, 110}, + { 369, 1, 111}, + { 693, 1, 112}, + { 691, 1, 112}, + { 502, 1, 113}, + { 501, 1, 113}, + { 356, 1, 114}, + { 354, 1, 114}, + { 373, 1, 115}, + { 362, 1, 115}, + {1279, 1, 116}, + {1278, 1, 116}, + { 372, 1, 117}, + { 353, 1, 117}, + {1281, 1, 118}, + {1280, 1, 119}, + { 760, 1, 120}, + {1230, 1, 121}, + { 442, 1, 122}, + { 440, 1, 122}, + { 225, 1, 123}, + { 868, 1, 124}, + { 729, 1, 125}, + { 723, 1, 125}, + {1159, 1, 126}, + { 395, 1, 127}, + { 640, 1, 127}, + {1001, 1, 128}, + {1077, 1, 129}, + { 465, 1, 130}, + { 464, 1, 130}, + { 694, 1, 131}, + {1050, 1, 132}, + { 595, 1, 133}, + { 663, 1, 133}, + { 666, 1, 134}, + { 883, 1, 135}, + { 881, 1, 135}, + { 340, 1, 136}, + { 882, 1, 137}, + { 880, 1, 137}, + {1172, 1, 138}, + { 837, 1, 139}, + { 836, 1, 139}, + { 513, 1, 140}, + { 511, 1, 140}, + { 730, 1, 141}, + { 724, 1, 141}, + { 349, 1, 142}, + { 348, 1, 142}, + { 835, 1, 143}, + { 597, 1, 144}, + { 592, 1, 144}, + { 596, 1, 145}, + { 664, 1, 145}, + { 615, 1, 146}, + { 613, 1, 147}, + { 614, 1, 147}, + { 769, 1, 148}, + {1029, 1, 149}, + {1033, 1, 149}, + {1028, 1, 149}, + { 358, 1, 150}, + { 360, 1, 150}, + { 174, 1, 151}, + { 173, 1, 151}, + { 195, 1, 152}, + { 193, 1, 152}, + {1233, 1, 153}, + {1248, 1, 153}, + {1239, 1, 154}, + { 391, 1, 155}, + { 586, 1, 155}, + { 357, 1, 156}, + { 355, 1, 156}, + {1110, 1, 157}, + {1109, 1, 157}, + { 196, 1, 158}, + { 194, 1, 158}, + { 588, 1, 159}, + { 585, 1, 159}, + {1121, 1, 160}, + { 756, 1, 161}, + { 755, 1, 162}, + { 158, 1, 163}, + { 181, 1, 164}, + { 182, 1, 165}, + {1003, 1, 166}, + {1002, 1, 166}, + { 780, 1, 167}, + { 292, 1, 168}, + { 419, 1, 169}, + { 944, 1, 170}, + { 561, 1, 
171}, + { 946, 1, 172}, + {1218, 1, 173}, + { 947, 1, 174}, + { 461, 1, 175}, + {1093, 1, 176}, + { 962, 1, 177}, + { 493, 1, 178}, + { 297, 1, 179}, + { 753, 1, 180}, + { 437, 1, 181}, + { 638, 1, 182}, + { 985, 1, 183}, + { 888, 1, 184}, + { 603, 1, 185}, + {1007, 1, 186}, + { 782, 1, 187}, + { 843, 1, 188}, + { 842, 1, 189}, + { 697, 1, 190}, + { 948, 1, 191}, + { 945, 1, 192}, + { 794, 1, 193}, + { 217, 1, 194}, + { 651, 1, 195}, + { 650, 1, 196}, + {1032, 1, 197}, + { 949, 1, 198}, + { 943, 1, 199}, + {1065, 1, 200}, + {1064, 1, 200}, + { 265, 1, 201}, + { 679, 1, 202}, + {1115, 1, 203}, + { 339, 1, 204}, + { 785, 1, 205}, + {1092, 1, 206}, + {1105, 1, 207}, + { 702, 1, 208}, + { 876, 1, 209}, + { 703, 1, 210}, + { 563, 1, 211}, + {1199, 1, 212}, + {1099, 1, 213}, + { 864, 1, 214}, + {1176, 1, 215}, + { 161, 1, 216}, + {1252, 1, 217}, + { 992, 1, 218}, + { 426, 1, 219}, + { 428, 1, 220}, + { 427, 1, 220}, + { 488, 1, 221}, + { 491, 1, 222}, + { 178, 1, 223}, + { 227, 1, 224}, + { 226, 1, 224}, + { 872, 1, 225}, + { 230, 1, 226}, + { 983, 1, 227}, + { 844, 1, 228}, + { 683, 1, 229}, + { 682, 1, 229}, + { 485, 1, 230}, + {1096, 1, 231}, + { 280, 1, 232}, + { 279, 1, 232}, + { 878, 1, 233}, + { 877, 1, 233}, + { 180, 1, 234}, + { 179, 1, 234}, + {1174, 1, 235}, + {1173, 1, 235}, + { 421, 1, 236}, + { 420, 1, 236}, + { 825, 1, 237}, + { 824, 1, 237}, + {1154, 1, 238}, + { 839, 1, 239}, + { 191, 1, 240}, + { 190, 1, 240}, + { 788, 1, 241}, + { 787, 1, 241}, + { 476, 1, 242}, + { 475, 1, 242}, + {1013, 1, 243}, + { 499, 1, 244}, + { 500, 1, 244}, + { 504, 1, 245}, + { 503, 1, 245}, + { 854, 1, 246}, + { 858, 1, 246}, + { 494, 1, 247}, + { 959, 1, 248}, + {1212, 1, 249}, + {1211, 1, 249}, + { 167, 1, 250}, + { 166, 1, 250}, + { 551, 1, 251}, + { 550, 1, 251}, + {1143, 1, 252}, + {1136, 1, 252}, + {1146, 1, 253}, + {1152, 1, 253}, + { 374, 1, 254}, + { 363, 1, 254}, + { 375, 1, 255}, + { 364, 1, 255}, + { 376, 1, 256}, + { 365, 1, 256}, + { 377, 1, 257}, + { 366, 1, 
257}, + { 359, 1, 258}, + { 361, 1, 258}, + {1168, 1, 259}, + {1234, 1, 260}, + {1249, 1, 260}, + {1147, 1, 261}, + {1149, 1, 261}, + {1148, 1, 262}, + {1150, 1, 262}, + {1224, 2, 0}, + {1295, 2, 0}, + { 394, 2, 1}, + {1294, 2, 1}, + { 715, 2, 2}, + { 731, 2, 2}, + { 570, 2, 3}, + { 574, 2, 3}, + { 438, 2, 4}, + { 446, 2, 4}, + { 199, 2, 5}, + { 201, 2, 5}, + { 606, 2, 6}, + { 605, 2, 6}, + { 186, 2, 7}, + { 185, 2, 7}, + {1162, 2, 8}, + {1161, 2, 8}, + {1193, 2, 9}, + {1192, 2, 9}, + { 463, 2, 10}, + { 462, 2, 10}, + { 240, 2, 11}, + { 239, 2, 11}, + { 579, 2, 12}, + { 580, 2, 12}, + { 577, 2, 13}, + { 578, 2, 13}, + { 957, 2, 14}, + { 960, 2, 14}, + {1179, 2, 15}, + {1180, 2, 15}, + {1186, 2, 16}, + {1185, 2, 16}, + { 688, 2, 17}, + { 704, 2, 17}, + { 789, 2, 18}, + { 862, 2, 18}, + {1104, 2, 19}, + {1103, 2, 19}, + {1194, 2, 20}, + { 713, 2, 21}, + { 714, 2, 21}, + {1195, 2, 22}, + {1196, 2, 22}, + { 879, 2, 23}, + { 884, 2, 23}, + { 553, 2, 24}, + { 552, 2, 24}, + { 592, 2, 25}, + { 591, 2, 25}, + { 509, 2, 26}, + { 508, 2, 26}, + { 347, 2, 27}, + { 346, 2, 27}, + { 285, 2, 28}, + { 289, 2, 28}, + { 937, 2, 29}, + { 936, 2, 29}, + {1066, 2, 30}, + {1067, 2, 30}, + { 698, 2, 31}, + { 700, 2, 31}, + { 871, 2, 32}, + { 870, 2, 32}, + { 617, 2, 33}, + { 616, 2, 33}, + { 690, 2, 34}, + { 681, 2, 34}, + { 256, 2, 35}, + { 255, 2, 35}, + { 590, 2, 36}, + { 599, 2, 36}, + {1276, 2, 37}, + {1277, 2, 37}, + { 944, 2, 38}, + { 661, 2, 38}, + { 561, 2, 39}, + { 560, 2, 39}, + { 461, 2, 40}, + { 481, 2, 40}, + { 644, 2, 41}, + {1288, 2, 41}, + {1038, 2, 41}, + {1165, 2, 42}, + {1191, 2, 42}, + { 601, 2, 43}, + { 600, 2, 43}, + { 277, 2, 44}, + { 276, 2, 44}, + {1167, 2, 45}, + {1166, 2, 45}, + { 750, 2, 46}, + { 749, 2, 46}, + {1170, 2, 47}, + {1177, 2, 47}, + { 754, 2, 48}, + { 752, 2, 48}, + {1218, 2, 49}, + {1217, 2, 49}, + {1093, 2, 50}, + {1094, 2, 50}, + { 962, 2, 51}, + { 961, 2, 51}, + { 436, 2, 52}, + { 423, 2, 52}, + { 268, 2, 53}, + { 267, 2, 53}, + { 275, 2, 
54}, + { 274, 2, 54}, + { 418, 2, 55}, + { 417, 2, 55}, + {1037, 2, 55}, + { 899, 2, 56}, + {1178, 2, 56}, + { 558, 2, 57}, + { 557, 2, 57}, + {1197, 2, 58}, + {1190, 2, 58}, + {1159, 2, 59}, + {1158, 2, 59}, + { 947, 2, 60}, + {1268, 2, 60}, + { 697, 2, 61}, + { 696, 2, 61}, + { 223, 2, 62}, + { 222, 2, 62}, + { 426, 2, 63}, + {1269, 2, 63}, + {1007, 2, 64}, + {1006, 2, 64}, + {1001, 2, 65}, + {1000, 2, 65}, + { 902, 2, 66}, + { 903, 2, 66}, + {1130, 2, 67}, + {1129, 2, 67}, + { 744, 2, 68}, + { 743, 2, 68}, + { 941, 2, 69}, + { 942, 2, 69}, + {1230, 2, 70}, + {1231, 2, 70}, + {1077, 2, 71}, + {1076, 2, 71}, + { 694, 2, 72}, + { 680, 2, 72}, + {1050, 2, 73}, + {1059, 2, 73}, + { 780, 2, 74}, + { 779, 2, 74}, + { 292, 2, 75}, + { 291, 2, 75}, + { 782, 2, 76}, + { 781, 2, 76}, + { 340, 2, 77}, + {1171, 2, 78}, + { 712, 2, 78}, + {1172, 2, 79}, + {1181, 2, 79}, + { 217, 2, 80}, + { 218, 2, 80}, + { 491, 2, 81}, + { 490, 2, 81}, + {1073, 2, 82}, + {1074, 2, 82}, + { 760, 2, 83}, + { 225, 2, 84}, + { 224, 2, 84}, + { 666, 2, 85}, + { 665, 2, 85}, + { 835, 2, 86}, + { 874, 2, 86}, + { 638, 2, 87}, + { 200, 2, 87}, + { 948, 2, 88}, + {1075, 2, 88}, + { 651, 2, 89}, + {1030, 2, 89}, + { 650, 2, 90}, + {1004, 2, 90}, + { 949, 2, 91}, + { 958, 2, 91}, + { 679, 2, 92}, + { 706, 2, 92}, + { 231, 2, 93}, + { 232, 2, 93}, + { 265, 2, 94}, + { 264, 2, 94}, + { 791, 2, 95}, + { 790, 2, 95}, + { 339, 2, 96}, + { 283, 2, 96}, + { 842, 2, 97}, + { 840, 2, 97}, + { 843, 2, 98}, + { 841, 2, 98}, + { 844, 2, 99}, + {1014, 2, 99}, + {1092, 2, 100}, + {1097, 2, 100}, + {1115, 2, 101}, + {1114, 2, 101}, + {1176, 2, 102}, + {1175, 2, 102}, + { 297, 2, 103}, + { 159, 2, 103}, + { 230, 2, 104}, + { 229, 2, 104}, + { 485, 2, 105}, + { 484, 2, 105}, + { 493, 2, 106}, + { 492, 2, 106}, + { 563, 2, 107}, + { 562, 2, 107}, + { 983, 2, 108}, + { 620, 2, 108}, + { 702, 2, 109}, + { 701, 2, 109}, + { 753, 2, 110}, + { 751, 2, 110}, + { 785, 2, 111}, + { 786, 2, 111}, + { 794, 2, 112}, + { 793, 2, 
112}, + { 839, 2, 113}, + { 838, 2, 113}, + { 864, 2, 114}, + { 872, 2, 115}, + { 873, 2, 115}, + { 945, 2, 116}, + { 891, 2, 116}, + { 888, 2, 117}, + { 894, 2, 117}, + { 985, 2, 118}, + { 984, 2, 118}, + { 992, 2, 119}, + { 991, 2, 119}, + { 946, 2, 120}, + { 998, 2, 120}, + {1032, 2, 121}, + {1005, 2, 121}, + {1099, 2, 122}, + {1098, 2, 122}, + { 703, 2, 123}, + {1101, 2, 123}, + {1199, 2, 124}, + {1198, 2, 124}, + {1252, 2, 125}, + {1251, 2, 125}, + { 161, 2, 126}, + { 178, 2, 127}, + { 619, 2, 127}, + { 603, 2, 128}, + { 602, 2, 128}, + { 876, 2, 129}, + { 875, 2, 129}, + { 943, 2, 130}, + { 623, 2, 130}, + {1100, 2, 131}, + {1091, 2, 131}, + { 692, 2, 132}, + { 621, 2, 132}, + { 963, 3, 0}, + {1270, 3, 0}, + { 479, 3, 1}, + { 480, 3, 1}, + {1102, 3, 2}, + {1122, 3, 2}, + { 607, 3, 3}, + { 618, 3, 3}, + { 424, 3, 4}, + { 748, 3, 5}, + { 898, 3, 6}, + { 904, 3, 6}, + { 522, 3, 7}, + {1047, 3, 8}, + {1052, 3, 8}, + { 537, 3, 9}, + { 535, 3, 9}, + { 690, 3, 10}, + { 677, 3, 10}, + { 169, 3, 11}, + { 734, 3, 11}, + { 845, 3, 12}, + { 861, 3, 12}, + { 846, 3, 13}, + { 863, 3, 13}, + { 847, 3, 14}, + { 829, 3, 14}, + { 927, 3, 15}, + { 922, 3, 15}, + { 524, 3, 16}, + { 519, 3, 16}, + { 963, 4, 0}, + {1270, 4, 0}, + { 424, 4, 1}, + { 748, 4, 2}, + { 415, 4, 3}, + { 383, 4, 3}, + { 522, 4, 4}, + { 519, 4, 4}, + {1047, 4, 5}, + {1052, 4, 5}, + {1119, 4, 6}, + {1107, 4, 6}, + { 708, 4, 7}, + {1229, 4, 8}, + {1164, 4, 9}, + { 775, 4, 10}, + { 777, 4, 11}, + {1026, 4, 12}, + {1023, 4, 12}, + { 963, 5, 0}, + {1270, 5, 0}, + { 424, 5, 1}, + { 748, 5, 2}, + { 522, 5, 3}, + { 519, 5, 3}, + {1088, 5, 4}, + {1083, 5, 4}, + { 537, 5, 5}, + { 535, 5, 5}, + {1116, 5, 6}, + { 766, 5, 7}, + { 763, 5, 7}, + {1226, 5, 8}, + {1225, 5, 8}, + { 950, 5, 9}, + { 734, 5, 9}, + { 927, 5, 10}, + { 922, 5, 10}, + { 211, 5, 11}, + { 206, 5, 11}, + {1126, 5, 12}, + {1125, 5, 12}, + { 379, 5, 13}, + { 378, 5, 13}, + {1080, 5, 14}, + {1079, 5, 14}, + { 905, 6, 0}, + { 885, 6, 0}, + { 525, 6, 0}, + 
{ 526, 6, 0}, + {1275, 6, 1}, + {1271, 6, 1}, + {1164, 6, 1}, + {1213, 6, 1}, + { 916, 7, 0}, + { 887, 7, 0}, + { 735, 7, 1}, + { 708, 7, 1}, + {1246, 7, 2}, + {1229, 7, 2}, + {1209, 7, 3}, + {1164, 7, 3}, + { 776, 7, 4}, + { 775, 7, 4}, + { 778, 7, 5}, + { 777, 7, 5}, + { 739, 8, 0}, + { 708, 8, 0}, + {1055, 8, 1}, + {1045, 8, 1}, + { 516, 8, 2}, + { 495, 8, 2}, + { 517, 8, 3}, + { 506, 8, 3}, + { 518, 8, 4}, + { 507, 8, 4}, + { 192, 8, 5}, + { 177, 8, 5}, + { 396, 8, 6}, + { 425, 8, 6}, + { 986, 8, 7}, + { 219, 8, 7}, + {1085, 8, 8}, + {1068, 8, 8}, + {1255, 8, 9}, + {1261, 8, 9}, + { 972, 8, 10}, + { 953, 8, 10}, + { 261, 8, 11}, + { 254, 8, 11}, + { 913, 8, 12}, + { 920, 8, 12}, + { 189, 8, 13}, + { 164, 8, 13}, + { 742, 8, 14}, + { 772, 8, 14}, + {1058, 8, 15}, + {1062, 8, 15}, + { 740, 8, 16}, + { 770, 8, 16}, + {1056, 8, 17}, + {1060, 8, 17}, + {1016, 8, 18}, + { 995, 8, 18}, + { 741, 8, 19}, + { 771, 8, 19}, + {1057, 8, 20}, + {1061, 8, 20}, + { 534, 8, 21}, + { 540, 8, 21}, + {1017, 8, 22}, + { 996, 8, 22}, + { 917, 9, 0}, + { 1, 9, 0}, + { 918, 9, 0}, + { 979, 9, 1}, + { 2, 9, 1}, + { 978, 9, 1}, + { 923, 9, 2}, + { 130, 9, 2}, + { 901, 9, 2}, + { 684, 9, 3}, + { 139, 9, 3}, + { 707, 9, 3}, + {1240, 9, 4}, + { 146, 9, 4}, + {1247, 9, 4}, + { 301, 9, 5}, + { 14, 9, 5}, + { 304, 9, 6}, + { 25, 9, 6}, + { 306, 9, 7}, + { 29, 9, 7}, + { 309, 9, 8}, + { 32, 9, 8}, + { 313, 9, 9}, + { 37, 9, 9}, + { 314, 9, 10}, + { 38, 9, 10}, + { 315, 9, 11}, + { 40, 9, 11}, + { 316, 9, 12}, + { 41, 9, 12}, + { 317, 9, 13}, + { 43, 9, 13}, + { 318, 9, 14}, + { 44, 9, 14}, + { 319, 9, 15}, + { 48, 9, 15}, + { 320, 9, 16}, + { 54, 9, 16}, + { 321, 9, 17}, + { 59, 9, 17}, + { 322, 9, 18}, + { 65, 9, 18}, + { 323, 9, 19}, + { 70, 9, 19}, + { 324, 9, 20}, + { 72, 9, 20}, + { 325, 9, 21}, + { 73, 9, 21}, + { 326, 9, 22}, + { 74, 9, 22}, + { 327, 9, 23}, + { 75, 9, 23}, + { 328, 9, 24}, + { 76, 9, 24}, + { 329, 9, 25}, + { 83, 9, 25}, + { 330, 9, 26}, + { 88, 9, 26}, + { 331, 9, 
27}, + { 89, 9, 27}, + { 332, 9, 28}, + { 90, 9, 28}, + { 333, 9, 29}, + { 91, 9, 29}, + { 334, 9, 30}, + { 92, 9, 30}, + { 335, 9, 31}, + { 93, 9, 31}, + { 336, 9, 32}, + { 145, 9, 32}, + { 337, 9, 33}, + { 153, 9, 33}, + { 302, 9, 34}, + { 23, 9, 34}, + { 303, 9, 35}, + { 24, 9, 35}, + { 305, 9, 36}, + { 28, 9, 36}, + { 307, 9, 37}, + { 30, 9, 37}, + { 308, 9, 38}, + { 31, 9, 38}, + { 310, 9, 39}, + { 34, 9, 39}, + { 311, 9, 40}, + { 35, 9, 40}, + { 214, 9, 41}, + { 53, 9, 41}, + { 209, 9, 41}, + { 212, 9, 42}, + { 55, 9, 42}, + { 207, 9, 42}, + { 213, 9, 43}, + { 56, 9, 43}, + { 208, 9, 43}, + { 237, 9, 44}, + { 58, 9, 44}, + { 249, 9, 44}, + { 236, 9, 45}, + { 60, 9, 45}, + { 219, 9, 45}, + { 238, 9, 46}, + { 61, 9, 46}, + { 263, 9, 46}, + { 736, 9, 47}, + { 62, 9, 47}, + { 708, 9, 47}, + {1053, 9, 48}, + { 63, 9, 48}, + {1045, 9, 48}, + { 156, 9, 49}, + { 64, 9, 49}, + { 164, 9, 49}, + { 155, 9, 50}, + { 66, 9, 50}, + { 154, 9, 50}, + { 157, 9, 51}, + { 67, 9, 51}, + { 184, 9, 51}, + { 478, 9, 52}, + { 68, 9, 52}, + { 453, 9, 52}, + { 477, 9, 53}, + { 69, 9, 53}, + { 448, 9, 53}, + { 655, 9, 54}, + { 71, 9, 54}, + { 658, 9, 54}, + { 312, 9, 55}, + { 36, 9, 55}, + { 215, 9, 56}, + { 49, 9, 56}, + { 210, 9, 56}, + { 910, 10, 0}, + { 287, 10, 1}, + { 284, 10, 1}, + { 397, 10, 2}, + { 386, 10, 2}, + { 536, 10, 3}, + { 907, 10, 4}, + { 893, 10, 4}, + { 646, 10, 5}, + { 645, 10, 5}, + { 833, 10, 6}, + { 832, 10, 6}, + { 531, 10, 7}, + { 530, 10, 7}, + { 660, 10, 8}, + { 659, 10, 8}, + { 351, 10, 9}, + { 496, 10, 9}, + {1137, 10, 10}, + {1133, 10, 10}, + {1128, 10, 11}, + {1238, 10, 12}, + {1237, 10, 12}, + {1256, 10, 13}, + { 892, 10, 14}, + { 890, 10, 14}, + {1108, 10, 15}, + {1111, 10, 15}, + {1124, 10, 16}, + {1123, 10, 16}, + { 539, 10, 17}, + { 538, 10, 17}, + { 897, 11, 0}, + { 885, 11, 0}, + { 176, 11, 1}, + { 154, 11, 1}, + { 587, 11, 2}, + { 581, 11, 2}, + {1256, 11, 3}, + {1250, 11, 3}, + { 541, 11, 4}, + { 525, 11, 4}, + { 892, 11, 5}, + { 887, 11, 5}, + 
{ 908, 12, 0}, + { 163, 12, 1}, + { 165, 12, 2}, + { 168, 12, 3}, + { 235, 12, 4}, + { 241, 12, 5}, + { 449, 12, 6}, + { 450, 12, 7}, + { 486, 12, 8}, + { 529, 12, 9}, + { 533, 12, 10}, + { 542, 12, 11}, + { 543, 12, 12}, + { 584, 12, 13}, + { 589, 12, 14}, + {1184, 12, 14}, + { 604, 12, 15}, + { 608, 12, 16}, + { 609, 12, 17}, + { 610, 12, 18}, + { 678, 12, 19}, + { 689, 12, 20}, + { 705, 12, 21}, + { 710, 12, 22}, + { 711, 12, 23}, + { 834, 12, 24}, + { 848, 12, 25}, + { 915, 12, 26}, + { 930, 12, 27}, + { 997, 12, 28}, + {1039, 12, 29}, + {1040, 12, 30}, + {1049, 12, 31}, + {1051, 12, 32}, + {1071, 12, 33}, + {1072, 12, 34}, + {1084, 12, 35}, + {1086, 12, 36}, + {1095, 12, 37}, + {1155, 12, 38}, + {1169, 12, 39}, + {1182, 12, 40}, + {1183, 12, 41}, + {1189, 12, 42}, + {1253, 12, 43}, + {1163, 12, 44}, + {1272, 12, 45}, + {1273, 12, 46}, + {1274, 12, 47}, + {1282, 12, 48}, + {1283, 12, 49}, + {1286, 12, 50}, + {1287, 12, 51}, + { 695, 12, 52}, + { 528, 12, 53}, + { 278, 12, 54}, + { 527, 12, 55}, + { 932, 12, 56}, + {1063, 12, 57}, + {1127, 12, 58}, + { 795, 12, 59}, + { 796, 12, 60}, + { 797, 12, 61}, + { 798, 12, 62}, + { 799, 12, 63}, + { 800, 12, 64}, + { 801, 12, 65}, + { 802, 12, 66}, + { 803, 12, 67}, + { 804, 12, 68}, + { 805, 12, 69}, + { 806, 12, 70}, + { 807, 12, 71}, + { 808, 12, 72}, + { 809, 12, 73}, + { 810, 12, 74}, + { 811, 12, 75}, + { 812, 12, 76}, + { 813, 12, 77}, + { 814, 12, 78}, + { 815, 12, 79}, + { 816, 12, 80}, + { 817, 12, 81}, + { 818, 12, 82}, + { 819, 12, 83}, + { 820, 12, 84}, + { 821, 12, 85}, + { 912, 13, 0}, + {1214, 13, 0}, + { 670, 13, 1}, + { 281, 13, 1}, + { 483, 13, 2}, + { 447, 13, 2}, + {1054, 13, 3}, + {1045, 13, 3}, + { 738, 13, 4}, + { 708, 13, 4}, + {1210, 13, 5}, + {1164, 13, 5}, + {1224, 14, 0}, + {1270, 14, 0}, + { 955, 14, 1}, + { 954, 14, 1}, + { 381, 14, 2}, + { 378, 14, 2}, + {1043, 14, 3}, + {1042, 14, 3}, + { 559, 14, 4}, + { 556, 14, 4}, + { 914, 14, 5}, + { 919, 14, 5}, + { 520, 14, 6}, + { 519, 14, 6}, + { 
273, 14, 7}, + {1156, 14, 7}, + { 643, 14, 8}, + { 658, 14, 8}, + {1025, 14, 9}, + {1024, 14, 9}, + {1022, 14, 10}, + {1015, 14, 10}, + { 927, 14, 11}, + { 922, 14, 11}, + { 172, 14, 12}, + { 164, 14, 12}, + { 630, 14, 13}, + { 626, 14, 13}, + { 652, 14, 14}, + { 639, 14, 14}, + { 653, 14, 14}, + { 625, 14, 15}, + { 624, 14, 15}, + { 392, 14, 16}, + { 382, 14, 16}, + { 271, 14, 17}, + { 233, 14, 17}, + { 270, 14, 18}, + { 221, 14, 18}, + {1117, 14, 19}, + {1116, 14, 19}, + { 792, 14, 20}, + { 248, 14, 20}, + { 293, 14, 21}, + { 424, 14, 21}, + { 758, 14, 22}, + { 748, 14, 22}, + { 414, 14, 23}, + { 298, 14, 23}, + { 399, 14, 24}, + {1070, 14, 24}, + { 176, 14, 25}, + { 162, 14, 25}, + { 272, 14, 26}, + { 220, 14, 26}, + {1153, 14, 27}, + {1090, 14, 27}, + {1293, 14, 28}, + {1292, 14, 28}, + { 900, 14, 29}, + { 904, 14, 29}, + {1260, 14, 30}, + {1257, 14, 30}, + { 668, 14, 31}, + { 676, 14, 32}, + { 675, 14, 33}, + { 582, 14, 34}, + { 583, 14, 35}, + { 380, 14, 36}, + { 422, 14, 36}, + { 607, 14, 37}, + { 618, 14, 37}, + { 400, 14, 38}, + { 352, 14, 38}, + {1047, 14, 39}, + {1052, 14, 39}, + { 910, 15, 0}, + { 927, 15, 1}, + { 922, 15, 1}, + { 473, 15, 2}, + { 466, 15, 2}, + { 455, 15, 3}, + { 454, 15, 3}, + { 889, 16, 0}, + { 0, 16, 1}, + { 1, 16, 2}, + { 5, 16, 3}, + { 4, 16, 4}, + { 3, 16, 5}, + { 13, 16, 6}, + { 12, 16, 7}, + { 11, 16, 8}, + { 10, 16, 9}, + { 78, 16, 10}, + { 9, 16, 11}, + { 8, 16, 12}, + { 7, 16, 13}, + { 82, 16, 14}, + { 47, 16, 15}, + { 115, 16, 16}, + { 6, 16, 17}, + { 131, 16, 18}, + { 81, 16, 19}, + { 118, 16, 20}, + { 46, 16, 21}, + { 80, 16, 22}, + { 98, 16, 23}, + { 117, 16, 24}, + { 133, 16, 25}, + { 26, 16, 26}, + { 2, 16, 27}, + { 79, 16, 28}, + { 45, 16, 29}, + { 116, 16, 30}, + { 77, 16, 31}, + { 132, 16, 32}, + { 97, 16, 33}, + { 147, 16, 34}, + { 114, 16, 35}, + { 27, 16, 36}, + { 124, 16, 37}, + { 33, 16, 38}, + { 130, 16, 39}, + { 39, 16, 40}, + { 139, 16, 41}, + { 42, 16, 42}, + { 146, 16, 43}, + { 14, 16, 44}, + { 25, 16, 
45}, + { 29, 16, 46}, + { 32, 16, 47}, + { 37, 16, 48}, + { 38, 16, 49}, + { 40, 16, 50}, + { 41, 16, 51}, + { 43, 16, 52}, + { 44, 16, 53}, + { 48, 16, 54}, + { 54, 16, 55}, + { 59, 16, 56}, + { 65, 16, 57}, + { 70, 16, 58}, + { 72, 16, 59}, + { 73, 16, 60}, + { 74, 16, 61}, + { 75, 16, 62}, + { 76, 16, 63}, + { 83, 16, 64}, + { 88, 16, 65}, + { 89, 16, 66}, + { 90, 16, 67}, + { 91, 16, 68}, + { 92, 16, 69}, + { 93, 16, 70}, + { 94, 16, 71}, + { 95, 16, 72}, + { 96, 16, 73}, + { 99, 16, 74}, + { 104, 16, 75}, + { 105, 16, 76}, + { 106, 16, 77}, + { 108, 16, 78}, + { 109, 16, 79}, + { 110, 16, 80}, + { 111, 16, 81}, + { 112, 16, 82}, + { 113, 16, 83}, + { 119, 16, 84}, + { 125, 16, 85}, + { 134, 16, 86}, + { 140, 16, 87}, + { 148, 16, 88}, + { 15, 16, 89}, + { 49, 16, 90}, + { 84, 16, 91}, + { 100, 16, 92}, + { 120, 16, 93}, + { 126, 16, 94}, + { 135, 16, 95}, + { 141, 16, 96}, + { 149, 16, 97}, + { 16, 16, 98}, + { 50, 16, 99}, + { 85, 16, 100}, + { 101, 16, 101}, + { 121, 16, 102}, + { 127, 16, 103}, + { 136, 16, 104}, + { 142, 16, 105}, + { 150, 16, 106}, + { 17, 16, 107}, + { 51, 16, 108}, + { 86, 16, 109}, + { 102, 16, 110}, + { 122, 16, 111}, + { 128, 16, 112}, + { 137, 16, 113}, + { 143, 16, 114}, + { 151, 16, 115}, + { 18, 16, 116}, + { 52, 16, 117}, + { 57, 16, 118}, + { 87, 16, 119}, + { 103, 16, 120}, + { 107, 16, 121}, + { 123, 16, 122}, + { 129, 16, 123}, + { 138, 16, 124}, + { 144, 16, 125}, + { 152, 16, 126}, + { 19, 16, 127}, + { 20, 16, 128}, + { 21, 16, 129}, + { 22, 16, 130}, + { 887, 17, 0}, + {1053, 17, 1}, + { 736, 17, 2}, + {1242, 17, 3}, + { 737, 17, 4}, + {1203, 17, 5}, + { 259, 17, 6}, + {1204, 17, 7}, + {1208, 17, 8}, + {1206, 17, 9}, + {1207, 17, 10}, + { 260, 17, 11}, + {1205, 17, 12}, + { 980, 17, 13}, + { 963, 18, 0}, + { 247, 18, 1}, + {1241, 18, 2}, + { 216, 18, 3}, + { 923, 18, 4}, + {1240, 18, 5}, + {1036, 18, 6}, + { 654, 18, 7}, + {1245, 18, 8}, + {1244, 18, 9}, + {1243, 18, 10}, + { 408, 18, 11}, + { 402, 18, 12}, + { 403, 18, 
13}, + { 413, 18, 14}, + { 410, 18, 15}, + { 409, 18, 16}, + { 412, 18, 17}, + { 411, 18, 18}, + { 407, 18, 19}, + { 404, 18, 20}, + { 405, 18, 21}, + { 869, 18, 22}, + {1201, 18, 23}, + {1202, 18, 24}, + { 546, 18, 25}, + { 290, 18, 26}, + {1048, 18, 27}, + {1157, 18, 28}, + { 406, 18, 29}, + { 911, 18, 30}, + { 672, 18, 31}, + { 926, 18, 32}, + { 924, 18, 33}, + { 266, 18, 34}, +}; + +/* property values: 5648 bytes. */ + +/* Codepoints which expand on full case-folding. */ + +RE_UINT16 re_expand_on_folding[] = { + 223, 304, 329, 496, 912, 944, 1415, 7830, + 7831, 7832, 7833, 7834, 7838, 8016, 8018, 8020, + 8022, 8064, 8065, 8066, 8067, 8068, 8069, 8070, + 8071, 8072, 8073, 8074, 8075, 8076, 8077, 8078, + 8079, 8080, 8081, 8082, 8083, 8084, 8085, 8086, + 8087, 8088, 8089, 8090, 8091, 8092, 8093, 8094, + 8095, 8096, 8097, 8098, 8099, 8100, 8101, 8102, + 8103, 8104, 8105, 8106, 8107, 8108, 8109, 8110, + 8111, 8114, 8115, 8116, 8118, 8119, 8124, 8130, + 8131, 8132, 8134, 8135, 8140, 8146, 8147, 8150, + 8151, 8162, 8163, 8164, 8166, 8167, 8178, 8179, + 8180, 8182, 8183, 8188, 64256, 64257, 64258, 64259, + 64260, 64261, 64262, 64275, 64276, 64277, 64278, 64279, +}; + +/* expand_on_folding: 208 bytes. */ + +/* General_Category. 
*/ + +static RE_UINT8 re_general_category_stage_1[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 9, 10, 11, 7, 7, 7, 7, 12, 13, 14, 14, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 21, 23, 21, 21, 21, 21, 24, 21, 21, + 21, 21, 21, 21, 21, 21, 25, 26, 21, 21, 27, 28, 21, 29, 30, 31, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 32, 7, 33, 34, 7, 35, 21, 21, 21, 21, 21, 36, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 37, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 14, 14, 
14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 38, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 38, +}; + +static RE_UINT8 re_general_category_stage_2[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 34, 35, 36, 37, 38, 39, 34, 34, 34, 40, 41, 42, 43, + 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 64, 65, 66, 67, 68, 69, 70, 71, 69, 72, 73, + 69, 69, 64, 74, 64, 64, 75, 76, 77, 78, 79, 80, 81, 82, 69, 83, + 84, 85, 86, 87, 88, 89, 69, 69, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 90, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 91, + 92, 34, 34, 34, 34, 34, 34, 34, 34, 93, 34, 34, 94, 95, 96, 97, + 98, 99, 100, 101, 102, 103, 104, 105, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 106, + 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, + 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, + 108, 108, 34, 34, 109, 110, 111, 112, 34, 34, 113, 114, 115, 116, 117, 118, + 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 123, 34, 34, 130, 123, + 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 123, 123, 141, 123, 123, 123, + 142, 143, 144, 145, 146, 147, 148, 123, 123, 149, 123, 150, 151, 152, 153, 123, + 123, 154, 123, 123, 123, 155, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, + 34, 34, 34, 34, 34, 34, 34, 156, 157, 34, 158, 123, 123, 123, 123, 123, + 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, + 34, 34, 34, 34, 34, 34, 34, 34, 159, 123, 123, 123, 123, 123, 123, 123, + 123, 123, 123, 123, 123, 123, 
123, 123, 34, 34, 34, 34, 160, 123, 123, 123, + 34, 34, 34, 34, 161, 162, 163, 164, 123, 123, 123, 123, 123, 123, 165, 166, + 167, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, + 123, 123, 123, 123, 123, 123, 123, 123, 168, 169, 123, 123, 123, 123, 123, 123, + 69, 170, 171, 172, 173, 123, 174, 123, 175, 176, 177, 178, 179, 180, 181, 182, + 69, 69, 69, 69, 183, 184, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, + 34, 185, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 186, 187, 123, 123, + 188, 189, 190, 191, 192, 123, 69, 193, 69, 69, 194, 195, 69, 196, 197, 198, + 199, 200, 201, 202, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 203, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 204, 34, + 205, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 206, 123, 123, + 34, 34, 34, 34, 207, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, + 208, 123, 209, 210, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, + 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 211, +}; + +static RE_UINT16 re_general_category_stage_3[] = { + 0, 0, 1, 2, 3, 4, 5, 6, 0, 0, 7, 8, 9, 10, 11, 12, + 13, 13, 13, 14, 15, 13, 13, 16, 17, 18, 19, 20, 21, 22, 13, 23, + 13, 13, 13, 24, 25, 11, 11, 11, 11, 26, 11, 27, 28, 29, 30, 31, + 32, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 11, 37, 38, 13, 39, + 9, 9, 9, 11, 11, 11, 13, 13, 40, 13, 13, 13, 41, 13, 13, 13, + 13, 13, 13, 42, 9, 43, 44, 11, 45, 46, 32, 47, 48, 49, 50, 51, + 52, 53, 49, 49, 54, 32, 55, 56, 49, 49, 49, 49, 49, 57, 58, 59, + 60, 61, 49, 32, 62, 49, 49, 49, 49, 49, 63, 64, 65, 49, 66, 67, + 49, 68, 69, 70, 49, 71, 72, 72, 72, 72, 49, 73, 72, 72, 74, 32, + 75, 49, 49, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, + 89, 82, 83, 90, 91, 92, 93, 94, 95, 96, 83, 97, 98, 99, 87, 100, + 101, 82, 83, 102, 103, 104, 
87, 105, 106, 107, 108, 109, 110, 111, 93, 112, + 113, 114, 83, 115, 116, 117, 87, 118, 119, 114, 83, 120, 121, 122, 87, 123, + 119, 114, 49, 124, 125, 126, 87, 127, 128, 129, 49, 130, 131, 132, 93, 133, + 134, 49, 49, 135, 136, 137, 72, 72, 138, 139, 140, 141, 142, 143, 72, 72, + 144, 145, 146, 147, 148, 49, 149, 150, 151, 152, 32, 153, 154, 155, 72, 72, + 49, 49, 156, 157, 158, 159, 160, 161, 162, 163, 9, 9, 164, 49, 49, 165, + 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 166, 167, 49, 49, + 166, 49, 49, 168, 169, 170, 49, 49, 49, 169, 49, 49, 49, 171, 172, 173, + 49, 174, 9, 9, 9, 9, 9, 175, 176, 49, 49, 49, 49, 49, 49, 49, + 49, 49, 49, 49, 49, 49, 177, 49, 178, 179, 49, 49, 49, 49, 180, 181, + 182, 183, 49, 184, 49, 185, 182, 186, 49, 49, 49, 187, 188, 189, 190, 191, + 192, 190, 49, 49, 193, 49, 49, 194, 49, 49, 195, 49, 49, 49, 49, 196, + 49, 197, 198, 199, 200, 49, 201, 73, 49, 49, 202, 49, 203, 204, 205, 205, + 49, 206, 49, 49, 49, 207, 208, 209, 190, 190, 210, 211, 72, 72, 72, 72, + 212, 49, 49, 213, 214, 158, 215, 216, 217, 49, 218, 65, 49, 49, 219, 220, + 49, 49, 221, 222, 223, 65, 49, 224, 72, 72, 72, 72, 225, 226, 227, 228, + 11, 11, 229, 27, 27, 27, 230, 231, 11, 232, 27, 27, 32, 32, 32, 233, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 234, 13, 13, 13, 13, 13, 13, + 235, 236, 235, 235, 236, 237, 235, 238, 239, 239, 239, 240, 241, 242, 243, 244, + 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 72, 257, 258, 259, + 260, 261, 262, 263, 264, 265, 266, 266, 267, 268, 269, 205, 270, 271, 205, 272, + 273, 273, 273, 273, 273, 273, 273, 273, 274, 205, 275, 205, 205, 205, 205, 276, + 205, 277, 273, 278, 205, 279, 280, 281, 205, 205, 282, 72, 281, 72, 265, 265, + 265, 283, 205, 205, 205, 205, 284, 265, 205, 205, 205, 205, 205, 205, 205, 205, + 205, 205, 205, 285, 286, 205, 205, 287, 205, 205, 205, 205, 205, 205, 288, 205, + 205, 205, 205, 205, 205, 205, 289, 290, 265, 291, 205, 205, 292, 273, 293, 273, + 294, 295, 273, 273, 273, 296, 273, 297, 205, 
205, 205, 273, 298, 205, 205, 299, + 205, 300, 205, 301, 302, 303, 304, 72, 9, 9, 305, 11, 11, 306, 307, 308, + 13, 13, 13, 13, 13, 13, 309, 310, 11, 11, 311, 49, 49, 49, 312, 313, + 49, 314, 315, 315, 315, 315, 32, 32, 316, 317, 318, 319, 320, 72, 72, 72, + 205, 321, 205, 205, 205, 205, 205, 322, 205, 205, 205, 205, 205, 323, 72, 324, + 325, 326, 327, 328, 134, 49, 49, 49, 49, 329, 176, 49, 49, 49, 49, 330, + 331, 49, 201, 134, 49, 49, 49, 49, 197, 332, 49, 50, 205, 205, 322, 49, + 205, 333, 334, 205, 335, 336, 205, 205, 334, 205, 205, 336, 205, 205, 205, 333, + 49, 49, 49, 196, 205, 205, 205, 205, 49, 49, 49, 49, 49, 196, 72, 72, + 49, 337, 49, 49, 49, 49, 49, 49, 149, 205, 205, 205, 282, 49, 49, 224, + 338, 49, 339, 72, 13, 13, 340, 341, 13, 342, 49, 49, 49, 49, 343, 344, + 31, 345, 346, 347, 13, 13, 13, 348, 349, 350, 351, 352, 72, 72, 72, 353, + 354, 49, 355, 356, 49, 49, 49, 357, 358, 49, 49, 359, 360, 190, 32, 361, + 65, 49, 362, 49, 363, 364, 49, 149, 75, 49, 49, 365, 366, 367, 368, 369, + 49, 49, 370, 371, 372, 373, 49, 374, 49, 49, 49, 375, 376, 377, 378, 379, + 380, 381, 315, 11, 11, 382, 383, 11, 11, 11, 11, 11, 49, 49, 384, 190, + 49, 49, 385, 49, 386, 49, 49, 202, 387, 387, 387, 387, 387, 387, 387, 387, + 388, 388, 388, 388, 388, 388, 388, 388, 49, 49, 49, 49, 49, 49, 201, 49, + 49, 49, 49, 49, 49, 203, 72, 72, 389, 390, 391, 392, 393, 49, 49, 49, + 49, 49, 49, 394, 395, 396, 49, 49, 49, 49, 49, 397, 72, 49, 49, 49, + 49, 398, 49, 49, 194, 72, 72, 399, 32, 400, 32, 401, 402, 403, 404, 405, + 49, 49, 49, 49, 49, 49, 49, 406, 407, 2, 3, 4, 5, 408, 409, 410, + 49, 411, 49, 197, 412, 413, 414, 415, 416, 49, 170, 417, 201, 201, 72, 72, + 49, 49, 49, 49, 49, 49, 49, 50, 418, 265, 265, 419, 266, 266, 266, 420, + 421, 324, 422, 72, 72, 205, 205, 423, 72, 72, 72, 72, 72, 72, 72, 72, + 49, 149, 49, 49, 49, 99, 424, 425, 49, 49, 426, 49, 427, 49, 49, 428, + 49, 429, 49, 49, 430, 431, 72, 72, 9, 9, 432, 11, 11, 49, 49, 49, + 49, 201, 190, 72, 72, 72, 72, 72, 49, 
49, 194, 49, 49, 49, 433, 72, + 49, 49, 49, 314, 49, 196, 194, 72, 434, 49, 49, 435, 49, 436, 49, 437, + 49, 197, 438, 72, 72, 72, 49, 439, 49, 440, 49, 441, 72, 72, 72, 72, + 49, 49, 49, 442, 265, 443, 265, 265, 444, 445, 49, 446, 447, 448, 49, 449, + 49, 450, 72, 72, 451, 49, 452, 453, 49, 49, 49, 454, 49, 455, 49, 456, + 49, 457, 458, 72, 72, 72, 72, 72, 49, 49, 49, 49, 459, 72, 72, 72, + 9, 9, 9, 460, 11, 11, 11, 461, 72, 72, 72, 72, 72, 72, 265, 462, + 463, 49, 49, 464, 465, 443, 466, 467, 217, 49, 49, 468, 469, 49, 459, 190, + 470, 49, 471, 472, 473, 49, 49, 474, 217, 49, 49, 475, 476, 477, 478, 479, + 49, 96, 480, 481, 72, 72, 72, 72, 482, 483, 484, 49, 49, 485, 486, 190, + 487, 82, 83, 97, 488, 489, 490, 491, 49, 49, 49, 492, 493, 190, 72, 72, + 49, 49, 494, 495, 496, 497, 72, 72, 49, 49, 49, 498, 499, 190, 72, 72, + 49, 49, 500, 501, 190, 72, 72, 72, 49, 502, 503, 504, 72, 72, 72, 72, + 72, 72, 9, 9, 11, 11, 146, 505, 72, 72, 72, 72, 49, 49, 49, 459, + 49, 203, 72, 72, 72, 72, 72, 72, 266, 266, 266, 266, 266, 266, 506, 507, + 49, 49, 49, 49, 385, 72, 72, 72, 49, 49, 197, 72, 72, 72, 72, 72, + 49, 49, 49, 49, 314, 72, 72, 72, 49, 49, 49, 459, 49, 197, 367, 72, + 72, 72, 72, 72, 72, 49, 201, 508, 49, 49, 49, 509, 510, 511, 512, 513, + 49, 72, 72, 72, 72, 72, 72, 72, 49, 49, 49, 49, 73, 514, 515, 516, + 467, 517, 72, 72, 72, 72, 72, 72, 518, 72, 72, 72, 72, 72, 72, 72, + 49, 49, 49, 49, 49, 49, 50, 149, 459, 519, 520, 72, 72, 72, 72, 72, + 205, 205, 205, 205, 205, 205, 205, 323, 205, 205, 521, 205, 205, 205, 522, 523, + 524, 205, 525, 205, 205, 205, 526, 72, 205, 205, 205, 205, 527, 72, 72, 72, + 205, 205, 205, 205, 205, 282, 265, 528, 9, 529, 11, 530, 531, 532, 235, 9, + 533, 534, 535, 536, 537, 9, 529, 11, 538, 539, 11, 540, 541, 542, 543, 9, + 544, 11, 9, 529, 11, 530, 531, 11, 235, 9, 533, 543, 9, 544, 11, 9, + 529, 11, 545, 9, 546, 547, 548, 549, 11, 550, 9, 551, 552, 553, 554, 11, + 555, 9, 556, 11, 557, 558, 558, 558, 32, 32, 32, 559, 32, 32, 560, 561, 
+ 562, 563, 46, 72, 72, 72, 72, 72, 49, 49, 49, 49, 564, 565, 72, 72, + 566, 49, 567, 568, 569, 570, 571, 572, 573, 202, 574, 202, 72, 72, 72, 575, + 205, 205, 324, 205, 205, 205, 205, 205, 205, 322, 333, 576, 576, 576, 205, 323, + 173, 205, 333, 205, 205, 205, 324, 205, 205, 281, 72, 72, 72, 72, 577, 205, + 578, 205, 205, 281, 526, 303, 72, 72, 205, 205, 205, 205, 205, 205, 205, 579, + 205, 205, 205, 205, 205, 205, 205, 321, 205, 205, 580, 205, 205, 205, 205, 205, + 205, 205, 205, 205, 205, 422, 581, 322, 205, 205, 205, 205, 205, 205, 205, 322, + 205, 205, 205, 205, 205, 582, 72, 72, 324, 205, 205, 205, 583, 174, 205, 205, + 583, 205, 584, 72, 72, 72, 72, 72, 72, 526, 72, 72, 72, 72, 72, 72, + 582, 72, 72, 72, 422, 72, 72, 72, 49, 49, 49, 49, 49, 314, 72, 72, + 49, 49, 49, 73, 49, 49, 49, 49, 49, 201, 49, 49, 49, 49, 49, 49, + 49, 49, 518, 72, 72, 72, 72, 72, 49, 201, 72, 72, 72, 72, 72, 72, + 585, 72, 586, 586, 586, 586, 586, 586, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 72, 388, 388, 388, 388, 388, 388, 388, 587, +}; + +static RE_UINT8 re_general_category_stage_4[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 2, 4, 5, 6, 2, + 7, 7, 7, 7, 7, 2, 8, 9, 10, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 17, 18, 19, 1, 20, 20, 21, 22, 23, 24, 25, + 26, 27, 15, 2, 28, 29, 27, 30, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 31, 11, 11, 11, 32, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 33, 16, 16, 16, 16, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 34, 34, 34, 34, 34, 34, 34, 34, 16, 32, 32, 32, + 32, 32, 32, 32, 11, 34, 34, 16, 34, 32, 32, 11, 34, 11, 16, 11, + 11, 34, 32, 11, 32, 16, 11, 34, 32, 32, 32, 11, 34, 16, 32, 11, + 34, 11, 34, 34, 32, 35, 32, 16, 36, 36, 37, 34, 38, 37, 34, 34, + 34, 34, 34, 34, 34, 34, 16, 32, 34, 38, 32, 11, 32, 32, 32, 32, + 32, 32, 16, 16, 16, 11, 34, 32, 34, 34, 11, 32, 32, 32, 32, 32, + 16, 16, 39, 16, 16, 16, 16, 16, 40, 40, 40, 40, 40, 40, 
40, 40, + 40, 41, 41, 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 41, 41, + 40, 40, 42, 41, 41, 41, 42, 42, 41, 41, 41, 41, 41, 41, 41, 41, + 43, 43, 43, 43, 43, 43, 43, 43, 32, 32, 42, 32, 44, 45, 16, 10, + 44, 44, 41, 46, 11, 47, 47, 11, 34, 11, 11, 11, 11, 11, 11, 11, + 11, 48, 11, 11, 11, 11, 16, 16, 16, 16, 16, 16, 16, 16, 16, 34, + 16, 11, 32, 16, 32, 32, 32, 32, 16, 16, 32, 49, 34, 32, 34, 11, + 32, 50, 43, 43, 51, 32, 32, 32, 11, 34, 34, 34, 34, 34, 34, 16, + 48, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 47, 52, 2, 2, 2, + 53, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 54, 55, 56, 57, + 58, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 59, + 60, 61, 43, 60, 44, 44, 44, 44, 36, 36, 36, 36, 36, 36, 36, 36, + 36, 36, 36, 36, 36, 62, 44, 44, 36, 63, 64, 44, 44, 44, 44, 44, + 65, 65, 65, 8, 9, 66, 2, 67, 43, 43, 43, 43, 43, 61, 68, 2, + 69, 36, 36, 36, 36, 70, 43, 43, 7, 7, 7, 7, 7, 2, 2, 36, + 71, 36, 36, 36, 36, 36, 36, 36, 36, 36, 72, 43, 43, 43, 73, 50, + 43, 43, 74, 75, 76, 43, 43, 36, 7, 7, 7, 7, 7, 36, 77, 78, + 2, 2, 2, 2, 2, 2, 2, 79, 70, 36, 36, 36, 36, 36, 36, 36, + 43, 43, 43, 43, 43, 80, 81, 36, 36, 36, 36, 43, 43, 43, 43, 43, + 71, 44, 44, 44, 44, 44, 44, 44, 7, 7, 7, 7, 7, 36, 36, 36, + 36, 36, 36, 36, 36, 70, 43, 43, 43, 43, 40, 21, 2, 82, 44, 44, + 36, 36, 36, 43, 43, 75, 43, 43, 43, 43, 75, 43, 75, 43, 43, 44, + 2, 2, 2, 2, 2, 2, 2, 64, 36, 36, 36, 36, 70, 43, 44, 64, + 44, 44, 44, 44, 44, 44, 44, 44, 36, 36, 62, 44, 44, 44, 44, 44, + 44, 58, 43, 43, 43, 43, 43, 43, 43, 83, 36, 36, 36, 36, 36, 36, + 36, 36, 36, 36, 36, 83, 71, 84, 85, 43, 43, 43, 83, 84, 85, 84, + 70, 43, 43, 43, 36, 36, 36, 36, 36, 43, 2, 7, 7, 7, 7, 7, + 86, 36, 36, 36, 36, 36, 36, 36, 70, 84, 81, 36, 36, 36, 62, 81, + 62, 81, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 62, 36, 36, 36, + 62, 62, 44, 36, 36, 44, 71, 84, 85, 43, 80, 87, 88, 87, 85, 62, + 44, 44, 44, 87, 44, 44, 36, 81, 36, 43, 44, 7, 7, 7, 7, 7, + 36, 20, 27, 27, 27, 57, 44, 44, 58, 83, 81, 36, 36, 62, 44, 
81, + 62, 36, 81, 62, 36, 44, 80, 84, 85, 80, 44, 58, 80, 58, 43, 44, + 58, 44, 44, 44, 81, 36, 62, 62, 44, 44, 44, 7, 7, 7, 7, 7, + 43, 36, 70, 44, 44, 44, 44, 44, 58, 83, 81, 36, 36, 36, 36, 81, + 36, 81, 36, 36, 36, 36, 36, 36, 62, 36, 81, 36, 36, 44, 71, 84, + 85, 43, 43, 58, 83, 87, 85, 44, 62, 44, 44, 44, 44, 44, 44, 44, + 66, 44, 44, 44, 81, 44, 44, 44, 58, 84, 81, 36, 36, 36, 62, 81, + 62, 36, 81, 36, 36, 44, 71, 85, 85, 43, 80, 87, 88, 87, 85, 44, + 44, 44, 44, 83, 44, 44, 36, 81, 78, 27, 27, 27, 44, 44, 44, 44, + 44, 71, 81, 36, 36, 62, 44, 36, 62, 36, 36, 44, 81, 62, 62, 36, + 44, 81, 62, 44, 36, 62, 44, 36, 36, 36, 36, 36, 36, 44, 44, 84, + 83, 88, 44, 84, 88, 84, 85, 44, 62, 44, 44, 87, 44, 44, 44, 44, + 27, 89, 67, 67, 57, 90, 44, 44, 83, 84, 81, 36, 36, 36, 62, 36, + 62, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 44, 81, 43, + 83, 84, 88, 43, 80, 43, 43, 44, 44, 44, 58, 80, 36, 62, 44, 44, + 44, 44, 44, 44, 27, 27, 27, 89, 58, 84, 81, 36, 36, 36, 62, 36, + 36, 36, 81, 36, 36, 44, 71, 85, 84, 84, 88, 83, 88, 84, 43, 44, + 44, 44, 87, 88, 44, 44, 44, 62, 81, 62, 44, 44, 44, 44, 44, 44, + 36, 36, 36, 36, 36, 62, 81, 84, 85, 43, 80, 84, 88, 84, 85, 62, + 44, 44, 44, 87, 44, 44, 44, 81, 27, 27, 27, 44, 56, 36, 36, 36, + 44, 84, 81, 36, 36, 36, 36, 36, 36, 36, 36, 62, 44, 36, 36, 36, + 36, 81, 36, 36, 36, 36, 81, 44, 36, 36, 36, 62, 44, 80, 44, 87, + 84, 43, 80, 80, 84, 84, 84, 84, 44, 84, 64, 44, 44, 44, 44, 44, + 81, 36, 36, 36, 36, 36, 36, 36, 70, 36, 43, 43, 43, 80, 44, 91, + 36, 36, 36, 75, 43, 43, 43, 61, 7, 7, 7, 7, 7, 2, 44, 44, + 81, 62, 62, 81, 62, 62, 81, 44, 44, 44, 36, 36, 81, 36, 36, 36, + 81, 36, 81, 81, 44, 36, 81, 36, 70, 36, 43, 43, 43, 58, 71, 44, + 36, 36, 62, 82, 43, 43, 43, 44, 7, 7, 7, 7, 7, 44, 36, 36, + 77, 67, 2, 2, 2, 2, 2, 2, 2, 92, 92, 67, 43, 67, 67, 67, + 7, 7, 7, 7, 7, 27, 27, 27, 27, 27, 50, 50, 50, 4, 4, 84, + 36, 36, 36, 36, 81, 36, 36, 36, 36, 36, 36, 36, 36, 36, 62, 44, + 58, 43, 43, 43, 43, 43, 43, 83, 43, 43, 61, 
43, 36, 36, 70, 43, + 43, 43, 43, 43, 58, 43, 43, 43, 43, 43, 43, 43, 43, 43, 80, 67, + 67, 67, 67, 76, 67, 67, 90, 67, 2, 2, 92, 67, 21, 64, 44, 44, + 36, 36, 36, 36, 36, 93, 85, 43, 83, 43, 43, 43, 85, 83, 85, 71, + 7, 7, 7, 7, 7, 2, 2, 2, 36, 36, 36, 84, 43, 36, 36, 43, + 71, 84, 94, 93, 84, 84, 84, 36, 70, 43, 71, 36, 36, 36, 36, 36, + 36, 83, 85, 83, 84, 84, 85, 93, 7, 7, 7, 7, 7, 84, 85, 67, + 11, 11, 11, 48, 44, 44, 48, 44, 36, 36, 36, 36, 36, 63, 69, 36, + 36, 36, 36, 36, 62, 36, 36, 44, 36, 36, 36, 62, 62, 36, 36, 44, + 62, 36, 36, 44, 36, 36, 36, 62, 62, 36, 36, 44, 36, 36, 36, 36, + 36, 36, 36, 62, 36, 36, 36, 36, 36, 36, 36, 36, 36, 62, 58, 43, + 2, 2, 2, 2, 95, 27, 27, 27, 27, 27, 27, 27, 27, 27, 96, 44, + 67, 67, 67, 67, 67, 44, 44, 44, 11, 11, 11, 44, 16, 16, 16, 44, + 97, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 63, 72, + 98, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 99, 100, 44, + 36, 36, 36, 36, 36, 63, 2, 101, 102, 36, 36, 36, 62, 44, 44, 44, + 36, 36, 36, 36, 36, 36, 62, 36, 36, 43, 80, 44, 44, 44, 44, 44, + 36, 43, 61, 64, 44, 44, 44, 44, 36, 43, 44, 44, 44, 44, 44, 44, + 62, 43, 44, 44, 44, 44, 44, 44, 36, 36, 43, 85, 43, 43, 43, 84, + 84, 84, 84, 83, 85, 43, 43, 43, 43, 43, 2, 86, 2, 66, 70, 44, + 7, 7, 7, 7, 7, 44, 44, 44, 27, 27, 27, 27, 27, 44, 44, 44, + 2, 2, 2, 103, 2, 60, 43, 68, 36, 104, 36, 36, 36, 36, 36, 36, + 36, 36, 36, 36, 44, 44, 44, 44, 36, 36, 36, 36, 70, 62, 44, 44, + 36, 36, 36, 44, 44, 44, 44, 44, 36, 36, 36, 36, 36, 36, 36, 62, + 43, 83, 84, 85, 83, 84, 44, 44, 84, 83, 84, 84, 85, 43, 44, 44, + 90, 44, 2, 7, 7, 7, 7, 7, 36, 36, 36, 36, 36, 36, 36, 44, + 36, 36, 36, 36, 36, 36, 44, 44, 36, 36, 36, 36, 36, 44, 44, 44, + 7, 7, 7, 7, 7, 96, 44, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 36, 36, 36, 70, 83, 85, 44, 2, 36, 36, 93, 83, 43, 43, 43, 80, + 83, 83, 85, 43, 43, 43, 83, 84, 84, 85, 43, 43, 43, 43, 80, 58, + 2, 2, 2, 86, 2, 2, 2, 44, 43, 43, 43, 43, 43, 43, 43, 105, + 43, 43, 94, 36, 36, 36, 36, 36, 36, 36, 
83, 43, 43, 83, 83, 84, + 84, 83, 94, 36, 36, 36, 44, 44, 92, 67, 67, 67, 67, 50, 43, 43, + 43, 43, 67, 67, 67, 67, 90, 44, 43, 94, 36, 36, 36, 36, 36, 36, + 93, 43, 43, 84, 43, 85, 43, 36, 36, 36, 36, 83, 43, 84, 85, 85, + 43, 84, 44, 44, 44, 44, 2, 2, 36, 36, 84, 84, 84, 84, 43, 43, + 43, 43, 84, 43, 44, 54, 2, 2, 7, 7, 7, 7, 7, 44, 81, 36, + 36, 36, 36, 36, 40, 40, 40, 2, 2, 2, 2, 2, 44, 44, 44, 44, + 43, 61, 43, 43, 43, 43, 43, 43, 83, 43, 43, 43, 71, 36, 70, 36, + 36, 84, 71, 62, 43, 44, 44, 44, 16, 16, 16, 16, 16, 16, 40, 40, + 40, 40, 40, 40, 40, 45, 16, 16, 16, 16, 16, 16, 45, 16, 16, 16, + 16, 16, 16, 16, 16, 106, 40, 40, 43, 43, 43, 44, 44, 44, 43, 43, + 32, 32, 32, 16, 16, 16, 16, 32, 16, 16, 16, 16, 11, 11, 11, 11, + 16, 16, 16, 44, 11, 11, 11, 44, 16, 16, 16, 16, 48, 48, 48, 48, + 16, 16, 16, 16, 16, 16, 16, 44, 16, 16, 16, 16, 107, 107, 107, 107, + 16, 16, 108, 16, 11, 11, 109, 110, 41, 16, 108, 16, 11, 11, 109, 41, + 16, 16, 44, 16, 11, 11, 111, 41, 16, 16, 16, 16, 11, 11, 112, 41, + 44, 16, 108, 16, 11, 11, 109, 113, 114, 114, 114, 114, 114, 115, 65, 65, + 116, 116, 116, 2, 117, 118, 117, 118, 2, 2, 2, 2, 119, 65, 65, 120, + 2, 2, 2, 2, 121, 122, 2, 123, 124, 2, 125, 126, 2, 2, 2, 2, + 2, 9, 124, 2, 2, 2, 2, 127, 65, 65, 68, 65, 65, 65, 65, 65, + 128, 44, 27, 27, 27, 8, 125, 129, 27, 27, 27, 27, 27, 8, 125, 100, + 40, 40, 40, 40, 40, 40, 82, 44, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 130, 43, 43, 43, 43, 43, 43, 131, 51, + 132, 51, 132, 43, 43, 43, 43, 43, 80, 44, 44, 44, 44, 44, 44, 44, + 67, 133, 67, 134, 67, 34, 11, 16, 11, 32, 134, 67, 49, 11, 11, 67, + 67, 67, 133, 133, 133, 11, 11, 135, 11, 11, 35, 36, 39, 67, 16, 11, + 8, 8, 49, 16, 16, 26, 67, 136, 27, 27, 27, 27, 27, 27, 27, 27, + 101, 101, 101, 101, 101, 101, 101, 101, 101, 137, 138, 101, 139, 67, 44, 44, + 8, 8, 140, 67, 67, 8, 67, 67, 140, 26, 67, 140, 67, 67, 67, 140, + 67, 67, 67, 67, 67, 67, 67, 8, 67, 140, 140, 67, 67, 67, 67, 67, + 67, 67, 8, 8, 8, 8, 8, 8, 8, 
8, 8, 8, 8, 8, 8, 8, + 67, 67, 67, 67, 4, 4, 67, 67, 8, 67, 67, 67, 141, 142, 67, 67, + 67, 67, 67, 67, 67, 67, 140, 67, 67, 67, 67, 67, 67, 26, 8, 8, + 8, 8, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 8, 8, + 8, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 90, 44, 44, + 67, 67, 67, 90, 44, 44, 44, 44, 27, 27, 27, 27, 27, 27, 67, 67, + 67, 67, 67, 67, 67, 27, 27, 27, 67, 67, 67, 26, 67, 67, 67, 67, + 26, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 8, 8, 8, 8, + 67, 67, 67, 67, 67, 67, 67, 26, 67, 67, 67, 67, 4, 4, 4, 4, + 4, 4, 4, 27, 27, 27, 27, 27, 27, 27, 67, 67, 67, 67, 67, 67, + 8, 8, 125, 143, 8, 8, 8, 8, 8, 8, 8, 4, 4, 4, 4, 4, + 8, 125, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 143, 8, 8, 8, + 8, 8, 8, 8, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 4, 8, + 8, 8, 140, 26, 8, 8, 140, 67, 67, 67, 44, 67, 67, 67, 67, 67, + 67, 67, 67, 44, 67, 67, 67, 67, 67, 67, 67, 67, 67, 44, 56, 67, + 67, 67, 67, 67, 90, 67, 67, 67, 67, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 67, 67, 11, 11, 11, 11, 11, 11, 11, 47, + 16, 16, 16, 16, 16, 16, 16, 108, 32, 11, 32, 34, 34, 34, 34, 11, + 32, 32, 34, 16, 16, 16, 40, 11, 32, 32, 136, 67, 67, 134, 34, 145, + 43, 32, 44, 44, 54, 2, 95, 2, 16, 16, 16, 53, 44, 44, 53, 44, + 36, 36, 36, 36, 44, 44, 44, 52, 64, 44, 44, 44, 44, 44, 44, 58, + 36, 36, 36, 62, 44, 44, 44, 44, 36, 36, 36, 62, 36, 36, 36, 62, + 2, 117, 117, 2, 121, 122, 117, 2, 2, 2, 2, 6, 2, 103, 117, 2, + 117, 4, 4, 4, 4, 2, 2, 86, 2, 2, 2, 2, 2, 116, 2, 2, + 103, 146, 44, 44, 44, 44, 44, 44, 67, 67, 67, 67, 67, 56, 67, 67, + 67, 67, 44, 44, 44, 44, 44, 44, 67, 67, 67, 44, 44, 44, 44, 44, + 67, 67, 67, 67, 67, 67, 44, 44, 1, 2, 147, 148, 4, 4, 4, 4, + 4, 67, 4, 4, 4, 4, 149, 150, 151, 101, 101, 101, 101, 43, 43, 84, + 152, 40, 40, 67, 101, 153, 63, 67, 36, 36, 36, 62, 58, 154, 155, 69, + 36, 36, 36, 36, 36, 63, 40, 69, 44, 44, 81, 36, 36, 36, 36, 36, + 67, 27, 27, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 90, + 27, 27, 27, 27, 27, 67, 67, 67, 67, 67, 
67, 67, 27, 27, 27, 27, + 156, 27, 27, 27, 27, 27, 27, 27, 36, 36, 104, 36, 36, 36, 36, 36, + 36, 36, 36, 36, 36, 36, 157, 2, 7, 7, 7, 7, 7, 36, 44, 44, + 32, 32, 32, 32, 32, 32, 32, 70, 51, 158, 43, 43, 43, 43, 43, 86, + 32, 32, 32, 32, 32, 32, 40, 43, 36, 36, 36, 101, 101, 101, 101, 101, + 43, 2, 2, 2, 44, 44, 44, 44, 41, 41, 41, 155, 40, 40, 40, 40, + 41, 32, 32, 32, 32, 32, 32, 32, 16, 32, 32, 32, 32, 32, 32, 32, + 45, 16, 16, 16, 34, 34, 34, 32, 32, 32, 32, 32, 42, 159, 34, 35, + 32, 32, 16, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 11, 11, 44, + 11, 11, 32, 32, 44, 44, 44, 44, 44, 44, 44, 81, 40, 35, 36, 36, + 36, 71, 36, 71, 36, 70, 36, 36, 36, 93, 85, 83, 67, 67, 44, 44, + 27, 27, 27, 67, 160, 44, 44, 44, 36, 36, 2, 2, 44, 44, 44, 44, + 84, 36, 36, 36, 36, 36, 36, 36, 36, 36, 84, 84, 84, 84, 84, 84, + 84, 84, 80, 44, 44, 44, 44, 2, 43, 36, 36, 36, 2, 72, 72, 44, + 36, 36, 36, 43, 43, 43, 43, 2, 36, 36, 36, 70, 43, 43, 43, 43, + 43, 84, 44, 44, 44, 44, 44, 54, 36, 70, 84, 43, 43, 84, 83, 84, + 161, 2, 2, 2, 2, 2, 2, 52, 7, 7, 7, 7, 7, 44, 44, 2, + 36, 36, 70, 69, 36, 36, 36, 36, 7, 7, 7, 7, 7, 36, 36, 62, + 36, 36, 36, 36, 70, 43, 43, 83, 85, 83, 85, 80, 44, 44, 44, 44, + 36, 70, 36, 36, 36, 36, 83, 44, 7, 7, 7, 7, 7, 44, 2, 2, + 69, 36, 36, 77, 67, 93, 83, 36, 71, 43, 71, 70, 71, 36, 36, 43, + 70, 62, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 81, 104, 2, + 36, 36, 36, 36, 36, 93, 43, 84, 2, 104, 162, 80, 44, 44, 44, 44, + 81, 36, 36, 62, 81, 36, 36, 62, 81, 36, 36, 62, 44, 44, 44, 44, + 16, 16, 16, 16, 16, 110, 40, 40, 16, 16, 16, 44, 44, 44, 44, 44, + 36, 93, 85, 84, 83, 161, 85, 44, 36, 36, 44, 44, 44, 44, 44, 44, + 36, 36, 36, 62, 44, 81, 36, 36, 163, 163, 163, 163, 163, 163, 163, 163, + 164, 164, 164, 164, 164, 164, 164, 164, 16, 16, 16, 108, 44, 44, 44, 44, + 44, 53, 16, 16, 44, 44, 81, 71, 36, 36, 36, 36, 165, 36, 36, 36, + 36, 36, 36, 62, 36, 36, 62, 62, 36, 81, 62, 36, 36, 36, 36, 36, + 36, 41, 41, 41, 41, 41, 41, 41, 41, 44, 44, 44, 44, 44, 44, 44, 
+ 44, 81, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 144, + 44, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 160, 44, + 2, 2, 2, 166, 126, 44, 44, 44, 6, 167, 168, 144, 144, 144, 144, 144, + 144, 144, 126, 166, 126, 2, 123, 169, 2, 64, 2, 2, 149, 144, 144, 126, + 2, 170, 8, 171, 66, 2, 44, 44, 36, 36, 62, 36, 36, 36, 36, 36, + 36, 36, 36, 36, 36, 36, 62, 79, 54, 2, 3, 2, 4, 5, 6, 2, + 16, 16, 16, 16, 16, 17, 18, 125, 126, 4, 2, 36, 36, 36, 36, 36, + 69, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 40, + 44, 36, 36, 36, 44, 36, 36, 36, 44, 36, 36, 36, 44, 36, 62, 44, + 20, 172, 57, 130, 26, 8, 140, 90, 44, 44, 44, 44, 79, 65, 67, 44, + 36, 36, 36, 36, 36, 36, 81, 36, 36, 36, 36, 36, 36, 62, 36, 81, + 2, 64, 44, 173, 27, 27, 27, 27, 27, 27, 44, 56, 67, 67, 67, 67, + 101, 101, 139, 27, 89, 67, 67, 67, 67, 67, 67, 67, 67, 27, 90, 44, + 90, 44, 44, 44, 44, 44, 44, 44, 67, 67, 67, 67, 67, 67, 50, 44, + 174, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 44, 44, + 27, 27, 44, 44, 44, 44, 44, 44, 148, 36, 36, 36, 36, 175, 44, 44, + 36, 36, 36, 43, 43, 80, 44, 44, 36, 36, 36, 36, 36, 36, 36, 54, + 36, 36, 44, 44, 36, 36, 36, 36, 176, 101, 101, 44, 44, 44, 44, 44, + 11, 11, 11, 11, 16, 16, 16, 16, 36, 36, 44, 44, 44, 44, 44, 54, + 36, 36, 36, 44, 62, 36, 36, 36, 36, 36, 36, 81, 62, 44, 62, 81, + 36, 36, 36, 54, 27, 27, 27, 27, 36, 36, 36, 77, 156, 27, 27, 27, + 44, 44, 44, 173, 27, 27, 27, 27, 36, 62, 36, 44, 44, 173, 27, 27, + 36, 36, 36, 27, 27, 27, 44, 54, 36, 36, 36, 36, 36, 44, 44, 54, + 36, 36, 36, 36, 44, 44, 27, 36, 44, 27, 27, 27, 27, 27, 27, 27, + 70, 43, 58, 80, 44, 44, 43, 43, 36, 36, 81, 36, 81, 36, 36, 36, + 36, 36, 44, 44, 43, 80, 44, 58, 27, 27, 27, 27, 44, 44, 44, 44, + 2, 2, 2, 2, 64, 44, 44, 44, 36, 36, 36, 36, 36, 36, 177, 30, + 36, 36, 36, 36, 36, 36, 177, 27, 36, 36, 36, 36, 78, 36, 36, 36, + 36, 36, 70, 80, 44, 173, 27, 27, 2, 2, 2, 64, 44, 44, 44, 44, + 36, 36, 36, 44, 54, 2, 2, 2, 36, 36, 36, 44, 27, 27, 27, 27, + 36, 
62, 44, 44, 27, 27, 27, 27, 36, 44, 44, 44, 54, 2, 64, 44, + 44, 44, 44, 44, 173, 27, 27, 27, 36, 36, 36, 36, 62, 44, 44, 44, + 11, 47, 44, 44, 44, 44, 44, 44, 16, 108, 44, 44, 44, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 96, 85, 94, 36, 36, 36, 36, 36, 36, + 36, 36, 36, 36, 43, 43, 43, 43, 43, 43, 43, 61, 2, 2, 2, 44, + 27, 27, 27, 7, 7, 7, 7, 7, 44, 44, 44, 44, 44, 44, 44, 58, + 84, 85, 43, 83, 85, 61, 178, 2, 2, 44, 44, 44, 44, 44, 44, 44, + 43, 71, 36, 36, 36, 36, 36, 36, 36, 36, 36, 70, 43, 43, 85, 43, + 43, 43, 80, 7, 7, 7, 7, 7, 2, 2, 44, 44, 44, 44, 44, 44, + 36, 70, 2, 62, 44, 44, 44, 44, 36, 93, 84, 43, 43, 43, 43, 83, + 94, 36, 63, 2, 2, 43, 61, 44, 7, 7, 7, 7, 7, 63, 63, 2, + 173, 27, 27, 27, 27, 27, 27, 27, 27, 27, 96, 44, 44, 44, 44, 44, + 36, 36, 36, 36, 36, 36, 84, 85, 43, 84, 83, 43, 2, 2, 2, 44, + 36, 36, 36, 62, 62, 36, 36, 81, 36, 36, 36, 36, 36, 36, 36, 81, + 36, 36, 36, 36, 63, 44, 44, 44, 36, 36, 36, 36, 36, 36, 36, 70, + 84, 85, 43, 43, 43, 80, 44, 44, 43, 84, 81, 36, 36, 36, 62, 81, + 83, 84, 88, 87, 88, 87, 84, 44, 62, 44, 44, 87, 44, 44, 81, 36, + 36, 84, 44, 43, 43, 43, 80, 44, 43, 43, 80, 44, 44, 44, 44, 44, + 84, 85, 43, 43, 83, 83, 84, 85, 83, 43, 36, 72, 44, 44, 44, 44, + 36, 36, 36, 36, 36, 36, 36, 93, 84, 43, 43, 44, 84, 84, 43, 85, + 61, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 36, 36, 43, 44, + 84, 85, 43, 43, 43, 83, 85, 85, 61, 2, 62, 44, 44, 44, 44, 44, + 36, 36, 36, 36, 36, 70, 85, 84, 43, 43, 43, 85, 44, 44, 44, 44, + 36, 36, 36, 36, 36, 44, 58, 43, 84, 43, 43, 85, 43, 43, 44, 44, + 7, 7, 7, 7, 7, 27, 2, 92, 27, 96, 44, 44, 44, 44, 44, 81, + 101, 101, 101, 101, 101, 101, 101, 175, 2, 2, 64, 44, 44, 44, 44, 44, + 43, 43, 61, 44, 44, 44, 44, 44, 43, 43, 43, 61, 2, 2, 67, 67, + 40, 40, 92, 44, 44, 44, 44, 44, 7, 7, 7, 7, 7, 173, 27, 27, + 27, 81, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 44, 44, 81, 36, + 93, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 88, 43, 74, 40, 40, 40, 40, 40, 40, + 
36, 44, 44, 44, 44, 44, 44, 44, 36, 36, 36, 36, 36, 44, 50, 61, + 65, 65, 44, 44, 44, 44, 44, 44, 67, 67, 67, 90, 56, 67, 67, 67, + 67, 67, 179, 85, 43, 67, 179, 84, 84, 180, 65, 65, 65, 181, 43, 43, + 43, 76, 50, 43, 43, 43, 67, 67, 67, 67, 67, 67, 67, 43, 43, 67, + 67, 67, 67, 67, 90, 44, 44, 44, 67, 43, 76, 44, 44, 44, 44, 44, + 27, 44, 44, 44, 44, 44, 44, 44, 11, 11, 11, 11, 11, 16, 16, 16, + 16, 16, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 16, + 16, 16, 108, 16, 16, 16, 16, 16, 11, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 47, 11, 44, 47, 48, 47, 48, 11, 47, 11, + 11, 11, 11, 16, 16, 53, 53, 16, 16, 16, 53, 16, 16, 16, 16, 16, + 16, 16, 11, 48, 11, 47, 48, 11, 11, 11, 47, 11, 11, 11, 47, 16, + 16, 16, 16, 16, 11, 48, 11, 47, 11, 11, 47, 47, 44, 11, 11, 11, + 47, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 11, 11, + 11, 11, 11, 16, 16, 16, 16, 16, 16, 16, 16, 44, 11, 11, 11, 11, + 31, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 33, 16, 16, + 16, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 31, 16, 16, + 16, 16, 33, 16, 16, 16, 11, 11, 11, 11, 31, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 33, 16, 16, 16, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 31, 16, 16, 16, 16, 33, 16, 16, 16, + 11, 11, 11, 11, 31, 16, 16, 16, 16, 33, 16, 16, 16, 32, 44, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 43, 43, 43, 76, 67, 50, 43, 43, + 43, 43, 43, 43, 43, 43, 76, 67, 67, 67, 50, 67, 67, 67, 67, 67, + 67, 67, 76, 21, 2, 2, 44, 44, 44, 44, 44, 44, 44, 58, 43, 43, + 36, 36, 62, 173, 27, 27, 27, 27, 43, 43, 43, 80, 44, 44, 44, 44, + 36, 36, 81, 36, 36, 36, 36, 36, 81, 62, 62, 81, 81, 36, 36, 36, + 36, 62, 36, 36, 81, 81, 44, 44, 44, 62, 44, 81, 81, 81, 81, 36, + 81, 62, 62, 81, 81, 81, 81, 81, 81, 62, 62, 81, 36, 62, 36, 36, + 36, 62, 36, 36, 81, 36, 62, 62, 36, 36, 36, 36, 36, 81, 36, 36, + 81, 36, 81, 36, 36, 81, 36, 36, 8, 44, 44, 44, 44, 44, 44, 44, + 56, 67, 67, 67, 67, 67, 67, 67, 44, 44, 44, 67, 67, 67, 67, 67, + 67, 90, 44, 44, 44, 44, 
44, 44, 67, 67, 67, 67, 67, 25, 41, 41, + 67, 67, 56, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 90, 44, + 67, 67, 90, 44, 44, 44, 44, 44, 67, 67, 67, 67, 44, 44, 44, 44, + 67, 67, 67, 67, 67, 67, 67, 44, 79, 44, 44, 44, 44, 44, 44, 44, + 65, 65, 65, 65, 65, 65, 65, 65, 164, 164, 164, 164, 164, 164, 164, 44, +}; + +static RE_UINT8 re_general_category_stage_5[] = { + 15, 15, 12, 23, 23, 23, 25, 23, 20, 21, 23, 24, 23, 19, 9, 9, + 24, 24, 24, 23, 23, 1, 1, 1, 1, 20, 23, 21, 26, 22, 26, 2, + 2, 2, 2, 20, 24, 21, 24, 15, 25, 25, 27, 23, 26, 27, 5, 28, + 24, 16, 27, 26, 27, 24, 11, 11, 26, 11, 5, 29, 11, 23, 1, 24, + 1, 2, 2, 24, 2, 1, 2, 5, 5, 5, 1, 3, 3, 2, 5, 2, + 4, 4, 26, 26, 4, 26, 6, 6, 0, 0, 4, 2, 1, 23, 1, 0, + 0, 1, 24, 1, 27, 6, 7, 7, 0, 4, 0, 2, 0, 23, 19, 0, + 0, 27, 27, 25, 0, 6, 19, 6, 23, 6, 6, 23, 5, 0, 5, 23, + 23, 0, 16, 16, 23, 25, 27, 27, 16, 0, 4, 5, 5, 6, 6, 5, + 23, 5, 6, 16, 6, 4, 4, 6, 6, 27, 5, 27, 27, 5, 0, 16, + 6, 0, 0, 5, 4, 0, 6, 8, 8, 8, 8, 6, 23, 4, 0, 8, + 8, 0, 11, 27, 27, 0, 0, 25, 23, 27, 5, 8, 8, 5, 23, 11, + 11, 0, 19, 5, 12, 5, 5, 20, 21, 0, 10, 10, 10, 5, 19, 23, + 5, 4, 7, 0, 2, 4, 3, 3, 2, 0, 3, 26, 2, 26, 0, 26, + 1, 26, 26, 0, 12, 12, 12, 16, 19, 19, 28, 29, 20, 28, 13, 14, + 16, 12, 23, 28, 29, 23, 23, 22, 22, 23, 24, 20, 21, 23, 23, 12, + 11, 4, 21, 4, 25, 0, 6, 7, 7, 6, 1, 27, 27, 1, 27, 2, + 2, 27, 10, 1, 2, 10, 10, 11, 24, 27, 27, 20, 21, 27, 21, 24, + 21, 20, 2, 6, 20, 0, 27, 4, 5, 10, 19, 20, 21, 21, 27, 10, + 19, 4, 10, 4, 6, 26, 26, 4, 27, 11, 4, 23, 7, 23, 26, 1, + 25, 27, 8, 23, 4, 8, 18, 18, 17, 17, 5, 24, 23, 20, 19, 22, + 22, 20, 22, 22, 24, 19, 24, 0, 24, 26, 0, 11, 6, 11, 10, 0, + 23, 10, 5, 11, 23, 16, 27, 8, 8, 16, 16, 6, +}; + +/* General_Category: 9628 bytes. 
*/ + +RE_UINT32 re_get_general_category(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 11; + code = ch ^ (f << 11); + pos = (RE_UINT32)re_general_category_stage_1[f] << 4; + f = code >> 7; + code ^= f << 7; + pos = (RE_UINT32)re_general_category_stage_2[pos + f] << 3; + f = code >> 4; + code ^= f << 4; + pos = (RE_UINT32)re_general_category_stage_3[pos + f] << 3; + f = code >> 1; + code ^= f << 1; + pos = (RE_UINT32)re_general_category_stage_4[pos + f] << 1; + value = re_general_category_stage_5[pos + code]; + + return value; +} + +/* Block. */ + +static RE_UINT8 re_block_stage_1[] = { + 0, 1, 2, 3, 4, 5, 5, 5, 5, 5, 6, 7, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 16, 16, 16, 16, 18, 16, 19, 20, 21, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23, 24, 25, 16, 16, 26, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 27, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, +}; + +static RE_UINT8 re_block_stage_2[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 10, 11, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 28, + 29, 30, 31, 31, 32, 32, 32, 33, 34, 34, 34, 34, 34, 35, 36, 37, + 38, 39, 40, 
41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 50, 51, 51, + 52, 53, 54, 55, 56, 56, 57, 57, 58, 59, 60, 61, 62, 62, 63, 64, + 65, 65, 66, 67, 68, 68, 69, 69, 70, 71, 72, 73, 74, 75, 76, 77, + 78, 79, 80, 81, 82, 82, 83, 83, 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 85, 86, 86, 86, 86, + 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, + 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, + 87, 87, 87, 87, 87, 87, 87, 87, 87, 88, 89, 89, 90, 91, 92, 93, + 94, 95, 96, 97, 98, 99, 100, 101, 102, 102, 102, 102, 102, 102, 102, 102, + 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, + 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, + 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 103, + 104, 104, 104, 104, 104, 104, 104, 105, 106, 106, 106, 106, 106, 106, 106, 106, + 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, + 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, + 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, + 107, 107, 108, 108, 108, 108, 109, 110, 110, 110, 110, 110, 111, 112, 113, 114, + 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 119, 126, 126, 126, 119, + 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 119, 119, 137, 119, 119, 119, + 138, 139, 140, 141, 142, 143, 144, 119, 119, 145, 119, 146, 147, 148, 149, 119, + 119, 150, 119, 119, 119, 151, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, + 152, 152, 152, 152, 152, 152, 152, 152, 153, 154, 155, 119, 119, 119, 119, 119, + 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, + 156, 156, 156, 156, 156, 156, 156, 156, 157, 119, 119, 119, 119, 119, 119, 119, + 119, 119, 119, 119, 119, 119, 119, 119, 119, 
119, 119, 119, 119, 119, 119, 119, + 119, 119, 119, 119, 119, 119, 119, 119, 158, 158, 158, 158, 158, 119, 119, 119, + 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, + 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, + 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, + 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, + 159, 159, 159, 159, 160, 161, 162, 163, 119, 119, 119, 119, 119, 119, 164, 165, + 166, 166, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, + 119, 119, 119, 119, 119, 119, 119, 119, 167, 168, 119, 119, 119, 119, 119, 119, + 169, 169, 170, 170, 171, 119, 172, 119, 173, 173, 173, 173, 173, 173, 173, 173, + 174, 174, 174, 174, 174, 175, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, + 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, + 176, 177, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 178, 178, 119, 119, + 179, 180, 181, 181, 182, 182, 183, 183, 183, 183, 183, 183, 184, 185, 186, 187, + 188, 188, 189, 189, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, + 190, 190, 190, 190, 190, 190, 190, 190, 190, 190, 190, 190, 190, 190, 190, 190, + 190, 190, 190, 190, 190, 190, 190, 190, 190, 190, 190, 190, 190, 190, 190, 190, + 190, 190, 190, 190, 190, 190, 190, 190, 190, 190, 190, 190, 190, 191, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 193, 194, + 195, 196, 196, 196, 196, 196, 196, 196, 196, 196, 196, 196, 196, 196, 196, 196, + 196, 196, 196, 196, 196, 196, 196, 196, 196, 196, 196, 196, 196, 196, 196, 196, + 196, 196, 196, 196, 196, 196, 196, 196, 196, 196, 196, 196, 196, 197, 119, 119, + 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, + 198, 198, 198, 198, 199, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 
119, + 200, 119, 201, 202, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, + 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, + 203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 203, + 203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 203, + 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, + 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, +}; + +static RE_UINT16 re_block_stage_3[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, + 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 10, 10, 10, 10, 10, + 10, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, + 13, 13, 13, 13, 13, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, + 17, 17, 17, 17, 18, 18, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, + 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, + 23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, + 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26, 26, + 27, 27, 27, 27, 27, 27, 27, 27, 28, 28, 28, 28, 28, 28, 28, 28, + 29, 29, 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 30, 30, 30, 30, + 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 36, 36, 36, 36, 36, 36, + 37, 37, 37, 37, 37, 37, 37, 37, 38, 38, 39, 39, 39, 39, 39, 39, + 40, 40, 40, 40, 40, 40, 40, 40, 41, 41, 42, 42, 42, 42, 42, 42, + 43, 43, 44, 44, 45, 45, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 49, 49, 49, 49, 49, + 50, 50, 50, 50, 50, 51, 51, 51, 52, 52, 52, 52, 52, 52, 53, 53, + 54, 54, 55, 55, 55, 55, 55, 55, 55, 55, 55, 56, 56, 56, 56, 56, + 57, 57, 57, 57, 57, 57, 57, 57, 58, 58, 58, 58, 59, 59, 59, 59, + 60, 60, 
60, 60, 60, 61, 61, 61, 19, 19, 19, 19, 62, 63, 63, 63, + 64, 64, 64, 64, 64, 64, 64, 64, 65, 65, 65, 65, 66, 66, 66, 66, + 67, 67, 67, 67, 67, 67, 67, 67, 68, 68, 68, 68, 68, 68, 68, 68, + 69, 69, 69, 69, 69, 69, 69, 70, 70, 70, 71, 71, 71, 72, 72, 72, + 73, 73, 73, 73, 73, 74, 74, 74, 74, 75, 75, 75, 75, 75, 75, 75, + 76, 76, 76, 76, 76, 76, 76, 76, 77, 77, 77, 77, 77, 77, 77, 77, + 78, 78, 78, 78, 79, 79, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, + 81, 81, 81, 81, 81, 81, 81, 81, 82, 82, 83, 83, 83, 83, 83, 83, + 84, 84, 84, 84, 84, 84, 84, 84, 85, 85, 85, 85, 85, 85, 85, 85, + 85, 85, 85, 85, 86, 86, 86, 87, 88, 88, 88, 88, 88, 88, 88, 88, + 89, 89, 89, 89, 89, 89, 89, 89, 90, 90, 90, 90, 90, 90, 90, 90, + 91, 91, 91, 91, 91, 91, 91, 91, 92, 92, 92, 92, 92, 92, 92, 92, + 93, 93, 93, 93, 93, 93, 94, 94, 95, 95, 95, 95, 95, 95, 95, 95, + 96, 96, 96, 97, 97, 97, 97, 97, 98, 98, 98, 98, 98, 98, 99, 99, + 100, 100, 100, 100, 100, 100, 100, 100, 101, 101, 101, 101, 101, 101, 101, 101, + 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 19, 103, + 104, 104, 104, 104, 105, 105, 105, 105, 105, 105, 106, 106, 106, 106, 106, 106, + 107, 107, 107, 108, 108, 108, 108, 108, 108, 109, 110, 110, 111, 111, 111, 112, + 113, 113, 113, 113, 113, 113, 113, 113, 114, 114, 114, 114, 114, 114, 114, 114, + 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 116, 116, 116, 116, + 117, 117, 117, 117, 117, 117, 117, 117, 118, 118, 118, 118, 118, 118, 118, 118, + 118, 119, 119, 119, 119, 120, 120, 120, 121, 121, 121, 121, 121, 121, 121, 121, + 121, 121, 121, 121, 122, 122, 122, 122, 122, 122, 123, 123, 123, 123, 123, 123, + 124, 124, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, + 126, 126, 126, 127, 128, 128, 128, 128, 129, 129, 129, 129, 129, 129, 130, 130, + 131, 131, 131, 132, 132, 132, 133, 133, 134, 134, 134, 134, 134, 134, 135, 135, + 136, 136, 136, 136, 136, 136, 137, 137, 138, 138, 138, 138, 138, 138, 139, 139, + 140, 140, 140, 
141, 141, 141, 141, 142, 142, 142, 142, 142, 143, 143, 143, 143, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 145, 145, 145, 145, 145, + 146, 146, 146, 146, 146, 146, 146, 146, 147, 147, 147, 147, 147, 147, 147, 147, + 148, 148, 148, 148, 148, 148, 148, 148, 149, 149, 149, 149, 149, 149, 149, 149, + 150, 150, 150, 150, 150, 150, 150, 150, 151, 151, 151, 151, 151, 152, 152, 152, + 152, 152, 152, 152, 152, 152, 152, 152, 153, 154, 155, 156, 156, 157, 157, 158, + 158, 158, 158, 158, 158, 158, 158, 158, 159, 159, 159, 159, 159, 159, 159, 159, + 159, 159, 159, 159, 159, 159, 159, 160, 161, 161, 161, 161, 161, 161, 161, 161, + 162, 162, 162, 162, 162, 162, 162, 162, 163, 163, 163, 163, 164, 164, 164, 164, + 164, 165, 165, 165, 165, 166, 166, 166, 19, 19, 19, 19, 19, 19, 19, 19, + 167, 167, 168, 168, 168, 168, 169, 169, 170, 170, 170, 171, 171, 172, 172, 172, + 173, 173, 174, 174, 174, 174, 19, 19, 175, 175, 175, 175, 175, 176, 176, 176, + 177, 177, 177, 19, 19, 19, 19, 19, 178, 178, 178, 179, 179, 179, 179, 19, + 180, 180, 180, 180, 180, 180, 180, 180, 181, 181, 181, 181, 182, 182, 183, 183, + 184, 184, 184, 19, 19, 19, 185, 185, 186, 186, 187, 187, 19, 19, 19, 19, + 188, 188, 189, 189, 189, 189, 189, 189, 190, 190, 190, 190, 190, 190, 191, 191, + 192, 192, 19, 19, 193, 193, 193, 193, 194, 194, 194, 194, 195, 195, 196, 196, + 197, 197, 197, 19, 19, 19, 19, 19, 198, 198, 198, 198, 198, 19, 19, 19, + 199, 199, 199, 199, 199, 199, 199, 199, 19, 19, 19, 19, 19, 19, 200, 200, + 201, 201, 201, 201, 201, 201, 201, 201, 202, 202, 202, 202, 202, 203, 203, 203, + 204, 204, 204, 204, 204, 205, 205, 205, 206, 206, 206, 206, 206, 206, 207, 207, + 208, 208, 208, 208, 208, 19, 19, 19, 209, 209, 209, 210, 210, 210, 210, 210, + 211, 211, 211, 211, 211, 211, 211, 211, 212, 212, 212, 212, 212, 212, 19, 19, + 213, 213, 213, 213, 213, 213, 213, 213, 214, 214, 214, 214, 214, 214, 19, 19, + 215, 215, 215, 215, 215, 19, 19, 19, 216, 216, 216, 216, 19, 19, 19, 19, + 19, 19, 217, 217, 
217, 217, 217, 217, 19, 19, 19, 19, 218, 218, 218, 218, + 219, 219, 219, 219, 219, 219, 219, 219, 220, 220, 220, 220, 220, 220, 220, 220, + 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 19, 19, 19, + 222, 222, 222, 222, 222, 222, 222, 222, 222, 222, 222, 19, 19, 19, 19, 19, + 223, 223, 223, 223, 223, 223, 223, 223, 224, 224, 224, 224, 224, 224, 224, 224, + 224, 224, 224, 224, 225, 225, 225, 19, 19, 19, 19, 19, 19, 226, 226, 226, + 227, 227, 227, 227, 227, 227, 227, 227, 227, 19, 19, 19, 19, 19, 19, 19, + 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 19, 19, 19, 19, 19, 19, + 229, 229, 229, 229, 229, 229, 229, 229, 230, 230, 230, 230, 230, 230, 230, 230, + 230, 230, 231, 19, 19, 19, 19, 19, 232, 232, 232, 232, 232, 232, 232, 232, + 233, 233, 233, 233, 233, 233, 233, 233, 234, 234, 234, 234, 234, 19, 19, 19, + 235, 235, 235, 235, 235, 235, 236, 236, 237, 237, 237, 237, 237, 237, 237, 237, + 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 19, 19, 19, 19, 19, + 239, 239, 239, 239, 239, 239, 239, 239, 239, 239, 239, 239, 239, 239, 19, 19, + 240, 240, 240, 240, 240, 240, 240, 240, 241, 241, 241, 242, 242, 242, 242, 242, + 242, 242, 243, 243, 243, 243, 243, 243, 244, 244, 244, 244, 244, 244, 244, 244, + 245, 245, 245, 245, 245, 245, 245, 245, 246, 246, 246, 246, 246, 246, 246, 246, + 247, 247, 247, 247, 247, 248, 248, 248, 249, 249, 249, 249, 249, 249, 249, 249, + 250, 250, 250, 250, 250, 250, 250, 250, 251, 251, 251, 251, 251, 251, 251, 251, + 252, 252, 252, 252, 252, 252, 252, 252, 253, 253, 253, 253, 253, 253, 253, 253, + 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 19, 19, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 257, 257, 257, 257, 257, 257, + 257, 257, 257, 257, 257, 257, 257, 257, 257, 257, 257, 19, 19, 19, 19, 19, + 258, 258, 258, 258, 258, 258, 258, 258, 258, 258, 19, 19, 19, 19, 19, 19, + 259, 259, 259, 259, 259, 
259, 259, 259, 260, 260, 260, 260, 260, 260, 260, 260, + 260, 260, 260, 260, 260, 260, 260, 19, 261, 261, 261, 261, 261, 261, 261, 261, + 262, 262, 262, 262, 262, 262, 262, 262, +}; + +static RE_UINT16 re_block_stage_4[] = { + 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, + 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, + 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11, + 12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, + 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19, + 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23, + 24, 24, 24, 24, 25, 25, 25, 25, 26, 26, 26, 26, 27, 27, 27, 27, + 28, 28, 28, 28, 29, 29, 29, 29, 30, 30, 30, 30, 31, 31, 31, 31, + 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, + 36, 36, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 39, 39, 39, + 40, 40, 40, 40, 41, 41, 41, 41, 42, 42, 42, 42, 43, 43, 43, 43, + 44, 44, 44, 44, 45, 45, 45, 45, 46, 46, 46, 46, 47, 47, 47, 47, + 48, 48, 48, 48, 49, 49, 49, 49, 50, 50, 50, 50, 51, 51, 51, 51, + 52, 52, 52, 52, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 55, 55, + 56, 56, 56, 56, 57, 57, 57, 57, 58, 58, 58, 58, 59, 59, 59, 59, + 60, 60, 60, 60, 61, 61, 61, 61, 62, 62, 62, 62, 63, 63, 63, 63, + 64, 64, 64, 64, 65, 65, 65, 65, 66, 66, 66, 66, 67, 67, 67, 67, + 68, 68, 68, 68, 69, 69, 69, 69, 70, 70, 70, 70, 71, 71, 71, 71, + 72, 72, 72, 72, 73, 73, 73, 73, 74, 74, 74, 74, 75, 75, 75, 75, + 76, 76, 76, 76, 77, 77, 77, 77, 78, 78, 78, 78, 79, 79, 79, 79, + 80, 80, 80, 80, 81, 81, 81, 81, 82, 82, 82, 82, 83, 83, 83, 83, + 84, 84, 84, 84, 85, 85, 85, 85, 86, 86, 86, 86, 87, 87, 87, 87, + 88, 88, 88, 88, 89, 89, 89, 89, 90, 90, 90, 90, 91, 91, 91, 91, + 92, 92, 92, 92, 93, 93, 93, 93, 94, 94, 94, 94, 95, 95, 95, 95, + 96, 96, 96, 96, 97, 97, 97, 97, 98, 98, 98, 98, 99, 99, 99, 99, + 100, 100, 100, 100, 101, 101, 101, 101, 102, 102, 102, 102, 103, 103, 103, 103, + 104, 104, 104, 104, 105, 105, 105, 105, 106, 106, 106, 106, 107, 107, 107, 107, 
+ 108, 108, 108, 108, 109, 109, 109, 109, 110, 110, 110, 110, 111, 111, 111, 111, + 112, 112, 112, 112, 113, 113, 113, 113, 114, 114, 114, 114, 115, 115, 115, 115, + 116, 116, 116, 116, 117, 117, 117, 117, 118, 118, 118, 118, 119, 119, 119, 119, + 120, 120, 120, 120, 121, 121, 121, 121, 122, 122, 122, 122, 123, 123, 123, 123, + 124, 124, 124, 124, 125, 125, 125, 125, 126, 126, 126, 126, 127, 127, 127, 127, + 128, 128, 128, 128, 129, 129, 129, 129, 130, 130, 130, 130, 131, 131, 131, 131, + 132, 132, 132, 132, 133, 133, 133, 133, 134, 134, 134, 134, 135, 135, 135, 135, + 136, 136, 136, 136, 137, 137, 137, 137, 138, 138, 138, 138, 139, 139, 139, 139, + 140, 140, 140, 140, 141, 141, 141, 141, 142, 142, 142, 142, 143, 143, 143, 143, + 144, 144, 144, 144, 145, 145, 145, 145, 146, 146, 146, 146, 147, 147, 147, 147, + 148, 148, 148, 148, 149, 149, 149, 149, 150, 150, 150, 150, 151, 151, 151, 151, + 152, 152, 152, 152, 153, 153, 153, 153, 154, 154, 154, 154, 155, 155, 155, 155, + 156, 156, 156, 156, 157, 157, 157, 157, 158, 158, 158, 158, 159, 159, 159, 159, + 160, 160, 160, 160, 161, 161, 161, 161, 162, 162, 162, 162, 163, 163, 163, 163, + 164, 164, 164, 164, 165, 165, 165, 165, 166, 166, 166, 166, 167, 167, 167, 167, + 168, 168, 168, 168, 169, 169, 169, 169, 170, 170, 170, 170, 171, 171, 171, 171, + 172, 172, 172, 172, 173, 173, 173, 173, 174, 174, 174, 174, 175, 175, 175, 175, + 176, 176, 176, 176, 177, 177, 177, 177, 178, 178, 178, 178, 179, 179, 179, 179, + 180, 180, 180, 180, 181, 181, 181, 181, 182, 182, 182, 182, 183, 183, 183, 183, + 184, 184, 184, 184, 185, 185, 185, 185, 186, 186, 186, 186, 187, 187, 187, 187, + 188, 188, 188, 188, 189, 189, 189, 189, 190, 190, 190, 190, 191, 191, 191, 191, + 192, 192, 192, 192, 193, 193, 193, 193, 194, 194, 194, 194, 195, 195, 195, 195, + 196, 196, 196, 196, 197, 197, 197, 197, 198, 198, 198, 198, 199, 199, 199, 199, + 200, 200, 200, 200, 201, 201, 201, 201, 202, 202, 202, 202, 203, 203, 203, 203, + 204, 204, 204, 204, 205, 205, 
205, 205, 206, 206, 206, 206, 207, 207, 207, 207, + 208, 208, 208, 208, 209, 209, 209, 209, 210, 210, 210, 210, 211, 211, 211, 211, + 212, 212, 212, 212, 213, 213, 213, 213, 214, 214, 214, 214, 215, 215, 215, 215, + 216, 216, 216, 216, 217, 217, 217, 217, 218, 218, 218, 218, 219, 219, 219, 219, + 220, 220, 220, 220, 221, 221, 221, 221, 222, 222, 222, 222, 223, 223, 223, 223, + 224, 224, 224, 224, 225, 225, 225, 225, 226, 226, 226, 226, 227, 227, 227, 227, + 228, 228, 228, 228, 229, 229, 229, 229, 230, 230, 230, 230, 231, 231, 231, 231, + 232, 232, 232, 232, 233, 233, 233, 233, 234, 234, 234, 234, 235, 235, 235, 235, + 236, 236, 236, 236, 237, 237, 237, 237, 238, 238, 238, 238, 239, 239, 239, 239, + 240, 240, 240, 240, 241, 241, 241, 241, 242, 242, 242, 242, 243, 243, 243, 243, + 244, 244, 244, 244, 245, 245, 245, 245, 246, 246, 246, 246, 247, 247, 247, 247, + 248, 248, 248, 248, 249, 249, 249, 249, 250, 250, 250, 250, 251, 251, 251, 251, + 252, 252, 252, 252, 253, 253, 253, 253, 254, 254, 254, 254, 255, 255, 255, 255, + 256, 256, 256, 256, 257, 257, 257, 257, 258, 258, 258, 258, 259, 259, 259, 259, + 260, 260, 260, 260, 261, 261, 261, 261, 262, 262, 262, 262, +}; + +static RE_UINT16 re_block_stage_5[] = { + 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, + 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, + 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, + 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, + 17, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19, 0, 0, 0, 0, + 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23, + 24, 24, 24, 24, 25, 25, 25, 25, 26, 26, 26, 26, 27, 27, 27, 27, + 28, 28, 28, 28, 29, 29, 29, 29, 30, 30, 30, 30, 31, 31, 31, 31, + 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, + 36, 36, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 39, 39, 39, + 40, 40, 40, 40, 41, 41, 41, 41, 42, 42, 42, 42, 43, 43, 43, 43, + 44, 44, 44, 44, 45, 45, 45, 45, 46, 46, 46, 46, 47, 47, 47, 47, + 48, 48, 48, 48, 49, 
49, 49, 49, 50, 50, 50, 50, 51, 51, 51, 51, + 52, 52, 52, 52, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 55, 55, + 56, 56, 56, 56, 57, 57, 57, 57, 58, 58, 58, 58, 59, 59, 59, 59, + 60, 60, 60, 60, 61, 61, 61, 61, 62, 62, 62, 62, 63, 63, 63, 63, + 64, 64, 64, 64, 65, 65, 65, 65, 66, 66, 66, 66, 67, 67, 67, 67, + 68, 68, 68, 68, 69, 69, 69, 69, 70, 70, 70, 70, 71, 71, 71, 71, + 72, 72, 72, 72, 73, 73, 73, 73, 74, 74, 74, 74, 75, 75, 75, 75, + 76, 76, 76, 76, 77, 77, 77, 77, 78, 78, 78, 78, 79, 79, 79, 79, + 80, 80, 80, 80, 81, 81, 81, 81, 82, 82, 82, 82, 83, 83, 83, 83, + 84, 84, 84, 84, 85, 85, 85, 85, 86, 86, 86, 86, 87, 87, 87, 87, + 88, 88, 88, 88, 89, 89, 89, 89, 90, 90, 90, 90, 91, 91, 91, 91, + 92, 92, 92, 92, 93, 93, 93, 93, 94, 94, 94, 94, 95, 95, 95, 95, + 96, 96, 96, 96, 97, 97, 97, 97, 98, 98, 98, 98, 99, 99, 99, 99, + 100, 100, 100, 100, 101, 101, 101, 101, 102, 102, 102, 102, 103, 103, 103, 103, + 104, 104, 104, 104, 105, 105, 105, 105, 106, 106, 106, 106, 107, 107, 107, 107, + 108, 108, 108, 108, 109, 109, 109, 109, 110, 110, 110, 110, 111, 111, 111, 111, + 112, 112, 112, 112, 113, 113, 113, 113, 114, 114, 114, 114, 115, 115, 115, 115, + 116, 116, 116, 116, 117, 117, 117, 117, 118, 118, 118, 118, 119, 119, 119, 119, + 120, 120, 120, 120, 121, 121, 121, 121, 122, 122, 122, 122, 123, 123, 123, 123, + 124, 124, 124, 124, 125, 125, 125, 125, 126, 126, 126, 126, 127, 127, 127, 127, + 128, 128, 128, 128, 129, 129, 129, 129, 130, 130, 130, 130, 131, 131, 131, 131, + 132, 132, 132, 132, 133, 133, 133, 133, 134, 134, 134, 134, 135, 135, 135, 135, + 136, 136, 136, 136, 137, 137, 137, 137, 138, 138, 138, 138, 139, 139, 139, 139, + 140, 140, 140, 140, 141, 141, 141, 141, 142, 142, 142, 142, 143, 143, 143, 143, + 144, 144, 144, 144, 145, 145, 145, 145, 146, 146, 146, 146, 147, 147, 147, 147, + 148, 148, 148, 148, 149, 149, 149, 149, 150, 150, 150, 150, 151, 151, 151, 151, + 152, 152, 152, 152, 153, 153, 153, 153, 154, 154, 154, 154, 155, 155, 155, 155, + 156, 156, 
156, 156, 157, 157, 157, 157, 158, 158, 158, 158, 159, 159, 159, 159, + 160, 160, 160, 160, 161, 161, 161, 161, 162, 162, 162, 162, 163, 163, 163, 163, + 164, 164, 164, 164, 165, 165, 165, 165, 166, 166, 166, 166, 167, 167, 167, 167, + 168, 168, 168, 168, 169, 169, 169, 169, 170, 170, 170, 170, 171, 171, 171, 171, + 172, 172, 172, 172, 173, 173, 173, 173, 174, 174, 174, 174, 175, 175, 175, 175, + 176, 176, 176, 176, 177, 177, 177, 177, 178, 178, 178, 178, 179, 179, 179, 179, + 180, 180, 180, 180, 181, 181, 181, 181, 182, 182, 182, 182, 183, 183, 183, 183, + 184, 184, 184, 184, 185, 185, 185, 185, 186, 186, 186, 186, 187, 187, 187, 187, + 188, 188, 188, 188, 189, 189, 189, 189, 190, 190, 190, 190, 191, 191, 191, 191, + 192, 192, 192, 192, 193, 193, 193, 193, 194, 194, 194, 194, 195, 195, 195, 195, + 196, 196, 196, 196, 197, 197, 197, 197, 198, 198, 198, 198, 199, 199, 199, 199, + 200, 200, 200, 200, 201, 201, 201, 201, 202, 202, 202, 202, 203, 203, 203, 203, + 204, 204, 204, 204, 205, 205, 205, 205, 206, 206, 206, 206, 207, 207, 207, 207, + 208, 208, 208, 208, 209, 209, 209, 209, 210, 210, 210, 210, 211, 211, 211, 211, + 212, 212, 212, 212, 213, 213, 213, 213, 214, 214, 214, 214, 215, 215, 215, 215, + 216, 216, 216, 216, 217, 217, 217, 217, 218, 218, 218, 218, 219, 219, 219, 219, + 220, 220, 220, 220, 221, 221, 221, 221, 222, 222, 222, 222, 223, 223, 223, 223, + 224, 224, 224, 224, 225, 225, 225, 225, 226, 226, 226, 226, 227, 227, 227, 227, + 228, 228, 228, 228, 229, 229, 229, 229, 230, 230, 230, 230, 231, 231, 231, 231, + 232, 232, 232, 232, 233, 233, 233, 233, 234, 234, 234, 234, 235, 235, 235, 235, + 236, 236, 236, 236, 237, 237, 237, 237, 238, 238, 238, 238, 239, 239, 239, 239, + 240, 240, 240, 240, 241, 241, 241, 241, 242, 242, 242, 242, 243, 243, 243, 243, + 244, 244, 244, 244, 245, 245, 245, 245, 246, 246, 246, 246, 247, 247, 247, 247, + 248, 248, 248, 248, 249, 249, 249, 249, 250, 250, 250, 250, 251, 251, 251, 251, + 252, 252, 252, 252, 253, 253, 253, 253, 
254, 254, 254, 254, 255, 255, 255, 255, + 256, 256, 256, 256, 257, 257, 257, 257, 258, 258, 258, 258, 259, 259, 259, 259, + 260, 260, 260, 260, 261, 261, 261, 261, 262, 262, 262, 262, +}; + +/* Block: 8720 bytes. */ + +RE_UINT32 re_get_block(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 12; + code = ch ^ (f << 12); + pos = (RE_UINT32)re_block_stage_1[f] << 5; + f = code >> 7; + code ^= f << 7; + pos = (RE_UINT32)re_block_stage_2[pos + f] << 3; + f = code >> 4; + code ^= f << 4; + pos = (RE_UINT32)re_block_stage_3[pos + f] << 2; + f = code >> 2; + code ^= f << 2; + pos = (RE_UINT32)re_block_stage_4[pos + f] << 2; + value = re_block_stage_5[pos + code]; + + return value; +} + +/* Script. */ + +static RE_UINT8 re_script_stage_1[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 9, 10, 11, 12, 12, 12, 12, 13, 14, 14, 14, 14, 15, + 16, 17, 18, 19, 20, 14, 21, 14, 22, 14, 14, 14, 14, 23, 14, 14, + 14, 14, 14, 14, 14, 14, 24, 25, 14, 14, 26, 27, 14, 28, 29, 30, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 31, 7, 32, 33, 7, 34, 14, 14, 14, 14, 14, 35, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 
14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 36, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, +}; + +static RE_UINT8 re_script_stage_2[] = { + 0, 1, 2, 2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 32, 33, 34, 35, 36, 37, 37, 37, 37, 37, 38, 39, 40, + 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 2, 2, 53, 54, + 55, 56, 57, 58, 59, 59, 59, 60, 61, 59, 59, 59, 59, 59, 59, 59, + 62, 62, 59, 59, 59, 59, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, + 73, 74, 75, 76, 77, 78, 79, 59, 71, 71, 71, 71, 71, 71, 71, 71, + 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, + 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 80, 71, 71, 71, 71, + 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 81, + 82, 82, 82, 82, 82, 82, 82, 82, 82, 83, 84, 84, 85, 86, 87, 88, + 89, 90, 91, 92, 93, 94, 95, 96, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 97, + 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 
98, 98, 98, 98, 98, 98, + 98, 98, 71, 71, 99, 100, 101, 102, 103, 103, 104, 105, 106, 107, 108, 109, + 110, 111, 112, 113, 98, 114, 115, 116, 117, 118, 119, 98, 120, 120, 121, 98, + 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 98, 98, 132, 98, 98, 98, + 133, 134, 135, 136, 137, 138, 139, 98, 98, 140, 98, 141, 142, 143, 144, 98, + 98, 145, 98, 98, 98, 146, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, + 147, 147, 147, 147, 147, 147, 147, 148, 149, 147, 150, 98, 98, 98, 98, 98, + 151, 151, 151, 151, 151, 151, 151, 151, 152, 98, 98, 98, 98, 98, 98, 98, + 98, 98, 98, 98, 98, 98, 98, 98, 153, 153, 153, 153, 154, 98, 98, 98, + 155, 155, 155, 155, 156, 157, 158, 159, 98, 98, 98, 98, 98, 98, 160, 161, + 162, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, + 98, 98, 98, 98, 98, 98, 98, 98, 163, 164, 98, 98, 98, 98, 98, 98, + 59, 165, 166, 167, 168, 98, 169, 98, 170, 171, 172, 59, 59, 173, 59, 174, + 175, 175, 175, 175, 175, 176, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, + 177, 178, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 179, 180, 98, 98, + 181, 182, 183, 184, 185, 98, 59, 59, 59, 59, 186, 187, 59, 188, 189, 190, + 191, 192, 193, 194, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, + 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 195, 71, 71, + 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 196, 71, + 197, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, + 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 198, 98, 98, + 71, 71, 71, 71, 199, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, + 200, 98, 201, 202, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, +}; + +static RE_UINT16 re_script_stage_3[] = { + 0, 0, 0, 0, 1, 2, 1, 2, 0, 0, 3, 3, 4, 5, 4, 5, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 6, 0, 0, 7, 0, + 8, 8, 8, 8, 8, 8, 8, 9, 10, 11, 12, 11, 11, 11, 13, 11, + 14, 14, 14, 14, 14, 14, 14, 14, 15, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 16, 17, 18, 16, 17, 19, 20, 21, 21, 22, 21, 23, 24, + 25, 26, 27, 27, 28, 29, 27, 30, 27, 27, 27, 27, 27, 
31, 27, 27, + 32, 33, 33, 33, 34, 27, 27, 27, 35, 35, 35, 36, 37, 37, 37, 38, + 39, 39, 40, 41, 42, 43, 44, 44, 44, 44, 27, 45, 44, 44, 46, 27, + 47, 47, 47, 47, 47, 48, 49, 47, 50, 51, 52, 53, 54, 55, 56, 57, + 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, + 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, + 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, + 122, 123, 123, 124, 123, 125, 44, 44, 126, 127, 128, 129, 130, 131, 44, 44, + 132, 132, 132, 132, 133, 132, 134, 135, 132, 133, 132, 136, 136, 137, 44, 44, + 138, 138, 138, 138, 138, 138, 138, 138, 138, 138, 139, 139, 140, 139, 139, 141, + 142, 142, 142, 142, 142, 142, 142, 142, 143, 143, 143, 143, 144, 145, 143, 143, + 144, 143, 143, 146, 147, 148, 143, 143, 143, 147, 143, 143, 143, 149, 143, 150, + 143, 151, 152, 152, 152, 152, 152, 153, 154, 154, 154, 154, 154, 154, 154, 154, + 155, 156, 157, 157, 157, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, + 168, 168, 168, 168, 168, 169, 170, 170, 171, 172, 173, 173, 173, 173, 173, 174, + 173, 173, 175, 154, 154, 154, 154, 176, 177, 178, 179, 179, 180, 181, 182, 183, + 184, 184, 185, 184, 186, 187, 168, 168, 188, 189, 190, 190, 190, 191, 190, 192, + 193, 193, 194, 195, 44, 44, 44, 44, 196, 196, 196, 196, 197, 196, 196, 198, + 199, 199, 199, 199, 200, 200, 200, 201, 202, 202, 202, 203, 204, 205, 205, 205, + 44, 44, 44, 44, 206, 207, 208, 209, 4, 4, 210, 4, 4, 211, 212, 213, + 4, 4, 4, 214, 8, 8, 8, 215, 11, 216, 11, 11, 216, 217, 11, 218, + 11, 11, 11, 219, 219, 220, 11, 221, 222, 0, 0, 0, 0, 0, 223, 224, + 225, 226, 0, 225, 44, 8, 8, 227, 0, 0, 228, 229, 230, 0, 4, 4, + 231, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 232, 0, 0, 233, 44, 232, 44, 0, 0, + 234, 234, 234, 234, 234, 234, 234, 234, 0, 0, 0, 0, 0, 0, 0, 235, + 0, 236, 0, 237, 238, 239, 240, 44, 241, 241, 242, 241, 241, 242, 4, 4, + 243, 
243, 243, 243, 243, 243, 243, 244, 139, 139, 140, 245, 245, 245, 246, 247, + 143, 248, 249, 249, 249, 249, 14, 14, 0, 0, 0, 0, 250, 44, 44, 44, + 251, 252, 251, 251, 251, 251, 251, 253, 251, 251, 251, 251, 251, 251, 251, 251, + 251, 251, 251, 251, 251, 254, 44, 255, 256, 0, 257, 258, 259, 260, 260, 260, + 260, 261, 262, 263, 263, 263, 263, 264, 265, 266, 267, 268, 142, 142, 142, 142, + 269, 0, 266, 270, 0, 0, 271, 263, 142, 269, 0, 0, 0, 0, 142, 272, + 0, 0, 0, 0, 0, 263, 263, 273, 263, 263, 263, 263, 263, 274, 0, 0, + 251, 251, 251, 254, 0, 0, 0, 0, 251, 251, 251, 251, 251, 254, 44, 44, + 275, 275, 275, 275, 275, 275, 275, 275, 276, 275, 275, 275, 277, 278, 278, 278, + 279, 279, 279, 279, 279, 279, 279, 279, 279, 279, 280, 44, 14, 14, 14, 14, + 14, 14, 281, 281, 281, 281, 281, 282, 0, 0, 283, 4, 4, 4, 4, 4, + 284, 4, 285, 286, 44, 44, 44, 287, 288, 288, 289, 290, 291, 291, 291, 292, + 293, 293, 293, 293, 294, 295, 47, 296, 297, 297, 298, 299, 299, 300, 142, 301, + 302, 302, 302, 302, 303, 304, 138, 305, 306, 306, 306, 307, 308, 309, 138, 138, + 310, 310, 310, 310, 311, 312, 313, 314, 315, 316, 249, 4, 4, 317, 318, 152, + 152, 152, 152, 152, 313, 313, 319, 320, 142, 142, 321, 142, 322, 142, 142, 323, + 44, 44, 44, 44, 44, 44, 44, 44, 251, 251, 251, 251, 251, 251, 324, 251, + 251, 251, 251, 251, 251, 325, 44, 44, 326, 327, 21, 328, 329, 27, 27, 27, + 27, 27, 27, 27, 330, 46, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 331, 44, 27, 27, 27, 27, 332, 27, 27, 333, 44, 44, 334, + 8, 290, 335, 0, 0, 336, 337, 338, 27, 27, 27, 27, 27, 27, 27, 339, + 340, 0, 1, 2, 1, 2, 341, 262, 263, 342, 142, 269, 343, 344, 345, 346, + 347, 348, 349, 350, 351, 351, 44, 44, 348, 348, 348, 348, 348, 348, 348, 352, + 353, 0, 0, 354, 11, 11, 11, 11, 355, 255, 356, 44, 44, 0, 0, 357, + 358, 359, 360, 360, 360, 361, 362, 255, 363, 363, 364, 365, 366, 367, 367, 368, + 369, 370, 371, 371, 372, 373, 44, 44, 374, 374, 374, 374, 374, 375, 375, 375, + 376, 377, 378, 44, 44, 44, 44, 44, 379, 
379, 380, 381, 381, 381, 382, 44, + 383, 383, 383, 383, 383, 383, 383, 383, 383, 383, 383, 384, 383, 385, 386, 44, + 387, 388, 388, 389, 390, 391, 392, 392, 393, 394, 395, 44, 44, 44, 396, 397, + 398, 399, 400, 401, 44, 44, 44, 44, 402, 402, 403, 404, 403, 405, 403, 403, + 406, 407, 408, 409, 410, 411, 412, 412, 413, 413, 44, 44, 414, 414, 415, 416, + 417, 417, 417, 418, 419, 420, 421, 422, 423, 424, 425, 44, 44, 44, 44, 44, + 426, 426, 426, 426, 427, 44, 44, 44, 428, 428, 428, 429, 428, 428, 428, 430, + 44, 44, 44, 44, 44, 44, 27, 431, 432, 432, 432, 432, 433, 434, 432, 435, + 436, 436, 436, 436, 437, 438, 439, 440, 441, 441, 441, 442, 443, 444, 444, 445, + 446, 446, 446, 446, 447, 446, 448, 449, 450, 451, 450, 452, 44, 44, 44, 44, + 453, 454, 455, 456, 456, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, + 467, 467, 467, 467, 468, 469, 44, 44, 470, 470, 470, 471, 470, 472, 44, 44, + 473, 473, 473, 473, 474, 475, 44, 44, 476, 476, 476, 477, 478, 44, 44, 44, + 479, 480, 481, 479, 44, 44, 44, 44, 44, 44, 482, 482, 482, 482, 482, 483, + 44, 44, 44, 44, 484, 484, 484, 485, 486, 486, 486, 486, 486, 486, 486, 486, + 486, 487, 44, 44, 44, 44, 44, 44, 486, 486, 486, 486, 486, 486, 488, 489, + 486, 486, 486, 486, 490, 44, 44, 44, 491, 491, 491, 491, 491, 491, 491, 491, + 491, 491, 492, 44, 44, 44, 44, 44, 493, 493, 493, 493, 493, 493, 493, 493, + 493, 493, 493, 493, 494, 44, 44, 44, 281, 281, 281, 281, 281, 281, 281, 281, + 281, 281, 281, 495, 496, 497, 498, 44, 44, 44, 44, 44, 44, 499, 500, 501, + 502, 502, 502, 502, 503, 504, 505, 506, 502, 44, 44, 44, 44, 44, 44, 44, + 507, 507, 507, 507, 508, 507, 507, 509, 510, 507, 44, 44, 44, 44, 44, 44, + 511, 44, 44, 44, 44, 44, 44, 44, 512, 512, 512, 512, 512, 512, 513, 514, + 515, 516, 271, 44, 44, 44, 44, 44, 0, 0, 0, 0, 0, 0, 0, 517, + 0, 0, 518, 0, 0, 0, 519, 520, 521, 0, 522, 0, 0, 0, 523, 44, + 11, 11, 11, 11, 524, 44, 44, 44, 0, 0, 0, 0, 0, 233, 0, 239, + 0, 0, 0, 0, 0, 223, 0, 0, 0, 525, 526, 527, 528, 0, 0, 0, + 
529, 530, 0, 531, 532, 533, 0, 0, 0, 0, 236, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 534, 0, 0, 0, 535, 535, 535, 535, 535, 535, 535, 535, + 536, 537, 538, 44, 44, 44, 44, 44, 539, 539, 539, 539, 539, 539, 539, 539, + 539, 539, 539, 539, 540, 541, 44, 44, 542, 27, 543, 544, 545, 546, 547, 548, + 549, 550, 551, 550, 44, 44, 44, 330, 0, 0, 255, 0, 0, 0, 0, 0, + 0, 271, 225, 340, 340, 340, 0, 517, 552, 0, 225, 0, 0, 0, 255, 0, + 0, 232, 44, 44, 44, 44, 553, 0, 554, 0, 0, 232, 523, 239, 44, 44, + 0, 0, 0, 0, 0, 0, 0, 555, 0, 0, 528, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 556, 552, 271, 0, 0, 0, 0, 0, 0, 0, 271, + 0, 0, 0, 0, 0, 557, 44, 44, 255, 0, 0, 0, 558, 290, 0, 0, + 558, 0, 559, 44, 44, 44, 44, 44, 44, 523, 44, 44, 44, 44, 44, 44, + 557, 44, 44, 44, 556, 44, 44, 44, 251, 251, 251, 251, 251, 560, 44, 44, + 251, 251, 251, 561, 251, 251, 251, 251, 251, 324, 251, 251, 251, 251, 251, 251, + 251, 251, 562, 44, 44, 44, 44, 44, 251, 324, 44, 44, 44, 44, 44, 44, + 563, 44, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 44, +}; + +static RE_UINT16 re_script_stage_4[] = { + 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 3, 0, 0, 0, 4, 0, + 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 5, 0, 2, 5, 6, 0, + 7, 7, 7, 7, 8, 9, 10, 11, 12, 13, 14, 15, 8, 8, 8, 8, + 16, 8, 8, 8, 17, 18, 18, 18, 19, 19, 19, 19, 19, 20, 19, 19, + 21, 22, 22, 22, 22, 22, 22, 22, 22, 23, 21, 22, 22, 22, 24, 21, + 25, 26, 26, 26, 26, 26, 26, 26, 26, 26, 12, 12, 26, 26, 27, 12, + 26, 28, 12, 12, 29, 30, 29, 31, 29, 29, 32, 33, 29, 29, 29, 29, + 31, 29, 34, 7, 7, 35, 29, 29, 36, 29, 29, 29, 29, 29, 29, 30, + 37, 37, 37, 38, 37, 37, 37, 37, 37, 37, 39, 40, 41, 41, 41, 41, + 42, 12, 12, 12, 43, 43, 43, 43, 43, 43, 44, 12, 45, 45, 45, 45, + 45, 45, 45, 46, 45, 45, 45, 47, 48, 48, 48, 48, 48, 48, 48, 49, + 12, 12, 12, 12, 29, 50, 12, 12, 51, 29, 29, 29, 52, 52, 52, 52, + 53, 52, 52, 52, 52, 54, 52, 52, 55, 56, 55, 57, 57, 55, 55, 55, + 55, 55, 58, 55, 59, 60, 61, 55, 55, 57, 57, 62, 12, 63, 12, 64, + 55, 60, 55, 55, 55, 55, 55, 12, 65, 65, 
66, 67, 68, 69, 69, 69, + 69, 69, 70, 69, 70, 71, 72, 70, 66, 67, 68, 72, 73, 12, 65, 74, + 12, 75, 69, 69, 69, 72, 12, 12, 76, 76, 77, 78, 78, 77, 77, 77, + 77, 77, 79, 77, 79, 76, 80, 77, 77, 78, 78, 80, 81, 12, 12, 12, + 77, 82, 77, 77, 80, 12, 83, 12, 84, 84, 85, 86, 86, 85, 85, 85, + 85, 85, 87, 85, 87, 84, 88, 85, 85, 86, 86, 88, 12, 89, 12, 90, + 85, 89, 85, 85, 85, 85, 12, 12, 91, 92, 93, 91, 94, 95, 96, 94, + 97, 98, 93, 91, 99, 99, 95, 91, 93, 91, 94, 95, 98, 97, 12, 12, + 12, 91, 99, 99, 99, 99, 93, 12, 100, 101, 100, 102, 102, 100, 100, 100, + 100, 100, 102, 100, 100, 100, 103, 101, 100, 102, 102, 103, 12, 104, 105, 12, + 100, 106, 100, 100, 12, 12, 100, 100, 107, 107, 108, 109, 109, 108, 108, 108, + 108, 108, 109, 108, 108, 107, 110, 108, 108, 109, 109, 110, 12, 111, 12, 112, + 108, 113, 108, 108, 111, 12, 12, 12, 114, 114, 115, 116, 116, 115, 115, 115, + 115, 115, 115, 115, 115, 115, 117, 114, 115, 116, 116, 117, 12, 118, 12, 118, + 115, 119, 115, 115, 115, 120, 114, 115, 121, 122, 123, 123, 123, 124, 121, 123, + 123, 123, 123, 123, 125, 123, 123, 126, 123, 124, 127, 128, 123, 129, 123, 123, + 12, 121, 123, 123, 121, 130, 12, 12, 131, 132, 132, 132, 132, 132, 132, 132, + 132, 132, 133, 134, 132, 132, 132, 12, 135, 136, 137, 138, 12, 139, 140, 139, + 140, 141, 142, 140, 139, 139, 143, 144, 139, 137, 139, 144, 139, 139, 144, 139, + 145, 145, 145, 145, 145, 145, 146, 145, 145, 145, 145, 147, 146, 145, 145, 145, + 145, 145, 145, 148, 145, 149, 150, 12, 151, 151, 151, 151, 152, 152, 152, 152, + 152, 153, 12, 154, 152, 152, 155, 152, 156, 156, 156, 156, 157, 157, 157, 157, + 157, 157, 158, 159, 157, 160, 158, 159, 158, 159, 157, 160, 158, 159, 157, 157, + 157, 160, 157, 157, 157, 157, 160, 161, 157, 157, 157, 162, 157, 157, 159, 12, + 163, 163, 163, 163, 163, 164, 163, 164, 165, 165, 165, 165, 166, 166, 166, 166, + 166, 166, 166, 167, 168, 168, 168, 168, 168, 168, 169, 170, 168, 168, 171, 12, + 172, 172, 172, 173, 172, 174, 12, 12, 175, 175, 175, 175, 175, 
176, 12, 12, + 177, 177, 177, 177, 177, 12, 12, 12, 178, 178, 178, 179, 179, 12, 12, 12, + 180, 180, 180, 180, 180, 180, 180, 181, 180, 180, 181, 12, 182, 183, 184, 185, + 184, 184, 186, 12, 184, 184, 184, 184, 184, 184, 12, 12, 184, 184, 185, 12, + 165, 187, 12, 12, 188, 188, 188, 188, 188, 188, 188, 189, 188, 188, 188, 12, + 190, 188, 188, 188, 191, 191, 191, 191, 191, 191, 191, 192, 191, 193, 12, 12, + 194, 194, 194, 194, 194, 194, 194, 12, 194, 194, 195, 12, 194, 194, 196, 197, + 198, 198, 198, 198, 198, 198, 198, 199, 200, 200, 200, 200, 200, 200, 200, 201, + 200, 200, 200, 202, 200, 200, 203, 12, 200, 200, 200, 203, 7, 7, 7, 204, + 205, 205, 205, 205, 205, 205, 205, 12, 205, 205, 205, 206, 207, 207, 207, 207, + 208, 208, 208, 208, 208, 12, 12, 208, 209, 209, 209, 209, 209, 209, 210, 209, + 209, 209, 211, 212, 213, 213, 213, 213, 207, 207, 12, 12, 214, 7, 7, 7, + 215, 7, 216, 217, 0, 218, 219, 12, 2, 220, 221, 2, 2, 2, 2, 222, + 223, 220, 224, 2, 2, 2, 225, 2, 2, 2, 2, 226, 7, 219, 12, 7, + 8, 227, 8, 227, 8, 8, 228, 228, 8, 8, 8, 227, 8, 15, 8, 8, + 8, 10, 8, 229, 10, 15, 8, 14, 0, 0, 0, 230, 0, 231, 0, 0, + 232, 0, 0, 233, 0, 0, 0, 234, 2, 2, 2, 235, 236, 12, 12, 12, + 0, 237, 238, 0, 4, 0, 0, 0, 0, 0, 0, 4, 2, 2, 5, 12, + 0, 0, 234, 12, 0, 234, 12, 12, 239, 239, 239, 239, 0, 240, 0, 0, + 0, 241, 0, 0, 0, 0, 241, 242, 0, 0, 231, 0, 241, 12, 12, 12, + 12, 12, 12, 0, 243, 243, 243, 243, 243, 243, 243, 244, 18, 18, 18, 18, + 18, 12, 245, 18, 246, 246, 246, 246, 246, 246, 12, 247, 248, 12, 12, 247, + 157, 160, 12, 12, 157, 160, 157, 160, 234, 12, 12, 12, 249, 249, 249, 249, + 249, 249, 250, 249, 249, 12, 12, 12, 249, 251, 12, 12, 0, 0, 0, 12, + 0, 252, 0, 0, 253, 249, 254, 255, 0, 0, 249, 0, 256, 257, 257, 257, + 257, 257, 257, 257, 257, 258, 259, 260, 261, 262, 262, 262, 262, 262, 262, 262, + 262, 262, 263, 261, 12, 264, 265, 265, 265, 265, 265, 265, 265, 265, 265, 266, + 267, 156, 156, 156, 156, 156, 156, 268, 265, 265, 269, 12, 0, 12, 12, 12, + 156, 156, 156, 
270, 262, 262, 262, 271, 262, 262, 0, 0, 272, 272, 272, 272, + 272, 272, 272, 273, 272, 274, 12, 12, 275, 275, 275, 275, 276, 276, 276, 276, + 276, 276, 276, 12, 277, 277, 277, 277, 277, 277, 12, 12, 238, 2, 2, 2, + 2, 2, 233, 2, 2, 2, 2, 278, 2, 2, 12, 12, 12, 279, 2, 2, + 280, 280, 280, 280, 280, 280, 280, 12, 0, 0, 241, 12, 281, 281, 281, 281, + 281, 281, 12, 12, 282, 282, 282, 282, 282, 283, 12, 284, 282, 282, 285, 12, + 52, 52, 52, 286, 287, 287, 287, 287, 287, 287, 287, 288, 289, 289, 289, 289, + 289, 12, 12, 290, 156, 156, 156, 291, 292, 292, 292, 292, 292, 292, 292, 293, + 292, 292, 294, 295, 151, 151, 151, 296, 297, 297, 297, 297, 297, 298, 12, 12, + 297, 297, 297, 299, 297, 297, 299, 297, 300, 300, 300, 300, 301, 12, 12, 12, + 12, 12, 302, 300, 303, 303, 303, 303, 303, 304, 12, 12, 161, 160, 161, 160, + 161, 160, 12, 12, 2, 2, 3, 2, 2, 305, 12, 12, 303, 303, 303, 306, + 303, 303, 306, 12, 156, 12, 12, 12, 156, 268, 307, 156, 156, 156, 156, 12, + 249, 249, 249, 251, 249, 249, 251, 12, 2, 308, 12, 12, 309, 22, 12, 25, + 26, 27, 26, 310, 311, 312, 26, 26, 313, 12, 12, 12, 29, 29, 29, 314, + 315, 29, 29, 29, 29, 29, 12, 12, 29, 29, 29, 313, 7, 7, 7, 316, + 234, 0, 0, 0, 0, 234, 0, 12, 29, 317, 29, 29, 29, 29, 29, 318, + 242, 0, 0, 0, 0, 319, 262, 262, 262, 262, 262, 320, 321, 156, 321, 156, + 321, 156, 321, 291, 0, 234, 0, 234, 12, 12, 242, 241, 322, 322, 322, 323, + 322, 322, 322, 322, 322, 324, 322, 322, 322, 322, 324, 325, 322, 322, 322, 326, + 322, 322, 324, 12, 234, 134, 0, 0, 0, 134, 0, 0, 8, 8, 8, 327, + 327, 12, 12, 12, 0, 0, 0, 328, 329, 329, 329, 329, 329, 329, 329, 330, + 331, 331, 331, 331, 332, 12, 12, 12, 216, 0, 0, 0, 333, 333, 333, 333, + 333, 12, 12, 12, 334, 334, 334, 334, 334, 334, 335, 12, 336, 336, 336, 336, + 336, 336, 337, 12, 338, 338, 338, 338, 338, 338, 338, 339, 340, 340, 340, 340, + 340, 12, 340, 340, 340, 341, 12, 12, 342, 342, 342, 342, 343, 343, 343, 343, + 344, 344, 344, 344, 344, 344, 344, 345, 344, 344, 345, 12, 346, 346, 
346, 346, + 346, 346, 12, 12, 347, 347, 347, 347, 347, 12, 12, 348, 349, 349, 349, 349, + 349, 350, 12, 12, 349, 351, 12, 12, 349, 349, 12, 12, 352, 353, 354, 352, + 352, 352, 352, 352, 352, 355, 356, 357, 358, 358, 358, 358, 358, 359, 358, 358, + 360, 360, 360, 360, 361, 361, 361, 361, 361, 361, 361, 362, 12, 363, 361, 361, + 364, 364, 364, 364, 365, 366, 367, 364, 368, 368, 368, 368, 368, 368, 368, 369, + 370, 370, 370, 370, 370, 370, 371, 372, 373, 373, 373, 373, 374, 374, 374, 374, + 374, 374, 12, 374, 375, 374, 374, 374, 376, 377, 12, 376, 376, 378, 378, 376, + 376, 376, 376, 376, 376, 12, 379, 380, 376, 376, 12, 12, 376, 376, 381, 12, + 382, 382, 382, 382, 383, 383, 383, 383, 384, 384, 384, 384, 384, 385, 386, 384, + 384, 385, 12, 12, 387, 387, 387, 387, 387, 388, 389, 387, 390, 390, 390, 390, + 390, 391, 390, 390, 392, 392, 392, 392, 393, 12, 392, 392, 394, 394, 394, 394, + 395, 12, 396, 397, 12, 12, 396, 394, 398, 398, 398, 398, 398, 398, 399, 12, + 400, 400, 400, 400, 401, 12, 12, 12, 401, 12, 402, 400, 29, 29, 29, 403, + 404, 404, 404, 404, 404, 404, 404, 405, 406, 404, 404, 404, 12, 12, 12, 407, + 408, 408, 408, 408, 409, 12, 12, 12, 410, 410, 410, 410, 410, 410, 411, 12, + 410, 410, 412, 12, 413, 413, 413, 413, 413, 414, 413, 413, 413, 12, 12, 12, + 415, 415, 415, 415, 415, 416, 12, 12, 417, 417, 417, 417, 417, 417, 417, 418, + 122, 123, 123, 123, 123, 130, 12, 12, 419, 419, 419, 419, 420, 419, 419, 419, + 419, 419, 419, 421, 422, 423, 424, 425, 422, 422, 422, 425, 422, 422, 426, 12, + 427, 427, 427, 427, 427, 427, 428, 12, 427, 427, 429, 12, 430, 431, 430, 432, + 432, 430, 430, 430, 430, 430, 433, 430, 433, 431, 434, 430, 430, 432, 432, 434, + 435, 436, 12, 431, 430, 437, 430, 435, 430, 435, 12, 12, 438, 438, 438, 438, + 438, 438, 12, 12, 438, 438, 439, 12, 440, 440, 440, 440, 440, 441, 440, 440, + 440, 440, 440, 441, 442, 442, 442, 442, 442, 443, 12, 12, 442, 442, 444, 12, + 445, 445, 445, 445, 445, 445, 12, 12, 445, 445, 446, 12, 447, 447, 447, 447, 
+ 447, 447, 448, 449, 447, 447, 447, 12, 450, 450, 450, 450, 451, 12, 12, 452, + 453, 453, 453, 453, 453, 453, 454, 12, 455, 455, 455, 455, 455, 455, 456, 12, + 455, 455, 455, 457, 455, 458, 12, 12, 455, 12, 12, 12, 459, 459, 459, 459, + 459, 459, 459, 460, 461, 461, 461, 461, 461, 462, 12, 12, 277, 277, 463, 12, + 464, 464, 464, 464, 464, 464, 464, 465, 464, 464, 466, 467, 468, 468, 468, 468, + 468, 468, 468, 469, 468, 469, 12, 12, 470, 470, 470, 470, 470, 471, 12, 12, + 470, 470, 472, 470, 472, 470, 470, 470, 470, 470, 12, 473, 474, 474, 474, 474, + 474, 475, 12, 12, 474, 474, 474, 476, 12, 12, 12, 477, 478, 12, 12, 12, + 479, 479, 479, 479, 479, 479, 480, 12, 479, 479, 479, 481, 479, 479, 481, 12, + 479, 479, 482, 479, 0, 241, 12, 12, 0, 234, 242, 0, 0, 483, 230, 0, + 0, 0, 483, 7, 214, 484, 7, 0, 0, 0, 485, 230, 0, 0, 486, 12, + 8, 227, 12, 12, 0, 0, 0, 231, 487, 488, 242, 231, 0, 0, 489, 242, + 0, 242, 0, 0, 0, 489, 234, 242, 0, 231, 0, 231, 0, 0, 489, 234, + 0, 490, 240, 0, 231, 0, 0, 0, 0, 0, 0, 240, 491, 491, 491, 491, + 491, 491, 491, 12, 12, 12, 492, 491, 493, 491, 491, 491, 494, 494, 494, 494, + 494, 495, 494, 494, 494, 496, 12, 12, 29, 497, 29, 29, 498, 499, 497, 29, + 403, 29, 500, 12, 501, 51, 500, 497, 498, 499, 500, 500, 498, 499, 403, 29, + 403, 29, 497, 502, 29, 29, 503, 29, 29, 29, 29, 12, 497, 497, 503, 29, + 0, 0, 0, 486, 12, 240, 0, 0, 504, 12, 12, 12, 0, 0, 489, 0, + 486, 12, 12, 12, 0, 486, 12, 12, 0, 0, 12, 12, 0, 0, 0, 241, + 249, 505, 12, 12, 249, 506, 12, 12, 251, 12, 12, 12, 507, 12, 12, 12, +}; + +static RE_UINT8 re_script_stage_5[] = { + 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, + 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 35, 35, 41, 41, 41, 41, + 3, 3, 3, 3, 1, 3, 3, 3, 0, 0, 3, 3, 3, 3, 1, 3, + 0, 0, 0, 0, 3, 1, 3, 1, 3, 3, 3, 0, 3, 0, 3, 3, + 3, 3, 0, 3, 3, 3, 55, 55, 55, 55, 55, 55, 4, 4, 4, 4, + 4, 41, 41, 4, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, + 0, 1, 5, 0, 0, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 0, + 6, 0, 0, 0, 7, 7, 7, 7, 7, 1, 7, 7, 1, 
7, 7, 7, + 7, 7, 7, 1, 1, 0, 7, 1, 7, 7, 7, 41, 41, 41, 7, 7, + 41, 7, 7, 7, 8, 8, 8, 8, 8, 8, 0, 8, 8, 8, 8, 0, + 0, 8, 8, 8, 9, 9, 9, 9, 9, 9, 0, 0, 66, 66, 66, 66, + 66, 66, 66, 0, 82, 82, 82, 82, 82, 82, 0, 0, 82, 82, 82, 0, + 95, 95, 95, 95, 0, 0, 95, 0, 7, 0, 0, 0, 0, 0, 0, 7, + 10, 10, 10, 10, 10, 41, 41, 10, 1, 1, 10, 10, 11, 11, 11, 11, + 0, 11, 11, 11, 11, 0, 0, 11, 11, 0, 11, 11, 11, 0, 11, 0, + 0, 0, 11, 11, 11, 11, 0, 0, 11, 11, 11, 0, 0, 0, 0, 11, + 11, 11, 0, 11, 0, 12, 12, 12, 12, 12, 12, 0, 0, 0, 0, 12, + 12, 0, 0, 12, 12, 12, 12, 12, 12, 0, 12, 12, 0, 12, 12, 0, + 12, 12, 0, 0, 0, 12, 0, 0, 12, 0, 12, 0, 0, 0, 12, 12, + 0, 13, 13, 13, 13, 13, 13, 13, 13, 13, 0, 13, 13, 0, 13, 13, + 13, 13, 0, 0, 13, 0, 0, 0, 0, 0, 13, 13, 0, 13, 0, 0, + 0, 14, 14, 14, 14, 14, 14, 14, 14, 0, 0, 14, 14, 0, 14, 14, + 14, 14, 0, 0, 0, 0, 14, 14, 14, 14, 0, 14, 0, 0, 15, 15, + 0, 15, 15, 15, 15, 15, 15, 0, 15, 0, 15, 15, 15, 15, 0, 0, + 0, 15, 15, 0, 0, 0, 0, 15, 15, 0, 0, 0, 15, 15, 15, 15, + 16, 16, 16, 16, 0, 16, 16, 16, 16, 0, 16, 16, 16, 16, 0, 0, + 0, 16, 16, 0, 16, 16, 16, 0, 0, 0, 16, 16, 0, 17, 17, 17, + 17, 17, 17, 17, 17, 0, 17, 17, 17, 17, 0, 0, 0, 17, 17, 0, + 0, 0, 17, 0, 0, 0, 17, 17, 0, 18, 18, 18, 18, 18, 18, 18, + 18, 0, 18, 18, 18, 18, 18, 0, 0, 0, 0, 18, 0, 0, 18, 18, + 18, 18, 0, 0, 0, 0, 19, 19, 0, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 0, 19, 19, 0, 19, 0, 19, 0, 0, 0, 0, 19, 0, + 0, 0, 0, 19, 19, 0, 19, 0, 19, 0, 0, 0, 0, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 0, 0, 0, 0, 1, 0, 21, 21, 0, + 21, 0, 0, 21, 21, 0, 21, 0, 0, 21, 0, 0, 21, 21, 21, 21, + 0, 21, 21, 21, 0, 21, 0, 21, 0, 0, 21, 21, 21, 21, 0, 21, + 21, 21, 0, 0, 22, 22, 22, 22, 0, 22, 22, 22, 22, 0, 0, 0, + 22, 0, 22, 22, 22, 1, 1, 1, 1, 22, 22, 0, 23, 23, 23, 23, + 24, 24, 24, 24, 24, 24, 0, 24, 0, 24, 0, 0, 24, 24, 24, 1, + 25, 25, 25, 25, 26, 26, 26, 26, 26, 0, 26, 26, 26, 26, 0, 0, + 26, 26, 26, 0, 0, 26, 26, 26, 26, 0, 0, 0, 27, 27, 27, 27, + 27, 27, 0, 0, 28, 28, 28, 28, 29, 
29, 29, 29, 29, 0, 0, 0, + 30, 30, 30, 30, 30, 30, 30, 1, 1, 1, 30, 30, 30, 0, 0, 0, + 42, 42, 42, 42, 42, 0, 42, 42, 42, 0, 0, 0, 43, 43, 43, 43, + 43, 1, 1, 0, 44, 44, 44, 44, 45, 45, 45, 45, 45, 0, 45, 45, + 31, 31, 31, 31, 31, 31, 0, 0, 32, 32, 1, 1, 32, 1, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 0, 0, 28, 28, 0, 0, + 46, 46, 46, 46, 46, 46, 46, 0, 46, 0, 0, 0, 47, 47, 47, 47, + 47, 47, 0, 0, 47, 0, 0, 0, 56, 56, 56, 56, 56, 56, 0, 0, + 56, 56, 56, 0, 0, 0, 56, 56, 54, 54, 54, 54, 0, 0, 54, 54, + 78, 78, 78, 78, 78, 78, 78, 0, 78, 0, 0, 78, 78, 78, 0, 0, + 41, 41, 41, 0, 62, 62, 62, 62, 62, 0, 0, 0, 67, 67, 67, 67, + 93, 93, 93, 93, 68, 68, 68, 68, 0, 0, 0, 68, 68, 68, 0, 0, + 0, 68, 68, 68, 69, 69, 69, 69, 41, 41, 41, 1, 41, 1, 41, 41, + 41, 1, 1, 1, 1, 41, 1, 1, 41, 1, 1, 0, 41, 41, 0, 0, + 2, 2, 3, 3, 3, 3, 3, 4, 2, 3, 3, 3, 3, 3, 2, 2, + 3, 3, 3, 2, 4, 2, 2, 2, 2, 2, 2, 3, 3, 3, 0, 0, + 0, 3, 0, 3, 0, 3, 3, 3, 41, 41, 1, 1, 1, 0, 1, 1, + 1, 2, 0, 0, 1, 1, 1, 2, 1, 1, 1, 0, 2, 0, 0, 0, + 41, 0, 0, 0, 1, 1, 3, 1, 1, 1, 2, 2, 53, 53, 53, 53, + 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 57, 57, 57, 57, + 57, 57, 57, 0, 0, 55, 55, 55, 58, 58, 58, 58, 0, 0, 0, 58, + 58, 0, 0, 0, 36, 36, 36, 36, 36, 36, 0, 36, 36, 36, 0, 0, + 1, 36, 1, 36, 1, 36, 36, 36, 36, 36, 41, 41, 41, 41, 25, 25, + 0, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 0, 0, 41, 41, 1, + 1, 33, 33, 33, 1, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 1, + 0, 35, 35, 35, 35, 35, 35, 35, 35, 35, 0, 0, 0, 25, 25, 25, + 25, 25, 25, 0, 35, 35, 35, 0, 25, 25, 25, 1, 34, 34, 34, 0, + 37, 37, 37, 37, 37, 0, 0, 0, 37, 37, 37, 0, 83, 83, 83, 83, + 70, 70, 70, 70, 84, 84, 84, 84, 2, 2, 0, 0, 0, 0, 0, 2, + 59, 59, 59, 59, 65, 65, 65, 65, 71, 71, 71, 71, 71, 0, 0, 0, + 0, 0, 71, 71, 71, 71, 0, 0, 10, 10, 0, 0, 72, 72, 72, 72, + 72, 72, 1, 72, 73, 73, 73, 73, 0, 0, 0, 73, 25, 0, 0, 0, + 85, 85, 85, 85, 85, 85, 0, 1, 85, 85, 0, 0, 0, 0, 85, 85, + 23, 23, 23, 0, 77, 77, 77, 77, 77, 77, 77, 0, 77, 77, 0, 0, + 79, 79, 79, 
79, 79, 79, 79, 0, 0, 0, 0, 79, 86, 86, 86, 86, + 86, 86, 86, 0, 2, 3, 0, 0, 86, 86, 0, 0, 0, 0, 0, 25, + 2, 2, 2, 0, 0, 0, 0, 5, 6, 0, 6, 0, 6, 6, 0, 6, + 6, 0, 6, 6, 7, 7, 0, 0, 7, 7, 1, 1, 0, 0, 7, 7, + 41, 41, 4, 4, 7, 0, 7, 7, 7, 0, 0, 1, 1, 1, 34, 34, + 34, 34, 1, 1, 0, 0, 25, 25, 48, 48, 48, 48, 0, 48, 48, 48, + 48, 48, 48, 0, 48, 48, 0, 48, 48, 48, 0, 0, 3, 0, 0, 0, + 1, 41, 0, 0, 74, 74, 74, 74, 74, 0, 0, 0, 75, 75, 75, 75, + 75, 0, 0, 0, 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 0, + 120, 120, 120, 120, 120, 120, 120, 0, 49, 49, 49, 49, 49, 49, 0, 49, + 60, 60, 60, 60, 60, 60, 0, 0, 40, 40, 40, 40, 50, 50, 50, 50, + 51, 51, 51, 51, 51, 51, 0, 0, 106, 106, 106, 106, 103, 103, 103, 103, + 0, 0, 0, 103, 110, 110, 110, 110, 110, 110, 110, 0, 110, 110, 0, 0, + 52, 52, 52, 52, 52, 52, 0, 0, 52, 0, 52, 52, 52, 52, 0, 52, + 52, 0, 0, 0, 52, 0, 0, 52, 87, 87, 87, 87, 87, 87, 0, 87, + 118, 118, 118, 118, 117, 117, 117, 117, 117, 117, 117, 0, 0, 0, 0, 117, + 128, 128, 128, 128, 128, 128, 128, 0, 128, 128, 0, 0, 0, 0, 0, 128, + 64, 64, 64, 64, 0, 0, 0, 64, 76, 76, 76, 76, 76, 76, 0, 0, + 0, 0, 0, 76, 98, 98, 98, 98, 97, 97, 97, 97, 0, 0, 97, 97, + 61, 61, 61, 61, 0, 61, 61, 0, 0, 61, 61, 61, 61, 61, 61, 0, + 0, 0, 0, 61, 61, 0, 0, 0, 88, 88, 88, 88, 116, 116, 116, 116, + 112, 112, 112, 112, 112, 112, 112, 0, 0, 0, 0, 112, 80, 80, 80, 80, + 80, 80, 0, 0, 0, 80, 80, 80, 89, 89, 89, 89, 89, 89, 0, 0, + 90, 90, 90, 90, 90, 90, 90, 0, 121, 121, 121, 121, 121, 121, 0, 0, + 0, 121, 121, 121, 121, 0, 0, 0, 91, 91, 91, 91, 91, 0, 0, 0, + 130, 130, 130, 130, 130, 130, 130, 0, 0, 0, 130, 130, 7, 7, 7, 0, + 94, 94, 94, 94, 94, 94, 0, 0, 0, 0, 94, 94, 0, 0, 0, 94, + 92, 92, 92, 92, 92, 92, 0, 0, 101, 101, 101, 101, 101, 0, 0, 0, + 101, 101, 0, 0, 96, 96, 96, 96, 96, 0, 96, 96, 111, 111, 111, 111, + 111, 111, 111, 0, 100, 100, 100, 100, 100, 100, 0, 0, 109, 109, 109, 109, + 109, 109, 0, 109, 109, 109, 0, 0, 129, 129, 129, 129, 129, 129, 129, 0, + 129, 0, 129, 129, 129, 129, 0, 
129, 129, 129, 0, 0, 123, 123, 123, 123, + 123, 123, 123, 0, 123, 123, 0, 0, 107, 107, 107, 107, 0, 107, 107, 107, + 107, 0, 0, 107, 107, 0, 107, 107, 107, 107, 0, 0, 107, 0, 0, 0, + 0, 0, 0, 107, 0, 0, 107, 107, 124, 124, 124, 124, 124, 124, 0, 0, + 122, 122, 122, 122, 122, 122, 0, 0, 114, 114, 114, 114, 114, 0, 0, 0, + 114, 114, 0, 0, 102, 102, 102, 102, 102, 102, 0, 0, 126, 126, 126, 126, + 126, 126, 0, 0, 0, 126, 126, 126, 125, 125, 125, 125, 125, 125, 125, 0, + 0, 0, 0, 125, 119, 119, 119, 119, 119, 0, 0, 0, 63, 63, 63, 63, + 63, 63, 0, 0, 63, 63, 63, 0, 63, 0, 0, 0, 81, 81, 81, 81, + 81, 81, 81, 0, 127, 127, 127, 127, 127, 127, 127, 0, 84, 0, 0, 0, + 115, 115, 115, 115, 115, 115, 115, 0, 115, 115, 0, 0, 0, 0, 115, 115, + 104, 104, 104, 104, 104, 104, 0, 0, 108, 108, 108, 108, 108, 108, 0, 0, + 108, 108, 0, 108, 0, 108, 108, 108, 99, 99, 99, 99, 99, 0, 0, 0, + 99, 99, 99, 0, 0, 0, 0, 99, 34, 33, 0, 0, 105, 105, 105, 105, + 105, 105, 105, 0, 105, 0, 0, 0, 105, 105, 0, 0, 1, 1, 1, 41, + 1, 41, 41, 41, 1, 1, 41, 41, 1, 0, 0, 0, 0, 0, 1, 0, + 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 131, 131, 131, 131, + 0, 0, 0, 131, 0, 131, 131, 131, 113, 113, 113, 113, 113, 0, 0, 113, + 113, 113, 113, 0, 0, 7, 7, 7, 0, 7, 7, 0, 7, 0, 0, 7, + 0, 7, 0, 7, 0, 0, 7, 0, 7, 0, 7, 0, 7, 7, 0, 7, + 33, 1, 1, 0, 36, 36, 36, 0, 36, 0, 0, 0, 0, 1, 0, 0, +}; + +/* Script: 10928 bytes. */ + +RE_UINT32 re_get_script(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 11; + code = ch ^ (f << 11); + pos = (RE_UINT32)re_script_stage_1[f] << 4; + f = code >> 7; + code ^= f << 7; + pos = (RE_UINT32)re_script_stage_2[pos + f] << 3; + f = code >> 4; + code ^= f << 4; + pos = (RE_UINT32)re_script_stage_3[pos + f] << 2; + f = code >> 2; + code ^= f << 2; + pos = (RE_UINT32)re_script_stage_4[pos + f] << 2; + value = re_script_stage_5[pos + code]; + + return value; +} + +/* Word_Break. 
*/ + +static RE_UINT8 re_word_break_stage_1[] = { + 0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 5, 6, 6, 7, 4, 8, + 9, 10, 11, 12, 13, 4, 14, 4, 4, 4, 4, 15, 4, 16, 17, 18, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 19, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, +}; + +static RE_UINT8 re_word_break_stage_2[] = { + 0, 1, 2, 2, 2, 3, 4, 5, 2, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 2, 2, 31, 32, 33, 34, 35, 2, 2, 2, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 2, 50, 2, 2, 51, 52, + 53, 54, 55, 56, 57, 57, 57, 57, 57, 58, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, 59, 60, 61, 62, 63, 57, 57, 57, + 64, 65, 66, 67, 57, 68, 69, 57, 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 70, 2, 2, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 83, + 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, + 
57, 57, 57, 57, 57, 57, 84, 85, 2, 2, 86, 87, 88, 89, 90, 91, + 92, 93, 94, 95, 57, 96, 97, 98, 2, 99, 100, 57, 2, 2, 101, 57, + 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 57, 57, 57, 57, 57, 57, + 112, 113, 114, 115, 116, 117, 118, 57, 57, 119, 57, 120, 121, 122, 123, 57, + 57, 124, 57, 57, 57, 125, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, + 2, 2, 2, 2, 2, 2, 2, 126, 127, 2, 128, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, + 2, 2, 2, 2, 2, 2, 2, 2, 129, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, 2, 2, 2, 2, 130, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, + 2, 2, 2, 2, 131, 132, 133, 134, 57, 57, 57, 57, 57, 57, 135, 136, + 137, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, 138, 139, 57, 57, 57, 57, 57, 57, + 57, 57, 140, 141, 142, 57, 57, 57, 143, 144, 145, 2, 2, 146, 147, 148, + 57, 57, 57, 57, 149, 150, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, + 2, 151, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 152, 153, 57, 57, + 57, 57, 154, 155, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, + 156, 57, 157, 158, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, +}; + +static RE_UINT8 re_word_break_stage_3[] = { + 0, 1, 0, 0, 2, 3, 4, 5, 6, 7, 7, 8, 6, 7, 7, 9, + 10, 0, 0, 0, 0, 11, 12, 13, 7, 7, 14, 7, 7, 7, 14, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 15, 7, 16, 0, 17, 18, 0, 0, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 20, 21, + 22, 23, 7, 7, 24, 7, 7, 7, 7, 7, 7, 7, 7, 7, 25, 7, + 26, 27, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 6, 
7, 7, 7, 14, 28, 6, 7, 7, 7, + 7, 29, 30, 19, 19, 19, 19, 31, 32, 0, 33, 33, 33, 34, 35, 0, + 36, 37, 19, 38, 7, 7, 7, 7, 7, 39, 19, 19, 4, 40, 41, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 42, 43, 44, 45, 4, 46, + 0, 47, 48, 7, 7, 7, 19, 19, 19, 49, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 50, 19, 51, 0, 4, 52, 7, 7, 7, 39, 53, 54, + 7, 7, 50, 55, 56, 57, 0, 0, 7, 7, 7, 58, 0, 0, 0, 0, + 0, 0, 0, 0, 7, 7, 17, 0, 0, 0, 0, 0, 59, 19, 19, 19, + 60, 7, 7, 7, 7, 7, 7, 61, 19, 19, 62, 7, 63, 4, 6, 7, + 64, 65, 66, 7, 7, 67, 68, 69, 70, 71, 72, 73, 63, 4, 74, 0, + 75, 76, 66, 7, 7, 67, 77, 78, 79, 80, 81, 82, 83, 4, 84, 0, + 75, 25, 24, 7, 7, 67, 85, 69, 31, 86, 87, 0, 63, 4, 0, 28, + 75, 65, 66, 7, 7, 67, 85, 69, 70, 80, 88, 73, 63, 4, 28, 0, + 89, 90, 91, 92, 93, 90, 7, 94, 95, 96, 97, 0, 83, 4, 0, 0, + 98, 20, 67, 7, 7, 67, 7, 99, 100, 96, 101, 9, 63, 4, 0, 0, + 75, 20, 67, 7, 7, 67, 102, 69, 100, 96, 101, 103, 63, 4, 104, 0, + 75, 20, 67, 7, 7, 7, 7, 105, 100, 106, 72, 107, 63, 4, 0, 108, + 109, 7, 14, 108, 7, 7, 24, 110, 14, 111, 112, 19, 83, 4, 113, 0, + 0, 0, 0, 0, 0, 0, 114, 115, 72, 116, 4, 117, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 114, 118, 0, 119, 4, 117, 0, 0, 0, 0, + 87, 0, 0, 120, 4, 117, 121, 122, 7, 6, 7, 7, 7, 17, 30, 19, + 100, 123, 19, 30, 19, 19, 19, 124, 125, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 59, 19, 116, 4, 117, 88, 126, 127, 119, 128, 0, + 129, 31, 4, 130, 7, 7, 7, 7, 25, 131, 7, 7, 7, 7, 7, 132, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 91, 14, 91, 7, 7, 7, 7, + 7, 91, 7, 7, 7, 7, 91, 14, 91, 7, 14, 7, 7, 7, 7, 7, + 7, 7, 91, 7, 7, 7, 7, 7, 7, 7, 7, 133, 0, 0, 0, 0, + 7, 7, 0, 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 134, 134, + 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 65, 7, 7, + 6, 7, 7, 9, 7, 7, 7, 7, 7, 7, 7, 7, 7, 90, 7, 87, + 7, 20, 135, 0, 7, 7, 135, 0, 7, 7, 136, 0, 7, 20, 137, 0, + 0, 0, 0, 0, 0, 0, 138, 19, 19, 19, 139, 140, 4, 117, 0, 0, + 0, 141, 4, 117, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, + 7, 7, 7, 7, 7, 142, 7, 7, 7, 7, 7, 
7, 7, 7, 134, 0, + 7, 7, 7, 14, 19, 139, 19, 139, 83, 4, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 117, 0, 0, 0, 0, + 7, 7, 143, 139, 0, 0, 0, 0, 0, 0, 144, 116, 19, 19, 19, 70, + 4, 117, 4, 117, 0, 0, 19, 116, 0, 0, 0, 0, 0, 0, 0, 0, + 145, 7, 7, 7, 7, 7, 146, 19, 145, 147, 4, 117, 0, 59, 139, 0, + 148, 7, 7, 7, 62, 149, 4, 52, 7, 7, 7, 7, 50, 19, 139, 0, + 7, 7, 7, 7, 146, 19, 19, 0, 4, 150, 4, 52, 7, 7, 7, 134, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 151, 19, 19, 152, 153, 120, + 7, 7, 7, 7, 7, 7, 7, 7, 19, 19, 19, 19, 19, 19, 119, 138, + 7, 7, 134, 134, 7, 7, 7, 7, 134, 134, 7, 154, 7, 7, 7, 134, + 7, 7, 7, 7, 7, 7, 20, 155, 156, 17, 157, 147, 7, 17, 156, 17, + 0, 158, 0, 159, 160, 161, 0, 162, 163, 0, 164, 0, 165, 166, 28, 107, + 0, 0, 7, 17, 0, 0, 0, 0, 0, 0, 19, 19, 19, 19, 167, 0, + 168, 108, 110, 169, 18, 170, 7, 171, 172, 173, 0, 0, 7, 7, 7, 7, + 7, 87, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 174, 7, 7, 7, 7, 7, 7, 74, 0, 0, + 7, 7, 7, 7, 7, 14, 7, 7, 7, 7, 7, 14, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 17, 175, 176, 0, + 7, 7, 7, 7, 25, 131, 7, 7, 7, 7, 7, 7, 7, 107, 0, 72, + 7, 7, 14, 0, 14, 14, 14, 14, 14, 14, 14, 14, 19, 19, 19, 19, + 0, 0, 0, 0, 0, 107, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 131, 0, 0, 0, 0, 129, 177, 93, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 178, 179, 179, 179, 179, 179, 179, 179, 179, 179, 179, 179, 180, + 172, 7, 7, 7, 7, 134, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 14, 0, 0, 7, 7, 7, 9, 0, 0, 0, 0, 0, 0, 179, 179, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 179, 179, 179, 179, 179, 181, + 179, 179, 179, 179, 179, 179, 179, 179, 179, 179, 179, 0, 0, 0, 0, 0, + 7, 17, 0, 0, 0, 0, 0, 0, 0, 0, 7, 7, 7, 7, 7, 134, + 7, 17, 7, 7, 4, 182, 0, 0, 7, 7, 7, 7, 7, 143, 151, 183, + 7, 7, 7, 50, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 120, 0, + 0, 0, 107, 7, 108, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 66, 7, 7, 7, 134, 7, 0, 0, 0, 0, 0, 0, 0, 107, 7, + 184, 185, 7, 7, 39, 0, 0, 0, 7, 7, 7, 7, 7, 
7, 147, 0, + 27, 7, 7, 7, 7, 7, 146, 19, 124, 0, 4, 117, 19, 19, 27, 186, + 4, 52, 7, 7, 50, 119, 7, 7, 143, 19, 139, 0, 7, 7, 7, 17, + 60, 7, 7, 7, 7, 7, 39, 19, 167, 107, 4, 117, 140, 0, 4, 117, + 7, 7, 7, 7, 7, 62, 116, 0, 185, 187, 4, 117, 0, 0, 0, 188, + 0, 0, 0, 0, 0, 0, 127, 189, 81, 0, 0, 0, 7, 39, 190, 0, + 191, 191, 191, 0, 14, 14, 7, 7, 7, 7, 7, 132, 134, 0, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 39, 192, 4, 117, + 7, 7, 7, 7, 147, 0, 7, 7, 14, 193, 7, 7, 7, 7, 7, 147, + 14, 0, 193, 194, 33, 195, 196, 197, 198, 33, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 74, 0, 0, 0, 193, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 134, 0, 0, 7, 7, 7, 7, 7, 7, + 7, 7, 108, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, 7, 147, + 19, 19, 199, 0, 19, 19, 200, 0, 0, 201, 202, 0, 0, 0, 20, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 203, + 204, 3, 0, 205, 6, 7, 7, 8, 6, 7, 7, 9, 206, 179, 179, 179, + 179, 179, 179, 207, 7, 7, 7, 14, 108, 108, 108, 208, 0, 0, 0, 209, + 7, 102, 7, 7, 14, 7, 7, 210, 7, 134, 7, 134, 0, 0, 0, 0, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 9, + 0, 0, 0, 0, 0, 0, 0, 0, 7, 7, 7, 7, 7, 7, 17, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 140, + 7, 7, 7, 17, 7, 7, 7, 7, 7, 7, 87, 0, 167, 0, 0, 0, + 7, 7, 7, 7, 0, 0, 7, 7, 7, 9, 7, 7, 7, 7, 50, 115, + 7, 7, 7, 134, 7, 7, 7, 7, 147, 7, 169, 0, 0, 0, 0, 0, + 7, 7, 7, 134, 4, 117, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 7, 7, 7, 7, 7, 0, 7, 7, 7, 7, 7, 7, 147, 0, 0, 0, + 7, 7, 7, 7, 7, 7, 14, 0, 7, 7, 134, 0, 7, 0, 0, 0, + 134, 67, 7, 7, 7, 7, 25, 211, 7, 7, 134, 0, 7, 7, 14, 0, + 7, 7, 7, 14, 0, 0, 0, 0, 0, 0, 0, 0, 7, 7, 212, 0, + 7, 7, 134, 0, 7, 7, 7, 74, 0, 0, 0, 0, 0, 0, 0, 0, + 7, 7, 7, 7, 7, 7, 7, 174, 0, 0, 0, 0, 0, 0, 0, 0, + 213, 138, 102, 6, 7, 7, 147, 79, 0, 0, 0, 0, 7, 7, 7, 17, + 7, 7, 7, 17, 0, 0, 0, 0, 7, 6, 7, 7, 214, 0, 0, 0, + 7, 7, 7, 7, 7, 7, 134, 0, 7, 7, 134, 0, 7, 7, 9, 0, + 7, 7, 74, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 87, 0, 0, 0, 0, 0, 0, + 7, 7, 7, 7, 7, 7, 
9, 0, 7, 7, 7, 7, 7, 7, 9, 0, + 148, 7, 7, 7, 7, 7, 7, 19, 116, 0, 0, 0, 83, 4, 0, 72, + 148, 7, 7, 7, 7, 7, 19, 215, 0, 0, 7, 7, 7, 87, 4, 117, + 148, 7, 7, 7, 143, 19, 216, 4, 0, 0, 7, 7, 7, 7, 217, 0, + 148, 7, 7, 7, 7, 7, 39, 19, 218, 219, 4, 220, 0, 0, 0, 0, + 7, 7, 24, 7, 7, 146, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 14, 170, 7, 25, 7, 87, 7, 7, 7, 7, 7, 143, 19, 115, 4, 117, + 98, 65, 66, 7, 7, 67, 85, 69, 70, 80, 97, 172, 221, 124, 124, 0, + 7, 7, 7, 7, 7, 7, 19, 19, 222, 0, 4, 117, 0, 0, 0, 0, + 7, 7, 7, 7, 7, 143, 119, 19, 167, 0, 0, 187, 0, 0, 0, 0, + 7, 7, 7, 7, 7, 7, 19, 19, 223, 0, 4, 117, 0, 0, 0, 0, + 7, 7, 7, 7, 7, 39, 19, 0, 4, 117, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 144, 19, 139, 4, 117, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 7, 7, 7, 7, 7, 7, 7, 7, 4, 117, 0, 107, + 0, 0, 0, 0, 0, 0, 0, 0, 7, 7, 7, 7, 7, 7, 7, 87, + 7, 7, 7, 74, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 14, 0, 0, + 7, 7, 7, 7, 7, 7, 7, 7, 147, 0, 0, 0, 0, 0, 0, 0, + 7, 7, 7, 7, 7, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 7, 7, 7, 7, 7, 7, 7, 7, 14, 0, 0, 0, 0, 0, 0, 0, + 7, 7, 7, 7, 7, 7, 7, 87, 7, 7, 7, 14, 4, 117, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 7, 7, 134, 124, 0, + 7, 7, 7, 7, 7, 7, 116, 0, 147, 0, 4, 117, 193, 7, 7, 172, + 7, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 7, 7, 7, 7, 7, 7, 7, 7, 17, 0, 62, 19, 19, 19, 19, 116, + 0, 72, 148, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 224, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 9, 7, 17, + 7, 87, 7, 225, 226, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 144, 227, 228, 229, + 230, 139, 0, 0, 0, 231, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 219, 0, 0, 0, 0, 0, 0, 0, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 20, 7, 7, 7, 7, 7, + 7, 7, 7, 20, 232, 233, 7, 234, 102, 7, 7, 7, 7, 7, 7, 7, + 25, 235, 20, 20, 7, 7, 7, 236, 155, 108, 67, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 134, 7, 7, 7, 67, 7, 7, 132, 7, 7, 7, 132, + 7, 7, 20, 7, 7, 7, 20, 7, 
7, 14, 7, 7, 7, 14, 7, 7, + 7, 67, 7, 7, 7, 67, 7, 7, 132, 237, 4, 4, 4, 4, 4, 4, + 19, 19, 19, 19, 19, 19, 116, 59, 19, 19, 19, 19, 19, 124, 140, 0, + 238, 0, 0, 59, 30, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 7, 7, 7, 7, 7, 7, 7, 7, 17, 0, 116, 0, 0, 0, 0, 0, + 102, 7, 7, 7, 239, 6, 132, 240, 168, 241, 239, 154, 239, 132, 132, 82, + 7, 24, 7, 147, 242, 24, 7, 147, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 7, 7, 7, 74, 7, 7, 7, 74, 7, 7, + 7, 74, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 243, 244, 244, 244, + 245, 0, 0, 0, 166, 166, 166, 166, 166, 166, 166, 166, 166, 166, 166, 166, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 0, 0, +}; + +static RE_UINT8 re_word_break_stage_4[] = { + 0, 0, 1, 2, 3, 4, 0, 5, 6, 6, 7, 0, 8, 9, 9, 9, + 10, 11, 10, 0, 0, 12, 13, 14, 0, 15, 13, 0, 9, 10, 16, 17, + 16, 18, 9, 19, 0, 20, 21, 21, 9, 22, 17, 23, 0, 24, 10, 22, + 25, 9, 9, 25, 26, 21, 27, 9, 28, 0, 29, 0, 30, 21, 21, 31, + 32, 31, 33, 33, 34, 0, 35, 36, 37, 38, 0, 39, 40, 41, 42, 21, + 43, 44, 45, 9, 9, 46, 21, 47, 21, 48, 49, 27, 50, 51, 0, 52, + 53, 9, 40, 8, 9, 54, 55, 0, 50, 9, 21, 16, 56, 0, 57, 21, + 21, 58, 58, 59, 58, 0, 60, 21, 21, 9, 54, 61, 58, 21, 54, 62, + 58, 8, 9, 51, 51, 9, 22, 9, 20, 17, 16, 61, 21, 63, 63, 64, + 0, 60, 0, 25, 16, 0, 30, 8, 10, 65, 22, 66, 16, 49, 40, 60, + 63, 59, 67, 0, 8, 20, 0, 62, 27, 68, 22, 8, 31, 59, 19, 0, + 0, 69, 70, 8, 10, 17, 22, 16, 66, 22, 65, 19, 16, 69, 40, 69, + 49, 59, 19, 60, 21, 8, 16, 46, 21, 49, 0, 32, 9, 8, 0, 13, + 66, 0, 10, 46, 49, 64, 0, 65, 17, 9, 69, 8, 9, 28, 71, 60, + 21, 72, 69, 0, 67, 21, 40, 0, 21, 40, 73, 0, 31, 74, 21, 59, + 59, 0, 0, 75, 67, 69, 9, 58, 21, 74, 0, 71, 59, 69, 49, 63, + 30, 74, 69, 21, 76, 59, 0, 28, 10, 9, 10, 30, 9, 16, 54, 74, + 54, 0, 77, 0, 0, 21, 21, 0, 0, 67, 60, 78, 79, 0, 9, 42, + 0, 30, 21, 45, 9, 21, 9, 0, 80, 9, 21, 27, 73, 8, 40, 21, + 45, 53, 54, 81, 82, 82, 9, 20, 17, 22, 9, 17, 0, 83, 84, 0, + 0, 85, 86, 87, 
0, 11, 88, 89, 0, 88, 37, 90, 37, 37, 74, 0, + 13, 65, 8, 16, 22, 25, 16, 9, 0, 8, 16, 13, 0, 17, 65, 42, + 27, 0, 91, 92, 93, 94, 95, 95, 96, 95, 95, 96, 50, 0, 21, 97, + 98, 98, 42, 9, 65, 28, 9, 59, 60, 59, 74, 69, 17, 99, 8, 10, + 40, 59, 65, 9, 0, 100, 101, 33, 33, 34, 33, 102, 103, 101, 104, 89, + 11, 88, 0, 105, 5, 106, 9, 107, 0, 108, 109, 0, 0, 110, 95, 111, + 17, 19, 112, 0, 10, 25, 19, 51, 10, 16, 58, 32, 9, 99, 40, 14, + 21, 113, 42, 13, 45, 19, 69, 74, 114, 19, 54, 69, 21, 25, 74, 19, + 94, 0, 16, 32, 37, 0, 59, 30, 115, 37, 116, 21, 40, 30, 69, 59, + 13, 66, 8, 22, 25, 8, 10, 8, 25, 10, 9, 62, 0, 74, 66, 51, + 82, 0, 82, 8, 8, 8, 0, 117, 118, 118, 14, 0, +}; + +static RE_UINT8 re_word_break_stage_5[] = { + 0, 0, 0, 0, 0, 0, 5, 6, 6, 4, 0, 0, 0, 0, 1, 0, + 0, 0, 0, 2, 13, 0, 14, 0, 15, 15, 15, 15, 15, 15, 12, 13, + 0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 0, 0, 0, 0, 16, + 0, 6, 0, 0, 0, 0, 11, 0, 0, 9, 0, 0, 0, 11, 0, 12, + 11, 11, 0, 0, 0, 0, 11, 11, 0, 0, 0, 12, 11, 0, 0, 0, + 11, 0, 11, 0, 7, 7, 7, 7, 11, 0, 11, 11, 11, 11, 13, 11, + 0, 0, 11, 12, 11, 11, 0, 11, 11, 11, 0, 7, 7, 7, 11, 11, + 0, 11, 0, 0, 0, 13, 0, 0, 0, 7, 7, 7, 7, 7, 0, 7, + 0, 7, 7, 0, 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 11, + 12, 0, 0, 0, 9, 9, 9, 9, 9, 9, 0, 0, 13, 13, 0, 0, + 7, 7, 7, 0, 9, 0, 0, 0, 11, 11, 11, 7, 15, 15, 0, 15, + 13, 0, 11, 11, 7, 11, 11, 11, 0, 11, 7, 7, 7, 9, 0, 7, + 7, 11, 11, 7, 7, 0, 7, 7, 15, 15, 11, 11, 11, 0, 0, 11, + 0, 0, 0, 9, 11, 7, 11, 11, 11, 11, 7, 7, 7, 11, 0, 0, + 13, 0, 11, 0, 7, 7, 11, 7, 11, 7, 7, 7, 7, 7, 0, 0, + 0, 0, 0, 7, 7, 11, 7, 7, 0, 0, 15, 15, 7, 0, 0, 7, + 7, 7, 11, 0, 0, 0, 0, 11, 0, 11, 11, 0, 0, 7, 0, 0, + 11, 7, 0, 0, 0, 0, 7, 7, 0, 0, 7, 11, 0, 0, 7, 0, + 7, 0, 7, 0, 15, 15, 0, 0, 7, 0, 0, 0, 0, 7, 0, 7, + 15, 15, 7, 7, 11, 0, 7, 7, 7, 7, 9, 0, 11, 7, 11, 0, + 7, 7, 7, 11, 7, 11, 11, 0, 0, 11, 0, 11, 7, 7, 9, 9, + 14, 14, 0, 0, 14, 0, 0, 12, 6, 6, 9, 9, 9, 9, 9, 0, + 16, 0, 0, 0, 13, 0, 0, 0, 9, 0, 9, 9, 0, 10, 10, 10, + 10, 
10, 0, 0, 0, 7, 7, 10, 10, 0, 0, 0, 10, 10, 10, 10, + 10, 10, 10, 0, 7, 7, 0, 11, 11, 11, 7, 11, 11, 7, 7, 0, + 0, 3, 7, 3, 3, 0, 3, 3, 3, 0, 3, 0, 3, 3, 0, 3, + 13, 0, 0, 12, 0, 16, 16, 16, 13, 12, 0, 0, 11, 0, 0, 9, + 0, 0, 0, 14, 0, 0, 12, 13, 0, 0, 10, 10, 10, 10, 7, 7, + 0, 9, 9, 9, 7, 0, 15, 15, 15, 15, 11, 0, 7, 7, 7, 9, + 9, 9, 9, 7, 0, 0, 8, 8, 8, 8, 8, 8, +}; + +/* Word_Break: 4424 bytes. */ + +RE_UINT32 re_get_word_break(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 12; + code = ch ^ (f << 12); + pos = (RE_UINT32)re_word_break_stage_1[f] << 5; + f = code >> 7; + code ^= f << 7; + pos = (RE_UINT32)re_word_break_stage_2[pos + f] << 4; + f = code >> 3; + code ^= f << 3; + pos = (RE_UINT32)re_word_break_stage_3[pos + f] << 1; + f = code >> 2; + code ^= f << 2; + pos = (RE_UINT32)re_word_break_stage_4[pos + f] << 2; + value = re_word_break_stage_5[pos + code]; + + return value; +} + +/* Grapheme_Cluster_Break. */ + +static RE_UINT8 re_grapheme_cluster_break_stage_1[] = { + 0, 1, 2, 2, 2, 3, 4, 5, 6, 2, 2, 7, 2, 8, 9, 10, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 11, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, +}; + +static RE_UINT8 re_grapheme_cluster_break_stage_2[] = { + 0, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 1, 17, 1, 1, 1, 18, 19, 20, 21, 22, 23, 24, 1, 1, + 25, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 26, 27, 1, 1, + 28, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 29, 1, 30, 31, 32, 33, 34, 35, 36, 37, + 38, 39, 40, 34, 35, 36, 37, 38, 39, 40, 34, 35, 36, 37, 38, 39, + 40, 34, 35, 36, 
37, 38, 39, 40, 34, 35, 36, 37, 38, 39, 40, 34, + 35, 36, 37, 38, 39, 40, 34, 41, 42, 42, 42, 42, 42, 42, 42, 42, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 43, 1, 1, 44, 45, + 1, 46, 47, 48, 1, 1, 1, 1, 1, 1, 49, 1, 1, 1, 1, 1, + 50, 51, 52, 53, 54, 55, 56, 57, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 58, 59, 1, 1, 1, 60, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 61, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 62, 63, 1, 1, 1, 1, 1, 1, 1, 64, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 65, 1, 1, 1, 1, 1, 1, 1, + 1, 66, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 42, 67, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +}; + +static RE_UINT8 re_grapheme_cluster_break_stage_3[] = { + 0, 1, 2, 2, 2, 2, 2, 3, 1, 1, 4, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 5, 5, 5, 5, 5, 5, 5, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 6, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 7, 5, 8, 9, 2, 2, 2, + 10, 11, 2, 2, 12, 5, 2, 13, 2, 2, 2, 2, 2, 14, 15, 2, + 3, 16, 2, 5, 17, 2, 2, 2, 2, 2, 18, 13, 2, 2, 12, 19, + 2, 20, 21, 2, 2, 22, 2, 2, 2, 2, 2, 2, 2, 2, 23, 5, + 24, 2, 2, 25, 26, 27, 28, 2, 29, 2, 2, 30, 31, 32, 28, 2, + 33, 2, 2, 34, 35, 16, 2, 36, 33, 2, 2, 34, 37, 2, 28, 2, + 29, 2, 2, 38, 31, 39, 28, 2, 40, 2, 2, 41, 42, 32, 2, 2, + 43, 2, 2, 44, 45, 46, 28, 2, 29, 2, 2, 47, 48, 46, 28, 2, + 29, 2, 2, 41, 49, 32, 28, 2, 50, 2, 2, 2, 51, 52, 2, 50, + 2, 2, 2, 53, 54, 2, 2, 2, 2, 2, 2, 55, 56, 2, 2, 2, + 2, 57, 2, 58, 2, 2, 2, 59, 60, 61, 5, 62, 63, 2, 2, 2, + 2, 2, 64, 65, 2, 66, 13, 67, 68, 69, 2, 2, 2, 2, 2, 2, + 70, 70, 70, 70, 70, 70, 71, 71, 71, 71, 72, 73, 73, 73, 73, 73, + 2, 2, 2, 2, 2, 64, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 74, 2, 74, 2, 28, 2, 28, 2, 2, 2, 75, 76, 77, 2, 2, + 78, 2, 2, 2, 2, 2, 2, 2, 
2, 2, 79, 2, 2, 2, 2, 2, + 2, 2, 80, 81, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 82, 2, 2, 2, 83, 84, 85, 2, 2, 2, 86, 2, 2, 2, 2, + 87, 2, 2, 88, 89, 2, 12, 19, 90, 2, 91, 2, 2, 2, 92, 93, + 2, 2, 94, 95, 2, 2, 2, 2, 2, 2, 2, 2, 2, 96, 97, 98, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 5, 5, 5, 99, + 100, 2, 101, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 5, 5, 13, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 102, 103, + 2, 2, 2, 2, 2, 2, 2, 102, 2, 2, 2, 2, 2, 2, 5, 5, + 2, 2, 104, 2, 2, 2, 2, 2, 2, 105, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 102, 106, 2, 44, 2, 2, 2, 2, 2, 103, + 107, 2, 108, 2, 2, 2, 2, 2, 109, 2, 2, 110, 111, 2, 5, 103, + 2, 2, 112, 2, 113, 93, 70, 114, 24, 2, 2, 115, 116, 2, 117, 2, + 2, 2, 118, 119, 120, 2, 2, 121, 2, 2, 2, 122, 16, 2, 123, 124, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 125, 2, + 126, 127, 128, 129, 128, 130, 128, 126, 127, 128, 129, 128, 130, 128, 126, 127, + 128, 129, 128, 130, 128, 126, 127, 128, 129, 128, 130, 128, 126, 127, 128, 129, + 128, 130, 128, 126, 127, 128, 129, 128, 130, 128, 126, 127, 128, 129, 128, 130, + 128, 126, 127, 128, 129, 128, 130, 128, 126, 127, 128, 129, 128, 130, 128, 126, + 127, 128, 129, 128, 130, 128, 126, 127, 128, 129, 128, 130, 128, 126, 127, 128, + 129, 128, 130, 128, 126, 127, 128, 129, 128, 130, 128, 126, 127, 128, 129, 128, + 130, 128, 126, 127, 128, 129, 128, 130, 128, 126, 127, 128, 129, 128, 130, 128, + 128, 129, 128, 130, 128, 126, 127, 128, 129, 128, 131, 71, 132, 73, 73, 133, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 134, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 5, 2, 5, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 44, 2, 2, 2, 2, 2, 135, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 69, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 13, 2, + 2, 2, 2, 2, 2, 2, 2, 136, 2, 2, 2, 2, 2, 2, 2, 2, + 137, 2, 2, 138, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 46, 2, + 139, 2, 2, 140, 141, 2, 2, 102, 90, 2, 2, 142, 2, 2, 2, 2, + 143, 2, 144, 145, 2, 2, 2, 146, 90, 2, 2, 147, 148, 2, 
2, 2, + 2, 2, 149, 150, 2, 2, 2, 2, 2, 2, 2, 2, 2, 102, 151, 2, + 93, 2, 2, 30, 152, 32, 153, 145, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 154, 155, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 102, 156, 13, 157, 2, 2, + 2, 2, 2, 158, 13, 2, 2, 2, 2, 2, 159, 160, 2, 2, 2, 2, + 2, 64, 161, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 145, + 2, 2, 2, 141, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 162, 163, 164, 102, 143, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 165, 166, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 167, 168, 169, 2, 170, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 74, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 5, 5, 5, 171, 5, 5, 62, 117, 172, 12, 7, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 141, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 173, 174, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, +}; + +static RE_UINT8 re_grapheme_cluster_break_stage_4[] = { + 0, 0, 1, 2, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 4, + 3, 3, 3, 5, 6, 6, 6, 6, 7, 6, 8, 3, 9, 6, 6, 6, + 6, 6, 6, 10, 11, 10, 3, 3, 0, 12, 3, 3, 6, 6, 13, 14, + 3, 3, 7, 6, 15, 3, 3, 3, 3, 16, 6, 17, 6, 18, 19, 8, + 20, 3, 3, 3, 6, 6, 13, 3, 3, 16, 6, 6, 6, 3, 3, 3, + 3, 16, 10, 6, 6, 9, 9, 8, 3, 3, 9, 3, 7, 6, 6, 6, + 21, 3, 3, 3, 3, 3, 22, 23, 24, 6, 25, 26, 9, 6, 3, 3, + 16, 3, 3, 3, 27, 3, 3, 3, 3, 3, 3, 28, 24, 29, 30, 31, + 3, 7, 3, 3, 32, 3, 3, 3, 3, 3, 3, 23, 33, 7, 18, 8, + 8, 20, 3, 3, 24, 10, 34, 31, 3, 3, 3, 19, 3, 16, 3, 3, + 35, 3, 3, 3, 3, 3, 3, 22, 36, 37, 38, 31, 25, 3, 3, 3, + 3, 3, 3, 16, 25, 39, 19, 8, 3, 11, 3, 3, 3, 3, 3, 40, + 41, 42, 38, 8, 24, 23, 38, 31, 37, 3, 3, 3, 3, 3, 35, 7, + 43, 44, 45, 46, 47, 6, 13, 3, 3, 7, 6, 13, 47, 6, 10, 15, + 3, 3, 6, 8, 3, 3, 8, 3, 3, 48, 20, 37, 9, 6, 6, 21, + 6, 19, 3, 9, 6, 6, 9, 6, 6, 6, 6, 15, 3, 35, 3, 3, + 3, 3, 3, 9, 49, 6, 32, 33, 3, 37, 8, 16, 9, 15, 3, 3, + 35, 33, 3, 20, 3, 3, 3, 20, 50, 50, 50, 50, 51, 51, 51, 51, + 51, 51, 52, 52, 52, 52, 52, 52, 16, 15, 3, 3, 3, 53, 
6, 54, + 45, 41, 24, 6, 6, 3, 3, 20, 3, 3, 7, 55, 3, 3, 20, 3, + 21, 46, 25, 3, 41, 45, 24, 3, 3, 7, 56, 3, 3, 57, 6, 13, + 44, 9, 6, 25, 46, 6, 6, 18, 6, 6, 6, 13, 6, 58, 3, 3, + 3, 49, 21, 25, 41, 58, 3, 3, 59, 3, 3, 3, 60, 54, 53, 8, + 3, 22, 54, 61, 54, 3, 3, 3, 3, 45, 45, 6, 6, 43, 3, 3, + 13, 6, 6, 6, 49, 6, 15, 20, 37, 15, 8, 3, 6, 8, 3, 6, + 3, 3, 4, 62, 3, 3, 0, 63, 3, 3, 3, 7, 8, 3, 3, 3, + 3, 3, 16, 6, 3, 3, 11, 3, 13, 6, 6, 8, 35, 35, 7, 3, + 64, 65, 3, 3, 66, 3, 3, 3, 3, 45, 45, 45, 45, 15, 3, 3, + 3, 16, 6, 8, 3, 7, 6, 6, 50, 50, 50, 67, 7, 43, 54, 25, + 58, 3, 3, 3, 3, 20, 3, 3, 3, 3, 9, 21, 65, 33, 3, 3, + 7, 3, 3, 68, 3, 3, 3, 15, 19, 18, 15, 16, 3, 3, 64, 54, + 3, 69, 3, 3, 64, 26, 36, 31, 70, 71, 71, 71, 71, 71, 71, 70, + 71, 71, 71, 71, 71, 71, 70, 71, 71, 70, 71, 71, 71, 3, 3, 3, + 51, 72, 73, 52, 52, 52, 52, 3, 3, 3, 3, 35, 0, 0, 0, 3, + 3, 16, 13, 3, 9, 11, 3, 6, 3, 3, 13, 7, 74, 3, 3, 3, + 3, 3, 6, 6, 6, 13, 3, 3, 46, 21, 33, 5, 13, 3, 3, 3, + 3, 7, 6, 24, 6, 15, 3, 3, 7, 3, 3, 3, 64, 43, 6, 21, + 58, 3, 16, 15, 3, 3, 3, 46, 54, 49, 3, 3, 46, 6, 13, 3, + 25, 30, 30, 66, 37, 16, 6, 15, 56, 6, 75, 61, 49, 3, 3, 3, + 43, 8, 45, 53, 3, 3, 3, 8, 46, 6, 21, 61, 3, 3, 7, 26, + 6, 53, 3, 3, 43, 53, 6, 3, 76, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 77, 3, 3, 3, 11, 0, 3, 3, 3, 3, 78, 8, 60, + 79, 0, 80, 6, 13, 9, 6, 3, 3, 3, 16, 8, 6, 13, 7, 6, + 3, 15, 3, 3, 3, 81, 82, 82, 82, 82, 82, 82, +}; + +static RE_UINT8 re_grapheme_cluster_break_stage_5[] = { + 3, 3, 3, 3, 3, 3, 2, 3, 3, 1, 3, 3, 0, 0, 0, 0, + 0, 0, 0, 3, 0, 3, 0, 0, 4, 4, 4, 4, 0, 0, 0, 4, + 4, 4, 0, 0, 0, 4, 4, 4, 4, 4, 0, 4, 0, 4, 4, 0, + 3, 3, 0, 0, 4, 4, 4, 0, 3, 0, 0, 0, 4, 0, 0, 0, + 0, 0, 4, 4, 4, 3, 0, 4, 4, 0, 0, 4, 4, 0, 4, 4, + 0, 4, 0, 0, 4, 4, 4, 6, 0, 0, 4, 6, 4, 0, 6, 6, + 6, 4, 4, 4, 4, 6, 6, 6, 6, 4, 6, 6, 0, 4, 6, 6, + 4, 0, 4, 6, 4, 0, 0, 6, 6, 0, 0, 6, 6, 4, 0, 0, + 0, 4, 4, 6, 6, 4, 4, 0, 4, 6, 0, 6, 0, 0, 4, 0, + 4, 6, 6, 0, 0, 0, 6, 6, 6, 0, 6, 6, 6, 0, 4, 4, + 4, 
0, 6, 4, 6, 6, 4, 6, 6, 0, 4, 6, 6, 6, 4, 4, + 4, 0, 4, 0, 6, 6, 6, 6, 6, 6, 6, 4, 0, 4, 0, 6, + 0, 4, 0, 4, 4, 6, 4, 4, 7, 7, 7, 7, 8, 8, 8, 8, + 9, 9, 9, 9, 4, 4, 6, 4, 4, 4, 6, 6, 4, 4, 3, 0, + 4, 6, 6, 4, 0, 6, 4, 6, 6, 0, 0, 0, 4, 4, 6, 0, + 0, 6, 4, 4, 6, 4, 6, 4, 4, 4, 3, 3, 3, 3, 3, 0, + 0, 0, 0, 6, 6, 4, 4, 6, 6, 6, 0, 0, 7, 0, 0, 0, + 4, 6, 0, 0, 0, 6, 4, 0, 10, 11, 11, 11, 11, 11, 11, 11, + 8, 8, 8, 0, 0, 0, 0, 9, 6, 4, 6, 0, 4, 6, 4, 6, + 0, 6, 6, 6, 6, 6, 6, 0, 0, 4, 6, 4, 4, 4, 4, 3, + 3, 3, 3, 4, 0, 0, 5, 5, 5, 5, 5, 5, +}; + +/* Grapheme_Cluster_Break: 2640 bytes. */ + +RE_UINT32 re_get_grapheme_cluster_break(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 13; + code = ch ^ (f << 13); + pos = (RE_UINT32)re_grapheme_cluster_break_stage_1[f] << 5; + f = code >> 8; + code ^= f << 8; + pos = (RE_UINT32)re_grapheme_cluster_break_stage_2[pos + f] << 4; + f = code >> 4; + code ^= f << 4; + pos = (RE_UINT32)re_grapheme_cluster_break_stage_3[pos + f] << 2; + f = code >> 2; + code ^= f << 2; + pos = (RE_UINT32)re_grapheme_cluster_break_stage_4[pos + f] << 2; + value = re_grapheme_cluster_break_stage_5[pos + code]; + + return value; +} + +/* Sentence_Break. 
*/ + +static RE_UINT8 re_sentence_break_stage_1[] = { + 0, 1, 2, 3, 4, 5, 5, 5, 5, 6, 7, 5, 5, 8, 9, 10, + 11, 12, 13, 14, 15, 9, 16, 9, 9, 9, 9, 17, 9, 18, 19, 20, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 21, 22, 23, 9, 9, 24, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 25, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, +}; + +static RE_UINT8 re_sentence_break_stage_2[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 17, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, + 31, 32, 33, 34, 35, 33, 33, 36, 33, 37, 33, 33, 38, 39, 40, 33, + 41, 42, 33, 33, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 43, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 44, + 17, 17, 17, 17, 45, 17, 46, 47, 48, 49, 50, 51, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 52, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 17, 53, 54, 17, 55, 56, 57, + 58, 59, 60, 61, 62, 63, 17, 64, 65, 66, 67, 68, 69, 33, 33, 33, + 70, 71, 72, 73, 74, 75, 76, 77, 78, 33, 79, 33, 33, 33, 33, 33, + 17, 17, 17, 80, 81, 82, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 17, 17, 17, 17, 83, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 17, 17, 84, 33, 33, 33, 
33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 17, 17, 85, 86, 33, 33, 33, 87, + 88, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 89, 33, 33, 33, + 33, 90, 91, 33, 92, 93, 94, 95, 33, 33, 96, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 97, 33, 33, 33, 33, 33, 98, 33, + 33, 99, 33, 33, 33, 33, 100, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 17, 17, 17, 17, 17, 17, 101, 17, 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 102, 103, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 104, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 17, 17, 105, 33, 33, 33, 33, 33, + 106, 107, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, +}; + +static RE_UINT16 re_sentence_break_stage_3[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 8, 16, 17, 18, 19, 20, 21, 22, 23, 23, 23, 24, 25, 26, 27, 28, + 29, 30, 18, 8, 31, 8, 32, 8, 8, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 41, 41, 44, 45, 46, 47, 48, 41, 41, 49, 50, 51, + 52, 53, 54, 55, 55, 56, 55, 57, 58, 59, 60, 61, 62, 63, 64, 65, + 66, 67, 68, 69, 70, 71, 72, 73, 74, 71, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 85, 89, 90, 91, 92, 93, 94, 95, + 96, 97, 98, 55, 99, 100, 101, 55, 102, 103, 104, 105, 106, 107, 108, 55, + 41, 109, 110, 111, 112, 29, 113, 114, 41, 41, 41, 41, 41, 41, 41, 41, + 41, 41, 115, 41, 116, 117, 118, 41, 119, 41, 120, 121, 122, 29, 29, 123, + 96, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 124, 125, 41, 41, 126, + 127, 128, 129, 130, 41, 131, 132, 133, 134, 41, 41, 135, 41, 136, 41, 137, + 138, 139, 140, 141, 41, 142, 143, 55, 144, 41, 145, 146, 147, 148, 55, 55, + 149, 131, 150, 151, 152, 153, 41, 154, 41, 155, 156, 157, 55, 55, 158, 159, + 18, 18, 18, 18, 18, 18, 23, 160, 8, 8, 8, 8, 161, 8, 8, 8, + 162, 163, 164, 165, 163, 166, 167, 168, 169, 170, 171, 172, 173, 55, 174, 175, + 176, 177, 178, 30, 179, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 180, 181, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 182, 30, 183, 
+ 55, 55, 184, 185, 55, 55, 186, 187, 55, 55, 55, 55, 188, 55, 189, 190, + 29, 191, 192, 193, 8, 8, 8, 194, 18, 195, 41, 196, 197, 198, 198, 23, + 199, 200, 201, 55, 55, 55, 55, 55, 202, 203, 96, 41, 204, 96, 41, 114, + 205, 206, 41, 41, 207, 208, 55, 209, 41, 41, 41, 41, 41, 137, 55, 55, + 41, 41, 41, 41, 41, 41, 137, 55, 41, 41, 41, 41, 210, 55, 209, 211, + 212, 213, 8, 214, 215, 41, 41, 216, 217, 218, 8, 219, 220, 221, 55, 222, + 223, 224, 41, 225, 226, 131, 227, 228, 50, 229, 230, 231, 58, 232, 233, 234, + 41, 235, 236, 237, 41, 238, 239, 240, 241, 242, 243, 244, 18, 18, 41, 245, + 41, 41, 41, 41, 41, 246, 247, 248, 41, 41, 41, 249, 41, 41, 250, 55, + 251, 252, 253, 41, 41, 254, 255, 41, 41, 256, 209, 41, 257, 41, 258, 259, + 260, 261, 262, 263, 41, 41, 41, 264, 265, 2, 266, 267, 268, 138, 269, 270, + 271, 272, 273, 55, 41, 41, 41, 208, 55, 55, 41, 56, 55, 55, 55, 274, + 55, 55, 55, 55, 231, 41, 275, 276, 41, 209, 277, 278, 279, 41, 280, 55, + 29, 281, 282, 41, 279, 133, 55, 55, 41, 283, 41, 284, 55, 55, 55, 55, + 41, 197, 137, 258, 55, 55, 55, 55, 285, 286, 137, 197, 138, 55, 55, 287, + 137, 250, 55, 55, 41, 288, 55, 55, 289, 290, 291, 231, 231, 55, 104, 292, + 41, 137, 137, 293, 254, 55, 55, 55, 41, 41, 294, 55, 29, 295, 18, 296, + 152, 297, 298, 299, 152, 300, 301, 302, 152, 303, 304, 305, 152, 232, 306, 55, + 307, 308, 55, 55, 309, 310, 311, 312, 313, 71, 314, 315, 55, 55, 55, 55, + 55, 55, 55, 55, 41, 47, 316, 55, 55, 55, 55, 55, 41, 317, 318, 55, + 41, 47, 319, 55, 41, 320, 133, 55, 321, 322, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 29, 18, 323, 55, 55, 55, 55, 55, 55, 41, 324, + 41, 41, 41, 41, 250, 55, 55, 55, 41, 41, 41, 207, 41, 41, 41, 41, + 41, 41, 284, 55, 55, 55, 55, 55, 41, 207, 55, 55, 55, 55, 55, 55, + 41, 41, 325, 55, 55, 55, 55, 55, 41, 324, 138, 326, 55, 55, 209, 327, + 41, 328, 329, 330, 122, 55, 55, 55, 41, 41, 331, 332, 333, 55, 55, 55, + 334, 55, 55, 55, 55, 55, 55, 55, 41, 41, 41, 335, 336, 337, 55, 55, + 55, 55, 55, 338, 339, 340, 
55, 55, 55, 55, 341, 55, 55, 55, 55, 55, + 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 342, 343, 355, + 345, 356, 357, 358, 349, 359, 360, 361, 362, 363, 364, 191, 365, 366, 367, 368, + 23, 369, 23, 370, 371, 372, 55, 55, 41, 41, 41, 41, 41, 41, 373, 55, + 374, 375, 376, 377, 378, 379, 55, 55, 55, 380, 381, 381, 382, 55, 55, 55, + 55, 55, 55, 383, 55, 55, 55, 55, 41, 41, 41, 41, 41, 41, 197, 55, + 41, 56, 41, 41, 41, 41, 41, 41, 279, 41, 41, 41, 41, 41, 41, 41, + 41, 41, 41, 41, 41, 334, 55, 55, 279, 55, 55, 55, 55, 55, 55, 55, + 384, 385, 385, 385, 55, 55, 55, 55, 23, 23, 23, 23, 23, 23, 23, 386, +}; + +static RE_UINT8 re_sentence_break_stage_4[] = { + 0, 0, 1, 2, 0, 0, 0, 0, 3, 4, 5, 6, 7, 7, 8, 9, + 10, 11, 11, 11, 11, 11, 12, 13, 14, 15, 15, 15, 15, 15, 16, 13, + 0, 17, 0, 0, 0, 0, 0, 0, 18, 0, 19, 20, 0, 21, 19, 0, + 11, 11, 11, 11, 11, 22, 11, 23, 15, 15, 15, 15, 15, 24, 15, 15, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, + 26, 26, 27, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 28, 29, + 30, 31, 32, 33, 28, 31, 34, 28, 25, 31, 29, 31, 32, 26, 35, 34, + 36, 28, 31, 26, 26, 26, 26, 27, 25, 25, 25, 25, 30, 31, 25, 25, + 25, 25, 25, 25, 25, 15, 33, 30, 26, 23, 25, 25, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 37, 15, 15, + 15, 15, 15, 15, 15, 15, 38, 36, 39, 40, 36, 36, 41, 0, 0, 0, + 15, 42, 0, 43, 0, 0, 0, 0, 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 25, 45, 46, 47, 0, 48, 22, 49, 32, 11, 11, 11, + 50, 11, 11, 15, 15, 15, 15, 15, 15, 15, 15, 51, 33, 34, 25, 25, + 25, 25, 25, 25, 15, 52, 30, 32, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 15, 15, 15, 15, 53, 44, 54, 25, 25, 25, 25, 25, + 28, 26, 26, 29, 25, 25, 25, 25, 25, 25, 25, 25, 10, 11, 11, 11, + 11, 11, 11, 11, 11, 22, 55, 56, 14, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 57, 0, 58, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 59, + 60, 59, 0, 0, 36, 36, 36, 36, 36, 36, 61, 0, 36, 0, 0, 0, + 62, 63, 0, 64, 44, 44, 65, 66, 36, 36, 
36, 36, 36, 36, 36, 36, + 36, 36, 67, 44, 44, 44, 44, 44, 7, 7, 68, 69, 70, 36, 36, 36, + 36, 36, 36, 36, 36, 71, 44, 72, 44, 73, 74, 75, 7, 7, 76, 77, + 78, 0, 0, 79, 80, 36, 36, 36, 36, 36, 36, 36, 44, 44, 44, 44, + 44, 44, 65, 81, 36, 36, 36, 36, 36, 82, 44, 44, 83, 0, 0, 0, + 7, 7, 76, 36, 36, 36, 36, 36, 36, 36, 67, 44, 44, 41, 84, 0, + 36, 36, 36, 36, 36, 82, 85, 44, 44, 86, 86, 87, 0, 0, 0, 0, + 36, 36, 36, 36, 36, 36, 86, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 36, 36, 36, 36, 36, 88, 0, 0, 89, 44, 44, 44, 44, 44, 44, 44, + 44, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 82, 90, + 44, 44, 44, 44, 86, 44, 36, 36, 82, 91, 7, 7, 81, 36, 36, 36, + 86, 81, 36, 77, 77, 36, 36, 36, 36, 36, 92, 36, 43, 40, 41, 90, + 44, 93, 93, 94, 0, 89, 0, 95, 82, 96, 7, 7, 41, 0, 0, 0, + 58, 81, 61, 97, 77, 36, 36, 36, 36, 36, 92, 36, 92, 98, 41, 74, + 65, 89, 93, 87, 99, 0, 81, 43, 0, 96, 7, 7, 75, 100, 0, 0, + 58, 81, 36, 95, 95, 36, 36, 36, 36, 36, 92, 36, 92, 81, 41, 90, + 44, 59, 59, 87, 88, 0, 0, 0, 82, 96, 7, 7, 0, 0, 55, 0, + 58, 81, 36, 77, 77, 36, 36, 36, 44, 93, 93, 87, 0, 101, 0, 95, + 82, 96, 7, 7, 55, 0, 0, 0, 102, 81, 61, 40, 92, 41, 98, 92, + 97, 88, 61, 40, 36, 36, 41, 101, 65, 101, 74, 87, 88, 89, 0, 0, + 0, 96, 7, 7, 0, 0, 0, 0, 44, 81, 36, 92, 92, 36, 36, 36, + 36, 36, 92, 36, 36, 36, 41, 103, 44, 74, 74, 87, 0, 60, 61, 0, + 82, 96, 7, 7, 0, 0, 0, 0, 58, 81, 36, 92, 92, 36, 36, 36, + 36, 36, 92, 36, 36, 81, 41, 90, 44, 74, 74, 87, 0, 60, 0, 104, + 82, 96, 7, 7, 98, 0, 0, 0, 36, 36, 36, 36, 36, 36, 61, 103, + 44, 74, 74, 94, 0, 89, 0, 97, 82, 96, 7, 7, 0, 0, 40, 36, + 101, 81, 36, 36, 36, 61, 40, 36, 36, 36, 36, 36, 95, 36, 36, 55, + 36, 61, 105, 89, 44, 106, 44, 44, 0, 96, 7, 7, 101, 0, 0, 0, + 81, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 80, 44, 65, 0, + 36, 67, 44, 65, 7, 7, 107, 0, 98, 77, 43, 55, 0, 36, 81, 36, + 81, 108, 40, 81, 80, 44, 59, 83, 36, 43, 44, 87, 7, 7, 107, 36, + 88, 0, 0, 0, 0, 0, 87, 0, 7, 7, 107, 0, 0, 109, 110, 111, + 36, 36, 81, 36, 
36, 36, 36, 36, 36, 36, 36, 88, 58, 44, 44, 44, + 44, 74, 36, 86, 44, 44, 58, 44, 44, 44, 44, 44, 44, 44, 44, 112, + 0, 105, 0, 0, 0, 0, 0, 0, 36, 36, 67, 44, 44, 44, 44, 113, + 7, 7, 114, 0, 36, 82, 75, 82, 90, 73, 44, 75, 86, 70, 36, 36, + 82, 44, 44, 85, 7, 7, 115, 87, 11, 50, 0, 116, 36, 36, 36, 36, + 36, 36, 36, 36, 36, 36, 61, 36, 36, 36, 92, 41, 36, 61, 92, 41, + 36, 36, 92, 41, 36, 36, 36, 36, 36, 36, 36, 36, 92, 41, 36, 61, + 92, 41, 36, 36, 36, 61, 36, 36, 36, 36, 36, 36, 92, 41, 36, 36, + 36, 36, 36, 36, 36, 36, 61, 58, 117, 9, 118, 0, 0, 0, 0, 0, + 36, 36, 36, 36, 0, 0, 0, 0, 11, 11, 11, 11, 11, 119, 15, 39, + 36, 36, 36, 120, 36, 36, 36, 36, 121, 36, 36, 36, 36, 36, 122, 123, + 36, 36, 61, 40, 36, 36, 88, 0, 36, 36, 36, 92, 82, 112, 0, 0, + 36, 36, 36, 36, 82, 124, 0, 0, 36, 36, 36, 36, 82, 0, 0, 0, + 36, 36, 36, 92, 125, 0, 0, 0, 36, 36, 36, 36, 36, 44, 44, 44, + 44, 44, 44, 44, 44, 97, 0, 100, 7, 7, 107, 0, 0, 0, 0, 0, + 126, 0, 127, 128, 7, 7, 107, 0, 36, 36, 36, 36, 36, 36, 0, 0, + 36, 36, 129, 0, 36, 36, 36, 36, 36, 36, 36, 36, 36, 41, 0, 0, + 36, 36, 36, 36, 36, 36, 36, 61, 44, 44, 44, 0, 44, 44, 44, 0, + 0, 91, 7, 7, 36, 36, 36, 36, 36, 36, 36, 41, 36, 88, 0, 0, + 36, 36, 36, 0, 36, 36, 36, 36, 36, 36, 41, 0, 7, 7, 107, 0, + 36, 36, 36, 36, 36, 67, 44, 0, 36, 36, 36, 36, 36, 86, 44, 65, + 44, 44, 44, 44, 44, 44, 44, 93, 7, 7, 107, 0, 7, 7, 107, 0, + 0, 97, 130, 0, 44, 44, 44, 65, 44, 70, 36, 36, 36, 36, 36, 36, + 44, 70, 36, 0, 7, 7, 114, 131, 0, 0, 89, 44, 44, 0, 0, 0, + 113, 36, 36, 36, 36, 36, 36, 36, 86, 44, 44, 75, 7, 7, 76, 36, + 36, 82, 44, 44, 44, 0, 0, 0, 36, 44, 44, 44, 44, 44, 9, 118, + 7, 7, 107, 81, 7, 7, 76, 36, 36, 36, 36, 36, 36, 36, 36, 132, + 0, 0, 0, 0, 65, 44, 44, 44, 44, 44, 70, 80, 82, 133, 87, 0, + 44, 44, 44, 44, 44, 87, 0, 44, 25, 25, 25, 25, 25, 34, 15, 27, + 15, 15, 11, 11, 15, 39, 11, 119, 15, 15, 11, 11, 15, 15, 11, 11, + 15, 39, 11, 119, 15, 15, 134, 134, 15, 15, 11, 11, 15, 15, 15, 39, + 15, 15, 11, 11, 15, 135, 
11, 136, 46, 135, 11, 137, 15, 46, 11, 0, + 15, 15, 11, 137, 46, 135, 11, 137, 138, 138, 139, 140, 141, 142, 143, 143, + 0, 144, 145, 146, 0, 0, 147, 148, 0, 149, 148, 0, 0, 0, 0, 150, + 62, 151, 62, 62, 21, 0, 0, 152, 0, 0, 0, 147, 15, 15, 15, 42, + 0, 0, 0, 0, 44, 44, 44, 44, 44, 44, 44, 44, 112, 0, 0, 0, + 48, 153, 154, 155, 23, 116, 10, 119, 0, 156, 49, 157, 11, 38, 158, 33, + 0, 159, 39, 160, 0, 0, 0, 0, 161, 38, 88, 0, 0, 0, 0, 0, + 0, 0, 143, 0, 0, 0, 0, 0, 0, 0, 147, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 162, 11, 11, 15, 15, 39, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 4, 143, 123, 0, 143, 143, 143, 5, 0, 0, + 0, 147, 0, 0, 0, 0, 0, 0, 0, 163, 143, 143, 0, 0, 0, 0, + 4, 143, 143, 143, 143, 143, 123, 0, 0, 0, 0, 0, 0, 0, 143, 0, + 0, 0, 0, 0, 0, 0, 0, 5, 11, 11, 11, 22, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 24, 31, 164, 26, 32, 25, 29, 15, 33, + 25, 42, 153, 165, 54, 0, 0, 0, 15, 166, 0, 21, 36, 36, 36, 36, + 36, 36, 0, 97, 0, 0, 0, 89, 36, 36, 36, 36, 36, 61, 0, 0, + 36, 61, 36, 61, 36, 61, 36, 61, 143, 143, 143, 5, 0, 0, 0, 5, + 143, 143, 5, 167, 0, 0, 0, 118, 168, 0, 0, 0, 0, 0, 0, 0, + 169, 81, 143, 143, 5, 143, 143, 170, 81, 36, 82, 44, 81, 41, 36, 88, + 36, 36, 36, 36, 36, 61, 60, 81, 0, 81, 36, 36, 36, 36, 36, 36, + 36, 36, 36, 41, 81, 36, 36, 36, 36, 36, 36, 61, 0, 0, 0, 0, + 36, 36, 36, 36, 36, 36, 61, 0, 0, 0, 0, 0, 36, 36, 36, 36, + 36, 36, 36, 88, 0, 0, 0, 0, 36, 36, 36, 36, 36, 36, 36, 171, + 36, 36, 36, 172, 36, 36, 36, 36, 7, 7, 76, 0, 0, 0, 0, 0, + 25, 25, 25, 173, 65, 44, 44, 174, 25, 25, 25, 25, 25, 25, 25, 175, + 36, 36, 36, 36, 176, 9, 0, 0, 0, 0, 0, 0, 0, 97, 36, 36, + 177, 25, 25, 25, 27, 25, 25, 25, 25, 25, 25, 25, 15, 15, 26, 30, + 25, 25, 178, 179, 25, 27, 25, 25, 25, 25, 31, 119, 11, 25, 0, 0, + 0, 0, 0, 0, 0, 97, 180, 36, 181, 181, 67, 36, 36, 36, 36, 36, + 67, 44, 0, 0, 0, 0, 0, 0, 36, 36, 36, 36, 36, 131, 0, 0, + 75, 36, 36, 36, 36, 36, 36, 36, 44, 112, 0, 131, 7, 7, 107, 0, + 44, 44, 44, 44, 75, 36, 97, 55, 36, 82, 44, 176, 36, 36, 
36, 36, + 36, 67, 44, 44, 44, 0, 0, 0, 36, 36, 36, 36, 36, 36, 36, 88, + 36, 36, 36, 36, 67, 44, 44, 44, 112, 0, 148, 97, 7, 7, 107, 0, + 36, 80, 36, 36, 7, 7, 76, 61, 36, 36, 86, 44, 44, 65, 0, 0, + 67, 36, 36, 87, 7, 7, 107, 182, 36, 36, 36, 36, 36, 61, 183, 75, + 36, 36, 36, 36, 90, 73, 70, 82, 129, 0, 0, 0, 0, 0, 97, 41, + 36, 36, 67, 44, 184, 185, 0, 0, 81, 61, 81, 61, 81, 61, 0, 0, + 36, 61, 36, 61, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 24, 15, + 15, 39, 0, 0, 15, 15, 15, 15, 67, 44, 186, 87, 7, 7, 107, 0, + 36, 0, 0, 0, 36, 36, 36, 36, 36, 61, 97, 36, 36, 36, 36, 36, + 36, 36, 36, 36, 36, 36, 36, 0, 36, 36, 36, 41, 36, 36, 36, 36, + 36, 36, 36, 36, 36, 36, 41, 0, 15, 24, 0, 0, 187, 15, 0, 188, + 36, 36, 92, 36, 36, 61, 36, 43, 95, 92, 36, 36, 36, 36, 36, 36, + 36, 36, 36, 36, 41, 0, 0, 0, 0, 0, 0, 0, 97, 36, 36, 36, + 36, 36, 36, 36, 36, 36, 36, 189, 36, 36, 36, 36, 40, 36, 36, 36, + 36, 36, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 36, 36, 36, 0, + 44, 44, 44, 44, 190, 4, 123, 0, 44, 44, 44, 44, 191, 170, 143, 143, + 143, 192, 123, 0, 6, 193, 194, 195, 141, 0, 0, 0, 36, 92, 36, 36, + 36, 36, 36, 36, 36, 36, 36, 196, 57, 0, 5, 6, 0, 0, 197, 9, + 14, 15, 15, 15, 15, 15, 16, 198, 199, 200, 36, 36, 36, 36, 36, 36, + 36, 36, 36, 36, 36, 36, 36, 82, 40, 36, 40, 36, 40, 36, 40, 88, + 0, 0, 0, 0, 0, 0, 201, 0, 36, 36, 36, 81, 36, 36, 36, 36, + 36, 61, 36, 36, 36, 36, 61, 95, 36, 36, 36, 41, 36, 36, 36, 41, + 0, 0, 0, 0, 0, 0, 0, 99, 36, 36, 36, 36, 88, 0, 0, 0, + 112, 0, 0, 0, 0, 0, 0, 0, 36, 36, 61, 0, 36, 36, 36, 36, + 36, 36, 36, 36, 36, 82, 65, 0, 36, 36, 36, 36, 36, 36, 36, 41, + 36, 0, 36, 36, 81, 41, 0, 0, 11, 11, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 36, 36, 36, 36, 36, 36, 0, 0, 36, 36, 36, 36, + 36, 0, 0, 0, 0, 0, 0, 0, 36, 41, 92, 36, 36, 36, 36, 36, + 36, 36, 36, 36, 36, 95, 88, 77, 36, 36, 36, 36, 61, 41, 0, 0, + 36, 36, 36, 36, 36, 36, 0, 40, 86, 60, 0, 44, 36, 81, 81, 36, + 36, 36, 36, 36, 36, 0, 65, 89, 0, 0, 0, 0, 0, 131, 0, 0, + 36, 185, 0, 0, 0, 0, 0, 
0, 36, 36, 36, 36, 61, 0, 0, 0, + 36, 36, 88, 0, 0, 0, 0, 0, 11, 11, 11, 11, 22, 0, 0, 0, + 15, 15, 15, 15, 24, 0, 0, 0, 36, 36, 36, 36, 36, 36, 44, 44, + 44, 186, 118, 0, 0, 0, 0, 0, 0, 96, 7, 7, 0, 0, 0, 89, + 36, 36, 36, 36, 44, 44, 65, 202, 148, 0, 0, 0, 36, 36, 36, 36, + 36, 36, 88, 0, 7, 7, 107, 0, 36, 67, 44, 44, 44, 203, 7, 7, + 182, 0, 0, 0, 36, 36, 36, 36, 36, 36, 36, 36, 67, 104, 0, 0, + 70, 204, 101, 205, 7, 7, 206, 172, 36, 36, 36, 36, 95, 36, 36, 36, + 36, 36, 36, 44, 44, 44, 207, 118, 36, 61, 92, 95, 36, 36, 36, 95, + 36, 36, 208, 0, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 67, + 44, 44, 65, 0, 7, 7, 107, 0, 44, 81, 36, 77, 77, 36, 36, 36, + 44, 93, 93, 87, 88, 89, 0, 81, 82, 101, 44, 112, 44, 112, 0, 0, + 44, 95, 0, 0, 7, 7, 107, 0, 36, 36, 36, 67, 44, 87, 44, 44, + 209, 0, 182, 130, 130, 130, 36, 87, 124, 88, 0, 0, 7, 7, 107, 0, + 36, 36, 67, 44, 44, 44, 0, 0, 36, 36, 36, 36, 36, 36, 41, 58, + 44, 44, 44, 0, 7, 7, 107, 78, 7, 7, 107, 0, 0, 0, 0, 97, + 36, 36, 36, 36, 36, 36, 88, 0, 36, 61, 0, 0, 0, 0, 0, 0, + 7, 7, 107, 131, 0, 0, 0, 0, 36, 36, 36, 41, 44, 205, 0, 0, + 36, 36, 36, 36, 44, 186, 118, 0, 36, 118, 0, 0, 7, 7, 107, 0, + 97, 36, 36, 36, 36, 36, 0, 81, 36, 88, 0, 0, 86, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 65, 0, 0, 0, 89, 113, 36, 36, 36, + 41, 0, 0, 0, 0, 0, 0, 0, 36, 36, 61, 0, 36, 36, 36, 88, + 36, 36, 88, 0, 36, 36, 41, 210, 62, 0, 0, 0, 0, 0, 0, 0, + 0, 58, 87, 58, 211, 62, 212, 44, 65, 58, 44, 0, 0, 0, 0, 0, + 0, 0, 101, 87, 0, 0, 0, 0, 101, 112, 0, 0, 0, 0, 0, 0, + 11, 11, 11, 11, 11, 11, 155, 15, 15, 15, 15, 15, 15, 11, 11, 11, + 11, 11, 11, 155, 15, 135, 15, 15, 15, 15, 11, 11, 11, 11, 11, 11, + 155, 15, 15, 15, 15, 15, 15, 49, 48, 213, 10, 49, 11, 155, 166, 14, + 15, 14, 15, 15, 11, 11, 11, 11, 11, 11, 155, 15, 15, 15, 15, 15, + 15, 50, 22, 10, 11, 49, 11, 214, 15, 15, 15, 15, 15, 15, 50, 22, + 11, 156, 162, 11, 214, 15, 15, 15, 15, 15, 15, 11, 11, 11, 11, 11, + 11, 155, 15, 15, 15, 15, 15, 15, 11, 11, 11, 155, 15, 15, 
15, 15, + 155, 15, 15, 15, 15, 15, 15, 11, 11, 11, 11, 11, 11, 155, 15, 15, + 15, 15, 15, 15, 11, 11, 11, 11, 15, 39, 11, 11, 11, 11, 11, 11, + 214, 15, 15, 15, 15, 15, 24, 15, 33, 11, 11, 11, 11, 11, 22, 15, + 15, 15, 15, 15, 15, 135, 15, 11, 11, 11, 11, 11, 11, 214, 15, 15, + 15, 15, 15, 24, 15, 33, 11, 11, 15, 15, 135, 15, 11, 11, 11, 11, + 11, 11, 214, 15, 15, 15, 15, 15, 24, 15, 27, 96, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 44, 44, 44, 44, 44, 65, 89, 44, + 44, 44, 44, 112, 0, 99, 0, 0, 0, 112, 118, 0, 0, 0, 89, 44, + 58, 44, 44, 44, 0, 0, 0, 0, 36, 88, 0, 0, 44, 65, 0, 0, + 36, 81, 36, 36, 36, 36, 36, 36, 98, 77, 81, 36, 61, 36, 108, 0, + 104, 97, 108, 81, 98, 77, 108, 108, 98, 77, 61, 36, 61, 36, 81, 43, + 36, 36, 95, 36, 36, 36, 36, 0, 81, 81, 95, 36, 36, 36, 36, 0, + 0, 0, 0, 0, 11, 11, 11, 11, 11, 11, 119, 0, 11, 11, 11, 11, + 11, 11, 119, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 163, 123, 0, + 20, 0, 0, 0, 0, 0, 0, 0, 62, 62, 62, 62, 62, 62, 62, 62, + 44, 44, 44, 44, 0, 0, 0, 0, +}; + +static RE_UINT8 re_sentence_break_stage_5[] = { + 0, 0, 0, 0, 0, 6, 2, 6, 6, 1, 0, 0, 6, 12, 13, 0, + 0, 0, 0, 13, 13, 13, 0, 0, 14, 14, 11, 0, 10, 10, 10, 10, + 10, 10, 14, 0, 0, 0, 0, 12, 0, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 13, 0, 13, 0, 0, 0, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 13, 0, 4, 0, 0, 6, 0, 0, 0, 0, 0, 7, 13, + 0, 5, 0, 0, 0, 7, 0, 0, 8, 8, 8, 0, 8, 8, 8, 7, + 7, 7, 7, 0, 8, 7, 8, 7, 7, 8, 7, 8, 7, 7, 8, 7, + 8, 8, 7, 8, 7, 8, 7, 7, 7, 8, 8, 7, 8, 7, 8, 8, + 7, 8, 8, 8, 7, 7, 8, 8, 8, 7, 7, 7, 8, 7, 7, 9, + 9, 9, 9, 9, 9, 7, 7, 7, 7, 9, 9, 9, 7, 7, 0, 0, + 0, 0, 9, 9, 9, 9, 0, 0, 7, 0, 0, 0, 9, 0, 9, 0, + 3, 3, 3, 3, 9, 0, 8, 7, 0, 0, 7, 7, 7, 7, 0, 8, + 0, 0, 8, 0, 8, 0, 8, 8, 8, 8, 0, 8, 7, 7, 7, 8, + 8, 7, 0, 8, 8, 7, 0, 3, 3, 3, 8, 7, 0, 9, 0, 0, + 0, 14, 0, 0, 0, 12, 0, 0, 0, 3, 3, 3, 3, 3, 0, 3, + 0, 3, 3, 0, 9, 9, 9, 0, 5, 5, 5, 5, 5, 5, 0, 0, + 14, 14, 0, 0, 3, 3, 3, 0, 5, 0, 0, 12, 9, 9, 9, 3, + 10, 10, 0, 10, 10, 0, 9, 9, 3, 9, 9, 9, 12, 9, 3, 3, + 3, 5, 0, 3, 3, 9, 
9, 3, 3, 0, 3, 3, 3, 3, 9, 9, + 10, 10, 9, 9, 9, 0, 0, 9, 12, 12, 12, 0, 0, 0, 0, 5, + 9, 3, 9, 9, 0, 9, 9, 9, 9, 9, 3, 3, 3, 9, 0, 0, + 14, 12, 9, 0, 3, 3, 9, 3, 9, 3, 3, 3, 3, 3, 0, 0, + 9, 0, 0, 0, 0, 0, 0, 3, 3, 9, 3, 3, 12, 12, 10, 10, + 9, 0, 9, 9, 3, 0, 0, 3, 3, 3, 9, 0, 9, 9, 0, 9, + 0, 0, 10, 10, 0, 0, 0, 9, 0, 9, 9, 0, 0, 3, 0, 0, + 9, 3, 0, 0, 0, 0, 3, 3, 0, 0, 3, 9, 0, 9, 3, 3, + 0, 0, 9, 0, 0, 0, 3, 0, 3, 0, 3, 0, 10, 10, 0, 0, + 0, 9, 0, 9, 0, 3, 0, 3, 0, 3, 13, 13, 13, 13, 3, 3, + 3, 0, 0, 0, 3, 3, 3, 9, 10, 10, 12, 12, 10, 10, 3, 3, + 0, 8, 0, 0, 0, 0, 12, 0, 12, 0, 0, 0, 8, 8, 0, 0, + 9, 0, 12, 9, 6, 9, 9, 9, 9, 9, 9, 13, 13, 0, 0, 0, + 3, 12, 12, 0, 9, 0, 3, 3, 0, 0, 14, 12, 14, 12, 0, 3, + 3, 3, 5, 0, 9, 3, 9, 0, 12, 12, 12, 12, 0, 0, 12, 12, + 9, 9, 12, 12, 3, 9, 9, 0, 0, 8, 0, 8, 7, 0, 7, 7, + 8, 0, 7, 0, 8, 0, 0, 0, 6, 6, 6, 6, 6, 6, 6, 5, + 3, 3, 5, 5, 0, 0, 0, 14, 14, 0, 0, 0, 13, 13, 13, 13, + 11, 0, 0, 0, 4, 4, 5, 5, 5, 5, 5, 6, 0, 13, 13, 0, + 12, 12, 0, 0, 0, 13, 13, 12, 0, 0, 0, 6, 5, 0, 5, 5, + 0, 13, 13, 7, 0, 0, 0, 8, 0, 0, 7, 8, 8, 8, 7, 7, + 8, 0, 8, 0, 8, 8, 0, 7, 9, 7, 0, 0, 0, 8, 7, 7, + 0, 0, 7, 0, 9, 9, 9, 8, 0, 0, 8, 8, 0, 0, 13, 13, + 8, 7, 7, 8, 7, 8, 7, 3, 7, 7, 0, 7, 0, 0, 12, 9, + 0, 0, 13, 0, 6, 14, 12, 0, 0, 13, 13, 13, 9, 9, 0, 12, + 9, 0, 12, 12, 8, 7, 9, 3, 3, 3, 0, 9, 7, 7, 3, 3, + 3, 3, 0, 12, 0, 0, 8, 7, 9, 0, 0, 8, 7, 8, 7, 9, + 7, 7, 7, 9, 9, 9, 3, 9, 0, 12, 12, 12, 0, 0, 9, 3, + 12, 12, 9, 9, 9, 3, 3, 0, 3, 3, 3, 12, 0, 0, 0, 7, + 0, 9, 3, 9, 9, 9, 13, 13, 14, 14, 0, 14, 0, 14, 14, 0, + 13, 0, 0, 13, 0, 14, 12, 12, 14, 13, 13, 13, 13, 13, 13, 0, + 9, 0, 0, 5, 0, 0, 14, 0, 0, 13, 0, 13, 13, 12, 13, 13, + 14, 0, 9, 9, 0, 5, 5, 5, 0, 5, 12, 12, 3, 0, 10, 10, + 9, 12, 12, 0, 3, 12, 0, 0, 10, 10, 9, 0, 12, 12, 0, 12, + 9, 12, 0, 0, 3, 0, 12, 12, 0, 3, 3, 12, 3, 3, 3, 5, + 5, 5, 5, 3, 0, 8, 8, 0, 8, 0, 7, 7, +}; + +/* Sentence_Break: 6372 bytes. 
*/ + +RE_UINT32 re_get_sentence_break(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 12; + code = ch ^ (f << 12); + pos = (RE_UINT32)re_sentence_break_stage_1[f] << 4; + f = code >> 8; + code ^= f << 8; + pos = (RE_UINT32)re_sentence_break_stage_2[pos + f] << 3; + f = code >> 5; + code ^= f << 5; + pos = (RE_UINT32)re_sentence_break_stage_3[pos + f] << 3; + f = code >> 2; + code ^= f << 2; + pos = (RE_UINT32)re_sentence_break_stage_4[pos + f] << 2; + value = re_sentence_break_stage_5[pos + code]; + + return value; +} + +/* Math. */ + +static RE_UINT8 re_math_stage_1[] = { + 0, 1, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, +}; + +static RE_UINT8 re_math_stage_2[] = { + 0, 1, 1, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 6, 1, 1, +}; + +static RE_UINT8 re_math_stage_3[] = { + 0, 1, 1, 2, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 4, 5, 6, 7, 1, 8, 9, 10, 1, 6, 6, 11, 1, 1, 1, 1, + 1, 1, 1, 12, 1, 1, 13, 14, 1, 1, 1, 1, 15, 16, 17, 18, + 1, 1, 1, 1, 1, 1, 19, 1, +}; + +static RE_UINT8 re_math_stage_4[] = { + 0, 1, 2, 3, 0, 4, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, + 9, 10, 11, 12, 13, 0, 14, 15, 16, 17, 18, 0, 19, 20, 21, 22, + 23, 23, 23, 23, 23, 23, 23, 23, 24, 25, 0, 26, 27, 28, 29, 30, + 0, 0, 0, 0, 0, 31, 32, 33, 34, 0, 35, 36, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 23, 23, 0, 19, 37, 0, 0, 0, 0, 0, + 0, 38, 0, 0, 0, 0, 0, 0, 0, 0, 0, 39, 0, 0, 0, 0, + 1, 3, 3, 0, 0, 0, 0, 40, 23, 23, 41, 23, 42, 43, 44, 23, + 45, 46, 47, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 48, 23, 23, + 23, 23, 23, 23, 23, 23, 49, 23, 44, 50, 51, 52, 53, 54, 0, 55, +}; + +static RE_UINT8 re_math_stage_5[] = { + 0, 0, 0, 0, 0, 8, 0, 112, 0, 0, 0, 64, 0, 0, 0, 80, + 0, 16, 2, 0, 0, 0, 128, 0, 0, 0, 39, 0, 0, 0, 115, 0, + 
192, 1, 0, 0, 0, 0, 64, 0, 0, 0, 28, 0, 17, 0, 4, 0, + 30, 0, 0, 124, 0, 124, 0, 0, 0, 0, 255, 31, 98, 248, 0, 0, + 132, 252, 47, 63, 16, 179, 251, 241, 255, 11, 0, 0, 0, 0, 255, 255, + 255, 126, 195, 240, 255, 255, 255, 47, 48, 0, 240, 255, 255, 255, 255, 255, + 0, 15, 0, 0, 3, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 248, + 255, 255, 191, 0, 0, 0, 1, 240, 7, 0, 0, 0, 3, 192, 255, 240, + 195, 140, 15, 0, 148, 31, 0, 255, 96, 0, 0, 0, 5, 0, 0, 0, + 15, 224, 0, 0, 159, 31, 0, 0, 0, 2, 0, 0, 126, 1, 0, 0, + 4, 30, 0, 0, 255, 255, 223, 255, 255, 255, 255, 223, 100, 222, 255, 235, + 239, 255, 255, 255, 191, 231, 223, 223, 255, 255, 255, 123, 95, 252, 253, 255, + 63, 255, 255, 255, 255, 207, 255, 255, 150, 254, 247, 10, 132, 234, 150, 170, + 150, 247, 247, 94, 255, 251, 255, 15, 238, 251, 255, 15, 0, 0, 3, 0, +}; + +/* Math: 538 bytes. */ + +RE_UINT32 re_get_math(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 15; + code = ch ^ (f << 15); + pos = (RE_UINT32)re_math_stage_1[f] << 4; + f = code >> 11; + code ^= f << 11; + pos = (RE_UINT32)re_math_stage_2[pos + f] << 3; + f = code >> 8; + code ^= f << 8; + pos = (RE_UINT32)re_math_stage_3[pos + f] << 3; + f = code >> 5; + code ^= f << 5; + pos = (RE_UINT32)re_math_stage_4[pos + f] << 5; + pos += code; + value = (re_math_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Alphabetic. 
*/ + +static RE_UINT8 re_alphabetic_stage_1[] = { + 0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, +}; + +static RE_UINT8 re_alphabetic_stage_2[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 9, 10, 11, 7, 7, 7, 7, 12, 13, 13, 13, 13, 14, + 15, 16, 17, 18, 19, 13, 20, 13, 21, 13, 13, 13, 13, 22, 13, 13, + 13, 13, 13, 13, 13, 13, 23, 24, 13, 13, 25, 13, 13, 26, 27, 13, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 28, 7, 29, 30, 7, 31, 13, 13, 13, 13, 13, 32, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, +}; + +static RE_UINT8 re_alphabetic_stage_3[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 1, 17, 18, 19, 1, 20, 21, 22, 23, 24, 25, 26, 27, 1, 28, + 29, 30, 31, 31, 32, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, 31, + 36, 37, 31, 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 38, 1, 1, 1, 1, 1, 1, 1, 1, 1, 39, + 1, 1, 1, 1, 40, 1, 41, 42, 43, 44, 45, 46, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 47, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 1, 48, 49, 1, 50, 51, 52, 53, 54, 55, 56, 57, 58, 1, 59, + 60, 61, 62, 63, 64, 31, 31, 31, 65, 66, 67, 68, 69, 70, 71, 72, + 73, 31, 74, 31, 31, 31, 31, 31, 1, 1, 1, 75, 76, 77, 31, 31, + 1, 1, 1, 1, 78, 31, 31, 31, 31, 31, 31, 31, 1, 1, 79, 31, + 1, 1, 80, 81, 31, 31, 31, 82, 83, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 84, 31, 31, 31, 31, 31, 31, 31, 85, 86, 87, 88, + 89, 31, 31, 31, 31, 31, 90, 31, 31, 91, 31, 31, 31, 31, 31, 31, + 1, 1, 1, 1, 1, 1, 92, 1, 1, 1, 1, 1, 1, 1, 1, 93, + 94, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 95, 31, + 1, 1, 96, 31, 31, 31, 31, 31, +}; + +static RE_UINT8 re_alphabetic_stage_4[] = { + 0, 0, 1, 1, 0, 2, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 5, 6, 0, 0, 7, 8, 9, 10, 4, 11, + 4, 4, 4, 4, 12, 4, 4, 4, 4, 13, 14, 15, 16, 17, 18, 19, + 20, 4, 21, 22, 4, 4, 23, 24, 25, 4, 26, 4, 4, 27, 28, 29, + 30, 31, 32, 0, 0, 33, 0, 34, 4, 35, 36, 37, 
38, 39, 40, 41, + 42, 43, 44, 45, 46, 47, 48, 49, 50, 47, 51, 52, 53, 54, 55, 0, + 56, 57, 58, 59, 60, 61, 62, 63, 60, 64, 65, 66, 67, 68, 69, 70, + 15, 71, 72, 0, 73, 74, 75, 0, 76, 0, 77, 78, 79, 80, 0, 0, + 4, 81, 25, 82, 83, 4, 84, 85, 4, 4, 86, 4, 87, 88, 89, 4, + 90, 4, 91, 0, 92, 4, 4, 93, 15, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 94, 1, 4, 4, 95, 96, 97, 97, 98, 4, 99, 100, 0, + 0, 4, 4, 101, 4, 102, 4, 103, 104, 105, 25, 106, 4, 107, 108, 0, + 109, 4, 104, 110, 0, 111, 0, 0, 4, 112, 113, 0, 4, 114, 4, 115, + 4, 103, 116, 117, 0, 0, 0, 118, 4, 4, 4, 4, 4, 4, 0, 119, + 93, 4, 120, 117, 4, 121, 122, 123, 0, 0, 0, 124, 125, 0, 0, 0, + 126, 127, 128, 4, 129, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 130, 4, 108, 4, 131, 104, 4, 4, 4, 4, 132, + 4, 84, 4, 133, 134, 135, 135, 4, 0, 136, 0, 0, 0, 0, 0, 0, + 137, 138, 15, 4, 139, 15, 4, 85, 140, 141, 4, 4, 142, 71, 0, 25, + 4, 4, 4, 4, 4, 103, 0, 0, 4, 4, 4, 4, 4, 4, 103, 0, + 4, 4, 4, 4, 31, 0, 25, 117, 143, 144, 4, 145, 4, 4, 4, 92, + 146, 147, 4, 4, 148, 149, 0, 146, 150, 16, 4, 97, 4, 4, 59, 151, + 28, 102, 152, 80, 4, 153, 136, 154, 4, 134, 155, 156, 4, 104, 157, 158, + 159, 160, 85, 161, 4, 4, 4, 162, 4, 4, 4, 4, 4, 163, 164, 109, + 4, 4, 4, 165, 4, 4, 166, 0, 167, 168, 169, 4, 4, 27, 170, 4, + 4, 117, 25, 4, 171, 4, 16, 172, 0, 0, 0, 173, 4, 4, 4, 80, + 0, 1, 1, 174, 4, 104, 175, 0, 176, 177, 178, 0, 4, 4, 4, 71, + 0, 0, 4, 33, 0, 0, 0, 0, 0, 0, 0, 0, 80, 4, 179, 0, + 4, 25, 102, 71, 117, 4, 180, 0, 4, 4, 4, 4, 117, 0, 0, 0, + 4, 181, 4, 59, 0, 0, 0, 0, 4, 134, 103, 16, 0, 0, 0, 0, + 182, 183, 103, 134, 104, 0, 0, 184, 103, 166, 0, 0, 4, 185, 0, 0, + 186, 97, 0, 80, 80, 0, 77, 187, 4, 103, 103, 152, 27, 0, 0, 0, + 4, 4, 129, 0, 4, 152, 4, 152, 4, 4, 188, 0, 147, 32, 25, 129, + 4, 152, 25, 189, 4, 4, 190, 0, 191, 192, 0, 0, 193, 194, 4, 129, + 38, 47, 195, 59, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 196, 0, + 0, 0, 0, 0, 4, 197, 198, 0, 4, 104, 199, 0, 4, 103, 0, 0, + 200, 162, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 
201, + 0, 0, 0, 0, 0, 0, 4, 32, 4, 4, 4, 4, 166, 0, 0, 0, + 4, 4, 4, 142, 4, 4, 4, 4, 4, 4, 59, 0, 0, 0, 0, 0, + 4, 142, 0, 0, 0, 0, 0, 0, 4, 4, 202, 0, 0, 0, 0, 0, + 4, 32, 104, 0, 0, 0, 25, 155, 4, 134, 59, 203, 92, 0, 0, 0, + 4, 4, 204, 104, 170, 0, 0, 0, 205, 0, 0, 0, 0, 0, 0, 0, + 4, 4, 4, 206, 207, 0, 0, 0, 4, 4, 208, 4, 209, 210, 211, 4, + 212, 213, 214, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 215, 216, 85, + 208, 208, 131, 131, 217, 217, 218, 0, 4, 4, 4, 4, 4, 4, 187, 0, + 211, 219, 220, 221, 222, 223, 0, 0, 0, 25, 224, 224, 108, 0, 0, 0, + 4, 4, 4, 4, 4, 4, 134, 0, 4, 33, 4, 4, 4, 4, 4, 4, + 117, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 205, 0, 0, + 117, 0, 0, 0, 0, 0, 0, 0, +}; + +static RE_UINT8 re_alphabetic_stage_5[] = { + 0, 0, 0, 0, 254, 255, 255, 7, 0, 4, 32, 4, 255, 255, 127, 255, + 255, 255, 255, 255, 195, 255, 3, 0, 31, 80, 0, 0, 32, 0, 0, 0, + 0, 0, 223, 188, 64, 215, 255, 255, 251, 255, 255, 255, 255, 255, 191, 255, + 3, 252, 255, 255, 255, 255, 254, 255, 255, 255, 127, 2, 254, 255, 255, 255, + 255, 0, 0, 0, 0, 0, 255, 191, 182, 0, 255, 255, 255, 7, 7, 0, + 0, 0, 255, 7, 255, 255, 255, 254, 0, 192, 255, 255, 255, 255, 239, 31, + 254, 225, 0, 156, 0, 0, 255, 255, 0, 224, 255, 255, 255, 255, 3, 0, + 0, 252, 255, 255, 255, 7, 48, 4, 255, 255, 255, 252, 255, 31, 0, 0, + 255, 255, 255, 1, 255, 255, 31, 0, 248, 3, 255, 255, 255, 255, 255, 239, + 255, 223, 225, 255, 15, 0, 254, 255, 239, 159, 249, 255, 255, 253, 197, 227, + 159, 89, 128, 176, 15, 0, 3, 0, 238, 135, 249, 255, 255, 253, 109, 195, + 135, 25, 2, 94, 0, 0, 63, 0, 238, 191, 251, 255, 255, 253, 237, 227, + 191, 27, 1, 0, 15, 0, 0, 2, 238, 159, 249, 255, 159, 25, 192, 176, + 15, 0, 2, 0, 236, 199, 61, 214, 24, 199, 255, 195, 199, 29, 129, 0, + 239, 223, 253, 255, 255, 253, 255, 227, 223, 29, 96, 7, 15, 0, 0, 0, + 238, 223, 253, 255, 255, 253, 239, 227, 223, 29, 96, 64, 15, 0, 6, 0, + 255, 255, 255, 231, 223, 93, 128, 128, 15, 0, 0, 252, 236, 255, 127, 252, + 255, 255, 251, 47, 127, 128, 95, 255, 0, 0, 12, 
0, 255, 255, 255, 7, + 127, 32, 0, 0, 150, 37, 240, 254, 174, 236, 255, 59, 95, 32, 0, 240, + 1, 0, 0, 0, 255, 254, 255, 255, 255, 31, 254, 255, 3, 255, 255, 254, + 255, 255, 255, 31, 255, 255, 127, 249, 231, 193, 255, 255, 127, 64, 0, 48, + 191, 32, 255, 255, 255, 255, 255, 247, 255, 61, 127, 61, 255, 61, 255, 255, + 255, 255, 61, 127, 61, 255, 127, 255, 255, 255, 61, 255, 255, 255, 255, 135, + 255, 255, 0, 0, 255, 255, 63, 63, 255, 159, 255, 255, 255, 199, 255, 1, + 255, 223, 15, 0, 255, 255, 15, 0, 255, 223, 13, 0, 255, 255, 207, 255, + 255, 1, 128, 16, 255, 255, 255, 0, 255, 7, 255, 255, 255, 255, 63, 0, + 255, 255, 255, 127, 255, 15, 255, 1, 255, 63, 31, 0, 255, 15, 255, 255, + 255, 3, 0, 0, 255, 255, 255, 15, 254, 255, 31, 0, 128, 0, 0, 0, + 255, 255, 239, 255, 239, 15, 0, 0, 255, 243, 0, 252, 191, 255, 3, 0, + 0, 224, 0, 252, 255, 255, 255, 63, 0, 222, 111, 0, 128, 255, 31, 0, + 63, 63, 255, 170, 255, 255, 223, 95, 220, 31, 207, 15, 255, 31, 220, 31, + 0, 0, 2, 128, 0, 0, 255, 31, 132, 252, 47, 62, 80, 189, 255, 243, + 224, 67, 0, 0, 255, 1, 0, 0, 0, 0, 192, 255, 255, 127, 255, 255, + 31, 120, 12, 0, 255, 128, 0, 0, 255, 255, 127, 0, 127, 127, 127, 127, + 0, 128, 0, 0, 224, 0, 0, 0, 254, 3, 62, 31, 255, 255, 127, 224, + 224, 255, 255, 255, 255, 63, 254, 255, 255, 127, 0, 0, 255, 31, 255, 255, + 0, 12, 0, 0, 255, 127, 240, 143, 0, 0, 128, 255, 252, 255, 255, 255, + 255, 249, 255, 255, 255, 63, 255, 0, 187, 247, 255, 255, 0, 0, 252, 40, + 255, 255, 7, 0, 255, 255, 247, 255, 223, 255, 0, 124, 255, 63, 0, 0, + 255, 255, 127, 196, 5, 0, 0, 56, 255, 255, 60, 0, 126, 126, 126, 0, + 127, 127, 255, 255, 63, 0, 255, 255, 255, 7, 0, 0, 15, 0, 255, 255, + 127, 248, 255, 255, 255, 63, 255, 255, 255, 255, 255, 3, 127, 0, 248, 224, + 255, 253, 127, 95, 219, 255, 255, 255, 0, 0, 248, 255, 255, 255, 252, 255, + 0, 0, 255, 15, 0, 0, 223, 255, 192, 255, 255, 255, 252, 252, 252, 28, + 255, 239, 255, 255, 127, 255, 255, 183, 255, 63, 255, 63, 255, 255, 1, 0, + 15, 255, 62, 0, 
255, 0, 255, 255, 63, 253, 255, 255, 255, 255, 191, 145, + 255, 255, 55, 0, 255, 255, 255, 192, 111, 240, 239, 254, 31, 0, 0, 0, + 63, 0, 0, 0, 255, 255, 71, 0, 30, 0, 0, 20, 255, 255, 251, 255, + 255, 255, 159, 0, 127, 189, 255, 191, 255, 1, 255, 255, 159, 25, 129, 224, + 179, 0, 0, 0, 255, 255, 63, 127, 0, 0, 0, 63, 17, 0, 0, 0, + 255, 255, 255, 227, 0, 0, 0, 128, 127, 0, 0, 0, 248, 255, 255, 224, + 31, 0, 255, 255, 3, 0, 0, 0, 255, 7, 255, 31, 255, 1, 255, 67, + 255, 255, 223, 255, 255, 255, 255, 223, 100, 222, 255, 235, 239, 255, 255, 255, + 191, 231, 223, 223, 255, 255, 255, 123, 95, 252, 253, 255, 63, 255, 255, 255, + 253, 255, 255, 247, 255, 253, 255, 255, 247, 15, 0, 0, 150, 254, 247, 10, + 132, 234, 150, 170, 150, 247, 247, 94, 255, 251, 255, 15, 238, 251, 255, 15, + 255, 3, 255, 255, +}; + +/* Alphabetic: 2085 bytes. */ + +RE_UINT32 re_get_alphabetic(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 16; + code = ch ^ (f << 16); + pos = (RE_UINT32)re_alphabetic_stage_1[f] << 5; + f = code >> 11; + code ^= f << 11; + pos = (RE_UINT32)re_alphabetic_stage_2[pos + f] << 3; + f = code >> 8; + code ^= f << 8; + pos = (RE_UINT32)re_alphabetic_stage_3[pos + f] << 3; + f = code >> 5; + code ^= f << 5; + pos = (RE_UINT32)re_alphabetic_stage_4[pos + f] << 5; + pos += code; + value = (re_alphabetic_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Lowercase. 
*/ + +static RE_UINT8 re_lowercase_stage_1[] = { + 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, +}; + +static RE_UINT8 re_lowercase_stage_2[] = { + 0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 5, + 6, 7, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 8, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, +}; + +static RE_UINT8 re_lowercase_stage_3[] = { + 0, 1, 2, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, 9, 10, + 11, 12, 6, 6, 13, 6, 6, 6, 6, 6, 6, 6, 14, 15, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 16, 17, 6, 6, 6, 18, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 19, 6, 6, 6, 20, + 6, 6, 6, 6, 21, 6, 6, 6, 6, 6, 6, 6, 22, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 23, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 24, 25, 26, 27, 6, 6, 6, 6, 6, 6, 6, 6, +}; + +static RE_UINT8 re_lowercase_stage_4[] = { + 0, 0, 0, 1, 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 5, 13, 14, 15, 16, 17, 18, 19, 0, 0, 20, 21, 22, 23, 24, 25, + 0, 26, 15, 5, 27, 5, 28, 5, 5, 29, 0, 30, 31, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, + 15, 15, 15, 15, 15, 15, 0, 0, 5, 5, 5, 5, 33, 5, 5, 5, + 34, 35, 36, 37, 35, 38, 39, 40, 0, 0, 0, 41, 42, 0, 0, 0, + 43, 44, 45, 26, 46, 0, 0, 0, 0, 0, 0, 0, 0, 0, 26, 47, + 0, 26, 48, 49, 5, 5, 5, 50, 15, 51, 0, 0, 0, 0, 0, 0, + 0, 0, 5, 52, 53, 0, 0, 0, 0, 54, 5, 55, 56, 57, 0, 58, + 0, 26, 59, 60, 15, 15, 0, 0, 61, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 0, 0, 0, 0, 0, 0, 62, 63, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 15, 64, 0, 0, 0, 0, 0, 0, 15, 0, + 65, 66, 67, 31, 68, 69, 70, 71, 72, 73, 74, 75, 76, 65, 66, 77, + 31, 68, 78, 63, 71, 79, 80, 81, 82, 78, 83, 26, 84, 71, 85, 0, +}; + +static RE_UINT8 re_lowercase_stage_5[] = { + 0, 0, 0, 0, 254, 255, 255, 7, 0, 4, 32, 4, 0, 0, 0, 128, + 255, 255, 127, 255, 170, 170, 170, 170, 170, 170, 170, 85, 85, 171, 170, 170, + 170, 170, 170, 212, 41, 49, 36, 78, 42, 45, 81, 230, 64, 82, 85, 181, + 170, 170, 41, 170, 170, 170, 250, 147, 133, 170, 255, 255, 
255, 255, 255, 255, + 255, 255, 239, 255, 255, 255, 255, 1, 3, 0, 0, 0, 31, 0, 0, 0, + 32, 0, 0, 0, 0, 0, 138, 60, 0, 0, 1, 0, 0, 240, 255, 255, + 255, 127, 227, 170, 170, 170, 47, 25, 0, 0, 255, 255, 2, 168, 170, 170, + 84, 213, 170, 170, 170, 170, 0, 0, 254, 255, 255, 255, 255, 0, 0, 0, + 0, 0, 0, 63, 170, 170, 234, 191, 255, 0, 63, 0, 255, 0, 255, 0, + 63, 0, 255, 0, 255, 0, 255, 63, 255, 0, 223, 64, 220, 0, 207, 0, + 255, 0, 220, 0, 0, 0, 2, 128, 0, 0, 255, 31, 0, 196, 8, 0, + 0, 128, 16, 50, 192, 67, 0, 0, 16, 0, 0, 0, 255, 3, 0, 0, + 255, 255, 255, 127, 98, 21, 218, 63, 26, 80, 8, 0, 191, 32, 0, 0, + 170, 42, 0, 0, 170, 170, 170, 58, 168, 170, 171, 170, 170, 170, 255, 149, + 170, 80, 186, 170, 170, 2, 160, 0, 0, 0, 0, 7, 255, 255, 255, 247, + 63, 0, 255, 255, 127, 0, 248, 0, 0, 255, 255, 255, 255, 255, 0, 0, + 255, 255, 7, 0, 0, 0, 0, 252, 255, 255, 15, 0, 0, 192, 223, 255, + 252, 255, 255, 15, 0, 0, 192, 235, 239, 255, 0, 0, 0, 252, 255, 255, + 15, 0, 0, 192, 255, 255, 255, 0, 0, 0, 252, 255, 255, 15, 0, 0, + 192, 255, 255, 255, 0, 192, 255, 255, 0, 0, 192, 255, 63, 0, 0, 0, + 252, 255, 255, 247, 3, 0, 0, 240, 255, 255, 223, 15, 255, 127, 63, 0, + 255, 253, 0, 0, 247, 11, 0, 0, +}; + +/* Lowercase: 777 bytes. */ + +RE_UINT32 re_get_lowercase(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 16; + code = ch ^ (f << 16); + pos = (RE_UINT32)re_lowercase_stage_1[f] << 4; + f = code >> 12; + code ^= f << 12; + pos = (RE_UINT32)re_lowercase_stage_2[pos + f] << 4; + f = code >> 8; + code ^= f << 8; + pos = (RE_UINT32)re_lowercase_stage_3[pos + f] << 3; + f = code >> 5; + code ^= f << 5; + pos = (RE_UINT32)re_lowercase_stage_4[pos + f] << 5; + pos += code; + value = (re_lowercase_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Uppercase. 
*/ + +static RE_UINT8 re_uppercase_stage_1[] = { + 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, +}; + +static RE_UINT8 re_uppercase_stage_2[] = { + 0, 1, 2, 3, 4, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 7, + 8, 9, 1, 10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 11, 1, 1, 1, 12, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +}; + +static RE_UINT8 re_uppercase_stage_3[] = { + 0, 1, 2, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 7, 6, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 9, 10, + 6, 11, 6, 6, 12, 6, 6, 6, 6, 6, 6, 6, 13, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 14, 15, 6, 6, 6, 6, 6, 6, 6, 16, + 6, 6, 6, 6, 17, 6, 6, 6, 6, 6, 6, 6, 18, 6, 6, 6, + 19, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 20, 21, 22, 23, + 6, 24, 6, 6, 6, 6, 6, 6, +}; + +static RE_UINT8 re_uppercase_stage_4[] = { + 0, 0, 1, 0, 0, 0, 2, 0, 3, 4, 5, 6, 7, 8, 9, 10, + 3, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0, 13, 14, 15, 16, 17, + 18, 19, 0, 3, 20, 3, 21, 3, 3, 22, 23, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 18, 24, 0, + 0, 0, 0, 0, 0, 18, 18, 25, 3, 3, 3, 3, 26, 3, 3, 3, + 27, 28, 29, 30, 0, 31, 32, 33, 34, 35, 36, 19, 37, 0, 0, 0, + 0, 0, 0, 0, 0, 38, 19, 0, 18, 39, 0, 40, 3, 3, 3, 41, + 0, 0, 3, 42, 43, 0, 0, 0, 0, 44, 3, 45, 46, 47, 0, 0, + 0, 1, 0, 0, 0, 0, 0, 0, 18, 48, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 18, 49, 0, 0, 0, 0, 0, 0, 0, 18, 0, 0, + 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 50, 51, 52, + 53, 63, 25, 56, 57, 53, 64, 65, 66, 67, 38, 39, 56, 68, 69, 0, + 0, 56, 70, 70, 57, 0, 0, 0, +}; + +static RE_UINT8 re_uppercase_stage_5[] = { + 0, 0, 0, 0, 254, 255, 255, 7, 255, 255, 127, 127, 85, 85, 85, 85, + 85, 85, 85, 170, 170, 84, 85, 85, 85, 85, 85, 43, 214, 206, 219, 177, + 213, 210, 174, 17, 144, 164, 170, 74, 85, 85, 210, 85, 85, 85, 5, 108, + 122, 85, 0, 0, 0, 0, 69, 128, 64, 215, 254, 255, 251, 15, 0, 0, + 0, 128, 28, 85, 85, 85, 144, 230, 255, 255, 255, 255, 255, 
255, 0, 0, + 1, 84, 85, 85, 171, 42, 85, 85, 85, 85, 254, 255, 255, 255, 127, 0, + 191, 32, 0, 0, 255, 255, 63, 0, 85, 85, 21, 64, 0, 255, 0, 63, + 0, 255, 0, 255, 0, 63, 0, 170, 0, 255, 0, 0, 0, 0, 0, 15, + 0, 15, 0, 15, 0, 31, 0, 15, 132, 56, 39, 62, 80, 61, 15, 192, + 32, 0, 0, 0, 8, 0, 0, 0, 0, 0, 192, 255, 255, 127, 0, 0, + 157, 234, 37, 192, 5, 40, 4, 0, 85, 21, 0, 0, 85, 85, 85, 5, + 84, 85, 84, 85, 85, 85, 0, 106, 85, 40, 69, 85, 85, 61, 95, 0, + 255, 0, 0, 0, 255, 255, 7, 0, 255, 255, 255, 3, 0, 0, 240, 255, + 255, 63, 0, 0, 0, 255, 255, 255, 3, 0, 0, 208, 100, 222, 63, 0, + 0, 0, 255, 255, 255, 3, 0, 0, 176, 231, 223, 31, 0, 0, 0, 123, + 95, 252, 1, 0, 0, 240, 255, 255, 63, 0, 0, 0, 3, 0, 0, 240, + 1, 0, 0, 0, 252, 255, 255, 7, 0, 0, 0, 240, 255, 255, 31, 0, + 255, 1, 0, 0, 0, 4, 0, 0, 255, 3, 255, 255, +}; + +/* Uppercase: 701 bytes. */ + +RE_UINT32 re_get_uppercase(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 16; + code = ch ^ (f << 16); + pos = (RE_UINT32)re_uppercase_stage_1[f] << 5; + f = code >> 11; + code ^= f << 11; + pos = (RE_UINT32)re_uppercase_stage_2[pos + f] << 3; + f = code >> 8; + code ^= f << 8; + pos = (RE_UINT32)re_uppercase_stage_3[pos + f] << 3; + f = code >> 5; + code ^= f << 5; + pos = (RE_UINT32)re_uppercase_stage_4[pos + f] << 5; + pos += code; + value = (re_uppercase_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Cased. 
*/ + +static RE_UINT8 re_cased_stage_1[] = { + 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, +}; + +static RE_UINT8 re_cased_stage_2[] = { + 0, 1, 2, 3, 4, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 6, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 8, + 9, 10, 1, 11, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 12, 1, 1, 1, 13, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +}; + +static RE_UINT8 re_cased_stage_3[] = { + 0, 1, 2, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 7, 6, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 9, 10, 11, + 12, 13, 6, 6, 14, 6, 6, 6, 6, 6, 6, 6, 15, 16, 6, 6, + 6, 6, 6, 6, 6, 6, 17, 18, 6, 6, 6, 19, 6, 6, 6, 6, + 6, 6, 6, 20, 6, 6, 6, 21, 6, 6, 6, 6, 22, 6, 6, 6, + 6, 6, 6, 6, 23, 6, 6, 6, 24, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 25, 26, 27, 28, 6, 29, 6, 6, 6, 6, 6, 6, +}; + +static RE_UINT8 re_cased_stage_4[] = { + 0, 0, 1, 1, 0, 2, 3, 3, 4, 4, 4, 4, 4, 5, 6, 4, + 4, 4, 4, 4, 7, 8, 9, 10, 0, 0, 11, 12, 13, 14, 4, 15, + 4, 4, 4, 4, 16, 4, 4, 4, 4, 17, 18, 19, 20, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 21, 0, + 0, 0, 0, 0, 0, 4, 4, 22, 4, 4, 4, 4, 4, 4, 0, 0, + 4, 4, 4, 4, 4, 4, 4, 4, 22, 4, 23, 24, 4, 25, 26, 27, + 0, 0, 0, 28, 29, 0, 0, 0, 30, 31, 32, 4, 33, 0, 0, 0, + 0, 0, 0, 0, 0, 34, 4, 35, 4, 36, 37, 4, 4, 4, 4, 38, + 4, 21, 0, 0, 0, 0, 0, 0, 0, 0, 4, 39, 24, 0, 0, 0, + 0, 40, 4, 4, 41, 42, 0, 43, 0, 44, 5, 45, 4, 4, 0, 0, + 46, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, + 4, 4, 47, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 48, 4, 48, + 0, 0, 0, 0, 0, 4, 4, 0, 4, 4, 49, 4, 50, 51, 52, 4, + 53, 54, 55, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 56, 57, 5, + 49, 49, 36, 36, 58, 58, 59, 0, 0, 44, 60, 60, 35, 0, 0, 0, +}; + +static RE_UINT8 re_cased_stage_5[] = { + 0, 0, 0, 0, 254, 255, 255, 7, 0, 4, 32, 4, 255, 255, 127, 255, + 255, 255, 255, 255, 255, 255, 255, 247, 240, 255, 255, 255, 255, 255, 239, 255, + 255, 255, 255, 1, 3, 0, 0, 0, 31, 0, 0, 0, 32, 0, 0, 0, + 0, 0, 207, 188, 64, 
215, 255, 255, 251, 255, 255, 255, 255, 255, 191, 255, + 3, 252, 255, 255, 255, 255, 254, 255, 255, 255, 127, 0, 254, 255, 255, 255, + 255, 0, 0, 0, 191, 32, 0, 0, 255, 255, 63, 63, 63, 63, 255, 170, + 255, 255, 255, 63, 255, 255, 223, 95, 220, 31, 207, 15, 255, 31, 220, 31, + 0, 0, 2, 128, 0, 0, 255, 31, 132, 252, 47, 62, 80, 189, 31, 242, + 224, 67, 0, 0, 24, 0, 0, 0, 0, 0, 192, 255, 255, 3, 0, 0, + 255, 127, 255, 255, 255, 255, 255, 127, 31, 120, 12, 0, 255, 63, 0, 0, + 252, 255, 255, 255, 255, 120, 255, 255, 255, 63, 255, 0, 0, 0, 0, 7, + 0, 0, 255, 255, 63, 0, 255, 255, 127, 0, 248, 0, 255, 255, 0, 0, + 255, 255, 7, 0, 255, 255, 223, 255, 255, 255, 255, 223, 100, 222, 255, 235, + 239, 255, 255, 255, 191, 231, 223, 223, 255, 255, 255, 123, 95, 252, 253, 255, + 63, 255, 255, 255, 253, 255, 255, 247, 255, 253, 255, 255, 247, 15, 0, 0, + 255, 3, 255, 255, +}; + +/* Cased: 709 bytes. */ + +RE_UINT32 re_get_cased(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 16; + code = ch ^ (f << 16); + pos = (RE_UINT32)re_cased_stage_1[f] << 5; + f = code >> 11; + code ^= f << 11; + pos = (RE_UINT32)re_cased_stage_2[pos + f] << 3; + f = code >> 8; + code ^= f << 8; + pos = (RE_UINT32)re_cased_stage_3[pos + f] << 3; + f = code >> 5; + code ^= f << 5; + pos = (RE_UINT32)re_cased_stage_4[pos + f] << 5; + pos += code; + value = (re_cased_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Case_Ignorable. 
*/ + +static RE_UINT8 re_case_ignorable_stage_1[] = { + 0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, + 4, 4, +}; + +static RE_UINT8 re_case_ignorable_stage_2[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 8, 9, 7, 7, 7, 7, 7, 7, 7, 7, 7, 10, + 11, 12, 13, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 14, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 15, 7, 7, 16, 17, 7, 18, 19, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 20, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, +}; + +static RE_UINT8 re_case_ignorable_stage_3[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 1, 1, 17, 1, 1, 1, 18, 19, 20, 21, 22, 23, 24, 1, 25, + 26, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 27, 28, 29, 1, + 30, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 31, 1, 1, 1, 32, 1, 33, 34, 35, 36, 37, 38, 1, 1, 1, 1, + 1, 1, 1, 39, 1, 1, 40, 41, 1, 42, 43, 44, 1, 1, 1, 1, + 1, 1, 45, 1, 1, 1, 1, 1, 46, 47, 48, 49, 50, 51, 52, 53, + 1, 1, 54, 55, 1, 1, 1, 56, 1, 1, 1, 1, 57, 1, 1, 1, + 1, 58, 59, 1, 1, 1, 1, 1, 1, 1, 60, 1, 1, 1, 1, 1, + 61, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 62, 1, 1, 1, 1, + 63, 64, 1, 1, 1, 1, 1, 1, +}; + +static RE_UINT8 re_case_ignorable_stage_4[] = { + 0, 1, 2, 3, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 5, 6, 6, 6, 6, 6, 7, 8, 0, 0, 0, + 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 10, 0, 11, 12, 13, 14, + 15, 0, 16, 17, 0, 0, 18, 19, 20, 5, 21, 0, 0, 22, 0, 23, + 24, 25, 26, 0, 0, 0, 0, 27, 28, 29, 30, 31, 32, 33, 34, 35, + 36, 33, 37, 38, 36, 33, 39, 35, 32, 40, 41, 35, 42, 0, 43, 0, + 3, 44, 45, 35, 32, 40, 46, 35, 32, 0, 34, 35, 0, 0, 47, 0, + 0, 48, 49, 0, 0, 50, 51, 0, 52, 53, 0, 54, 55, 56, 57, 0, + 0, 58, 59, 60, 61, 0, 0, 33, 0, 0, 62, 0, 0, 0, 0, 0, + 63, 63, 64, 64, 0, 65, 66, 0, 67, 0, 68, 0, 0, 69, 0, 0, + 0, 70, 0, 0, 0, 0, 0, 0, 71, 0, 72, 73, 0, 74, 0, 0, + 75, 76, 42, 77, 78, 79, 0, 80, 0, 81, 0, 82, 0, 0, 83, 84, + 0, 85, 6, 86, 87, 6, 6, 88, 0, 0, 0, 0, 0, 89, 90, 91, + 92, 93, 0, 94, 95, 0, 5, 96, 
0, 0, 0, 97, 0, 0, 0, 98, + 0, 0, 0, 99, 0, 0, 0, 6, 0, 100, 0, 0, 0, 0, 0, 0, + 101, 102, 0, 0, 103, 0, 0, 104, 105, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 82, 106, 0, 0, 107, 108, 0, 0, 109, + 6, 78, 0, 17, 110, 0, 0, 52, 111, 112, 0, 0, 0, 0, 113, 114, + 0, 115, 116, 0, 28, 117, 100, 112, 0, 118, 119, 120, 0, 121, 122, 123, + 0, 0, 87, 0, 0, 0, 0, 124, 2, 0, 0, 0, 0, 125, 78, 0, + 126, 127, 128, 0, 0, 0, 0, 129, 1, 2, 3, 17, 44, 0, 0, 130, + 0, 0, 0, 0, 0, 0, 0, 131, 0, 0, 0, 0, 0, 0, 0, 3, + 0, 0, 0, 132, 0, 0, 0, 0, 133, 134, 0, 0, 0, 0, 0, 112, + 32, 135, 136, 129, 78, 137, 0, 0, 28, 138, 0, 139, 78, 140, 141, 0, + 0, 142, 0, 0, 0, 0, 129, 143, 78, 33, 3, 144, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 145, 146, 0, 0, 0, 0, 0, 0, 147, 148, 0, + 0, 149, 3, 0, 0, 150, 0, 0, 62, 151, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 152, 0, 153, 75, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 154, 0, 0, 0, 0, 0, 0, 0, 155, 75, 0, 0, + 0, 0, 0, 156, 157, 158, 0, 0, 0, 0, 159, 0, 0, 0, 0, 0, + 6, 160, 6, 161, 162, 163, 0, 0, 0, 0, 0, 0, 0, 0, 153, 0, + 0, 0, 0, 0, 0, 0, 0, 87, 32, 6, 6, 6, 0, 0, 0, 0, + 6, 6, 6, 6, 6, 6, 6, 127, +}; + +static RE_UINT8 re_case_ignorable_stage_5[] = { + 0, 0, 0, 0, 128, 64, 0, 4, 0, 0, 0, 64, 1, 0, 0, 0, + 0, 161, 144, 1, 0, 0, 255, 255, 255, 255, 255, 255, 255, 255, 48, 4, + 176, 0, 0, 0, 248, 3, 0, 0, 0, 0, 0, 2, 0, 0, 254, 255, + 255, 255, 255, 191, 182, 0, 0, 0, 0, 0, 16, 0, 63, 0, 255, 23, + 1, 248, 255, 255, 0, 0, 1, 0, 0, 0, 192, 191, 255, 61, 0, 0, + 0, 128, 2, 0, 255, 7, 0, 0, 192, 255, 1, 0, 0, 248, 63, 4, + 0, 0, 192, 255, 255, 63, 0, 0, 0, 0, 0, 14, 248, 255, 255, 255, + 7, 0, 0, 0, 0, 0, 0, 20, 254, 33, 254, 0, 12, 0, 2, 0, + 2, 0, 0, 0, 0, 0, 0, 16, 30, 32, 0, 0, 12, 0, 0, 0, + 6, 0, 0, 0, 134, 57, 2, 0, 0, 0, 35, 0, 190, 33, 0, 0, + 0, 0, 0, 144, 30, 32, 64, 0, 4, 0, 0, 0, 1, 32, 0, 0, + 0, 0, 0, 192, 193, 61, 96, 0, 64, 48, 0, 0, 0, 4, 92, 0, + 0, 0, 242, 7, 192, 127, 0, 0, 0, 0, 242, 27, 64, 63, 0, 0, + 0, 0, 0, 3, 0, 0, 160, 2, 0, 0, 254, 127, 
223, 224, 255, 254, + 255, 255, 255, 31, 64, 0, 0, 0, 0, 224, 253, 102, 0, 0, 0, 195, + 1, 0, 30, 0, 100, 32, 0, 32, 0, 0, 0, 224, 0, 0, 28, 0, + 0, 0, 12, 0, 0, 0, 176, 63, 64, 254, 143, 32, 0, 120, 0, 0, + 8, 0, 0, 0, 0, 2, 0, 0, 135, 1, 4, 14, 0, 0, 128, 9, + 0, 0, 64, 127, 229, 31, 248, 159, 128, 0, 255, 127, 15, 0, 0, 0, + 0, 0, 208, 23, 0, 248, 15, 0, 3, 0, 0, 0, 60, 59, 0, 0, + 64, 163, 3, 0, 0, 240, 207, 0, 0, 0, 0, 63, 0, 0, 247, 255, + 253, 33, 16, 3, 0, 240, 255, 255, 255, 7, 0, 1, 0, 0, 0, 248, + 255, 255, 63, 240, 0, 0, 0, 160, 3, 224, 0, 224, 0, 224, 0, 96, + 0, 248, 0, 3, 144, 124, 0, 0, 223, 255, 2, 128, 0, 0, 255, 31, + 255, 255, 1, 0, 0, 0, 0, 48, 0, 128, 3, 0, 0, 128, 0, 128, + 0, 128, 0, 0, 32, 0, 0, 0, 0, 60, 62, 8, 0, 0, 0, 126, + 0, 0, 0, 112, 0, 0, 32, 0, 0, 16, 0, 0, 0, 128, 247, 191, + 0, 0, 0, 240, 0, 0, 3, 0, 0, 7, 0, 0, 68, 8, 0, 0, + 96, 0, 0, 0, 16, 0, 0, 0, 255, 255, 3, 0, 192, 63, 0, 0, + 128, 255, 3, 0, 0, 0, 200, 19, 0, 126, 102, 0, 8, 16, 0, 0, + 0, 0, 1, 16, 0, 0, 157, 193, 2, 0, 0, 32, 0, 48, 88, 0, + 32, 33, 0, 0, 0, 0, 252, 255, 255, 255, 8, 0, 255, 255, 0, 0, + 0, 0, 36, 0, 0, 0, 0, 128, 8, 0, 0, 14, 0, 0, 0, 32, + 0, 0, 192, 7, 110, 240, 0, 0, 0, 0, 0, 135, 0, 0, 0, 255, + 127, 0, 0, 0, 0, 0, 120, 38, 128, 239, 31, 0, 0, 0, 8, 0, + 0, 0, 192, 127, 0, 28, 0, 0, 0, 128, 211, 0, 248, 7, 0, 0, + 192, 31, 31, 0, 0, 0, 248, 133, 13, 0, 0, 0, 0, 0, 60, 176, + 1, 0, 0, 48, 0, 0, 248, 167, 0, 40, 191, 0, 188, 15, 0, 0, + 0, 0, 31, 0, 0, 0, 127, 0, 0, 128, 255, 255, 0, 0, 0, 96, + 128, 3, 248, 255, 231, 15, 0, 0, 0, 60, 0, 0, 28, 0, 0, 0, + 255, 255, 127, 248, 255, 31, 32, 0, 16, 0, 0, 248, 254, 255, 0, 0, +}; + +/* Case_Ignorable: 1474 bytes. 
*/ + +RE_UINT32 re_get_case_ignorable(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 15; + code = ch ^ (f << 15); + pos = (RE_UINT32)re_case_ignorable_stage_1[f] << 4; + f = code >> 11; + code ^= f << 11; + pos = (RE_UINT32)re_case_ignorable_stage_2[pos + f] << 3; + f = code >> 8; + code ^= f << 8; + pos = (RE_UINT32)re_case_ignorable_stage_3[pos + f] << 3; + f = code >> 5; + code ^= f << 5; + pos = (RE_UINT32)re_case_ignorable_stage_4[pos + f] << 5; + pos += code; + value = (re_case_ignorable_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Changes_When_Lowercased. */ + +static RE_UINT8 re_changes_when_lowercased_stage_1[] = { + 0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, +}; + +static RE_UINT8 re_changes_when_lowercased_stage_2[] = { + 0, 1, 2, 3, 4, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 7, + 8, 9, 1, 10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +}; + +static RE_UINT8 re_changes_when_lowercased_stage_3[] = { + 0, 1, 2, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 7, 6, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 9, 10, + 6, 11, 6, 6, 12, 6, 6, 6, 6, 6, 6, 6, 13, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 14, 15, 6, 6, 6, 6, 6, 6, 6, 16, + 6, 6, 6, 6, 17, 6, 6, 6, 6, 6, 6, 6, 18, 6, 6, 6, + 19, 6, 6, 6, 6, 6, 6, 6, +}; + +static RE_UINT8 re_changes_when_lowercased_stage_4[] = { + 0, 0, 1, 0, 0, 0, 2, 0, 3, 4, 5, 6, 7, 8, 9, 10, + 3, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0, 13, 14, 15, 16, 17, + 18, 19, 0, 3, 20, 3, 21, 3, 3, 22, 23, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 18, 24, 0, + 0, 0, 0, 0, 0, 18, 18, 25, 3, 3, 3, 3, 26, 3, 3, 3, + 27, 28, 29, 30, 28, 31, 32, 33, 0, 34, 0, 19, 35, 0, 0, 0, + 0, 0, 0, 0, 0, 36, 19, 0, 18, 37, 0, 38, 3, 3, 3, 39, + 0, 0, 3, 40, 41, 0, 0, 0, 0, 42, 3, 43, 44, 45, 0, 0, + 0, 1, 0, 0, 0, 0, 0, 0, 18, 46, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 
0, 18, 47, 0, 0, 0, 0, 0, 0, 0, 18, 0, 0, +}; + +static RE_UINT8 re_changes_when_lowercased_stage_5[] = { + 0, 0, 0, 0, 254, 255, 255, 7, 255, 255, 127, 127, 85, 85, 85, 85, + 85, 85, 85, 170, 170, 84, 85, 85, 85, 85, 85, 43, 214, 206, 219, 177, + 213, 210, 174, 17, 176, 173, 170, 74, 85, 85, 214, 85, 85, 85, 5, 108, + 122, 85, 0, 0, 0, 0, 69, 128, 64, 215, 254, 255, 251, 15, 0, 0, + 0, 128, 0, 85, 85, 85, 144, 230, 255, 255, 255, 255, 255, 255, 0, 0, + 1, 84, 85, 85, 171, 42, 85, 85, 85, 85, 254, 255, 255, 255, 127, 0, + 191, 32, 0, 0, 255, 255, 63, 0, 85, 85, 21, 64, 0, 255, 0, 63, + 0, 255, 0, 255, 0, 63, 0, 170, 0, 255, 0, 0, 0, 255, 0, 31, + 0, 31, 0, 15, 0, 31, 0, 31, 64, 12, 4, 0, 8, 0, 0, 0, + 0, 0, 192, 255, 255, 127, 0, 0, 157, 234, 37, 192, 5, 40, 4, 0, + 85, 21, 0, 0, 85, 85, 85, 5, 84, 85, 84, 85, 85, 85, 0, 106, + 85, 40, 69, 85, 85, 61, 95, 0, 255, 0, 0, 0, 255, 255, 7, 0, +}; + +/* Changes_When_Lowercased: 538 bytes. */ + +RE_UINT32 re_get_changes_when_lowercased(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 15; + code = ch ^ (f << 15); + pos = (RE_UINT32)re_changes_when_lowercased_stage_1[f] << 4; + f = code >> 11; + code ^= f << 11; + pos = (RE_UINT32)re_changes_when_lowercased_stage_2[pos + f] << 3; + f = code >> 8; + code ^= f << 8; + pos = (RE_UINT32)re_changes_when_lowercased_stage_3[pos + f] << 3; + f = code >> 5; + code ^= f << 5; + pos = (RE_UINT32)re_changes_when_lowercased_stage_4[pos + f] << 5; + pos += code; + value = (re_changes_when_lowercased_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Changes_When_Uppercased. 
*/ + +static RE_UINT8 re_changes_when_uppercased_stage_1[] = { + 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, +}; + +static RE_UINT8 re_changes_when_uppercased_stage_2[] = { + 0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 5, + 6, 7, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, +}; + +static RE_UINT8 re_changes_when_uppercased_stage_3[] = { + 0, 1, 2, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, 9, 10, + 6, 11, 6, 6, 12, 6, 6, 6, 6, 6, 6, 6, 13, 14, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 15, 16, 6, 6, 6, 17, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 18, 6, 6, 6, 19, + 6, 6, 6, 6, 20, 6, 6, 6, 6, 6, 6, 6, 21, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 22, 6, 6, 6, 6, 6, 6, 6, +}; + +static RE_UINT8 re_changes_when_uppercased_stage_4[] = { + 0, 0, 0, 1, 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 5, 13, 14, 15, 16, 0, 0, 0, 0, 0, 17, 18, 19, 20, 21, 22, + 0, 23, 24, 5, 25, 5, 26, 5, 5, 27, 0, 28, 29, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 30, + 0, 0, 0, 31, 0, 0, 0, 0, 5, 5, 5, 5, 32, 5, 5, 5, + 33, 34, 35, 36, 24, 37, 38, 39, 0, 0, 40, 23, 41, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 23, 42, 0, 23, 43, 44, 5, 5, 5, 45, + 24, 46, 0, 0, 0, 0, 0, 0, 0, 0, 5, 47, 48, 0, 0, 0, + 0, 49, 5, 50, 51, 52, 0, 0, 0, 0, 53, 23, 24, 24, 0, 0, + 54, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, + 0, 55, 56, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 24, 57, + 0, 0, 0, 0, 0, 0, 24, 0, +}; + +static RE_UINT8 re_changes_when_uppercased_stage_5[] = { + 0, 0, 0, 0, 254, 255, 255, 7, 0, 0, 32, 0, 0, 0, 0, 128, + 255, 255, 127, 255, 170, 170, 170, 170, 170, 170, 170, 84, 85, 171, 170, 170, + 170, 170, 170, 212, 41, 17, 36, 70, 42, 33, 81, 162, 96, 91, 85, 181, + 170, 170, 45, 170, 168, 170, 10, 144, 133, 170, 223, 26, 107, 155, 38, 32, + 137, 31, 4, 96, 32, 0, 0, 0, 0, 0, 138, 56, 0, 0, 1, 0, + 0, 240, 255, 255, 255, 127, 227, 170, 170, 170, 47, 9, 0, 0, 255, 255, + 255, 
255, 255, 255, 2, 168, 170, 170, 84, 213, 170, 170, 170, 170, 0, 0, + 254, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 63, 0, 0, 0, 34, + 170, 170, 234, 15, 255, 0, 63, 0, 255, 0, 255, 0, 63, 0, 255, 0, + 255, 0, 255, 63, 255, 255, 223, 80, 220, 16, 207, 0, 255, 0, 220, 16, + 0, 64, 0, 0, 16, 0, 0, 0, 255, 3, 0, 0, 255, 255, 255, 127, + 98, 21, 72, 0, 10, 80, 8, 0, 191, 32, 0, 0, 170, 42, 0, 0, + 170, 170, 170, 10, 168, 170, 168, 170, 170, 170, 0, 148, 170, 16, 138, 170, + 170, 2, 160, 0, 0, 0, 8, 0, 127, 0, 248, 0, 0, 255, 255, 255, + 255, 255, 0, 0, 255, 255, 7, 0, +}; + +/* Changes_When_Uppercased: 609 bytes. */ + +RE_UINT32 re_get_changes_when_uppercased(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 16; + code = ch ^ (f << 16); + pos = (RE_UINT32)re_changes_when_uppercased_stage_1[f] << 4; + f = code >> 12; + code ^= f << 12; + pos = (RE_UINT32)re_changes_when_uppercased_stage_2[pos + f] << 4; + f = code >> 8; + code ^= f << 8; + pos = (RE_UINT32)re_changes_when_uppercased_stage_3[pos + f] << 3; + f = code >> 5; + code ^= f << 5; + pos = (RE_UINT32)re_changes_when_uppercased_stage_4[pos + f] << 5; + pos += code; + value = (re_changes_when_uppercased_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Changes_When_Titlecased. 
*/ + +static RE_UINT8 re_changes_when_titlecased_stage_1[] = { + 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, +}; + +static RE_UINT8 re_changes_when_titlecased_stage_2[] = { + 0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 5, + 6, 7, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, +}; + +static RE_UINT8 re_changes_when_titlecased_stage_3[] = { + 0, 1, 2, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, 9, 10, + 6, 11, 6, 6, 12, 6, 6, 6, 6, 6, 6, 6, 13, 14, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 15, 16, 6, 6, 6, 17, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 18, 6, 6, 6, 19, + 6, 6, 6, 6, 20, 6, 6, 6, 6, 6, 6, 6, 21, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 22, 6, 6, 6, 6, 6, 6, 6, +}; + +static RE_UINT8 re_changes_when_titlecased_stage_4[] = { + 0, 0, 0, 1, 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 5, 13, 14, 15, 16, 0, 0, 0, 0, 0, 17, 18, 19, 20, 21, 22, + 0, 23, 24, 5, 25, 5, 26, 5, 5, 27, 0, 28, 29, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 30, + 0, 0, 0, 31, 0, 0, 0, 0, 5, 5, 5, 5, 32, 5, 5, 5, + 33, 34, 35, 36, 34, 37, 38, 39, 0, 0, 40, 23, 41, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 23, 42, 0, 23, 43, 44, 5, 5, 5, 45, + 24, 46, 0, 0, 0, 0, 0, 0, 0, 0, 5, 47, 48, 0, 0, 0, + 0, 49, 5, 50, 51, 52, 0, 0, 0, 0, 53, 23, 24, 24, 0, 0, + 54, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, + 0, 55, 56, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 24, 57, + 0, 0, 0, 0, 0, 0, 24, 0, +}; + +static RE_UINT8 re_changes_when_titlecased_stage_5[] = { + 0, 0, 0, 0, 254, 255, 255, 7, 0, 0, 32, 0, 0, 0, 0, 128, + 255, 255, 127, 255, 170, 170, 170, 170, 170, 170, 170, 84, 85, 171, 170, 170, + 170, 170, 170, 212, 41, 17, 36, 70, 42, 33, 81, 162, 208, 86, 85, 181, + 170, 170, 43, 170, 168, 170, 10, 144, 133, 170, 223, 26, 107, 155, 38, 32, + 137, 31, 4, 96, 32, 0, 0, 0, 0, 0, 138, 56, 0, 0, 1, 0, + 0, 240, 255, 255, 255, 127, 227, 170, 170, 170, 47, 9, 0, 0, 255, 255, + 255, 
255, 255, 255, 2, 168, 170, 170, 84, 213, 170, 170, 170, 170, 0, 0, + 254, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 63, 0, 0, 0, 34, + 170, 170, 234, 15, 255, 0, 63, 0, 255, 0, 255, 0, 63, 0, 255, 0, + 255, 0, 255, 63, 255, 0, 223, 64, 220, 0, 207, 0, 255, 0, 220, 0, + 0, 64, 0, 0, 16, 0, 0, 0, 255, 3, 0, 0, 255, 255, 255, 127, + 98, 21, 72, 0, 10, 80, 8, 0, 191, 32, 0, 0, 170, 42, 0, 0, + 170, 170, 170, 10, 168, 170, 168, 170, 170, 170, 0, 148, 170, 16, 138, 170, + 170, 2, 160, 0, 0, 0, 8, 0, 127, 0, 248, 0, 0, 255, 255, 255, + 255, 255, 0, 0, 255, 255, 7, 0, +}; + +/* Changes_When_Titlecased: 609 bytes. */ + +RE_UINT32 re_get_changes_when_titlecased(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 16; + code = ch ^ (f << 16); + pos = (RE_UINT32)re_changes_when_titlecased_stage_1[f] << 4; + f = code >> 12; + code ^= f << 12; + pos = (RE_UINT32)re_changes_when_titlecased_stage_2[pos + f] << 4; + f = code >> 8; + code ^= f << 8; + pos = (RE_UINT32)re_changes_when_titlecased_stage_3[pos + f] << 3; + f = code >> 5; + code ^= f << 5; + pos = (RE_UINT32)re_changes_when_titlecased_stage_4[pos + f] << 5; + pos += code; + value = (re_changes_when_titlecased_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Changes_When_Casefolded. 
*/ + +static RE_UINT8 re_changes_when_casefolded_stage_1[] = { + 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, +}; + +static RE_UINT8 re_changes_when_casefolded_stage_2[] = { + 0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 5, + 6, 7, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, +}; + +static RE_UINT8 re_changes_when_casefolded_stage_3[] = { + 0, 1, 2, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 7, 6, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 9, 10, + 6, 11, 6, 6, 12, 6, 6, 6, 6, 6, 6, 6, 13, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 14, 15, 6, 6, 6, 16, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 17, 6, 6, 6, 18, + 6, 6, 6, 6, 19, 6, 6, 6, 6, 6, 6, 6, 20, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 21, 6, 6, 6, 6, 6, 6, 6, +}; + +static RE_UINT8 re_changes_when_casefolded_stage_4[] = { + 0, 0, 1, 0, 0, 2, 3, 0, 4, 5, 6, 7, 8, 9, 10, 11, + 4, 12, 13, 0, 0, 0, 0, 0, 0, 0, 14, 15, 16, 17, 18, 19, + 20, 21, 0, 4, 22, 4, 23, 4, 4, 24, 25, 0, 26, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 20, 27, 0, + 0, 0, 0, 0, 0, 0, 0, 28, 4, 4, 4, 4, 29, 4, 4, 4, + 30, 31, 32, 33, 20, 34, 35, 36, 0, 37, 0, 21, 38, 0, 0, 0, + 0, 0, 0, 0, 0, 39, 21, 0, 20, 40, 0, 41, 4, 4, 4, 42, + 0, 0, 4, 43, 44, 0, 0, 0, 0, 45, 4, 46, 47, 48, 0, 0, + 0, 0, 0, 49, 20, 20, 0, 0, 50, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 0, 0, 0, 0, 0, 0, 20, 51, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 20, 52, 0, 0, 0, 0, 0, 0, 0, 20, 0, 0, +}; + +static RE_UINT8 re_changes_when_casefolded_stage_5[] = { + 0, 0, 0, 0, 254, 255, 255, 7, 0, 0, 32, 0, 255, 255, 127, 255, + 85, 85, 85, 85, 85, 85, 85, 170, 170, 86, 85, 85, 85, 85, 85, 171, + 214, 206, 219, 177, 213, 210, 174, 17, 176, 173, 170, 74, 85, 85, 214, 85, + 85, 85, 5, 108, 122, 85, 0, 0, 32, 0, 0, 0, 0, 0, 69, 128, + 64, 215, 254, 255, 251, 15, 0, 0, 4, 128, 99, 85, 85, 85, 179, 230, + 255, 255, 255, 255, 255, 255, 0, 0, 1, 84, 85, 85, 171, 42, 85, 85, + 85, 85, 254, 255, 255, 255, 127, 0, 128, 0, 0, 
0, 191, 32, 0, 0, + 0, 0, 0, 63, 85, 85, 21, 76, 0, 255, 0, 63, 0, 255, 0, 255, + 0, 63, 0, 170, 0, 255, 0, 0, 255, 255, 156, 31, 156, 31, 0, 15, + 0, 31, 156, 31, 64, 12, 4, 0, 8, 0, 0, 0, 0, 0, 192, 255, + 255, 127, 0, 0, 157, 234, 37, 192, 5, 40, 4, 0, 85, 21, 0, 0, + 85, 85, 85, 5, 84, 85, 84, 85, 85, 85, 0, 106, 85, 40, 69, 85, + 85, 61, 95, 0, 0, 0, 255, 255, 127, 0, 248, 0, 255, 0, 0, 0, + 255, 255, 7, 0, +}; + +/* Changes_When_Casefolded: 581 bytes. */ + +RE_UINT32 re_get_changes_when_casefolded(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 16; + code = ch ^ (f << 16); + pos = (RE_UINT32)re_changes_when_casefolded_stage_1[f] << 4; + f = code >> 12; + code ^= f << 12; + pos = (RE_UINT32)re_changes_when_casefolded_stage_2[pos + f] << 4; + f = code >> 8; + code ^= f << 8; + pos = (RE_UINT32)re_changes_when_casefolded_stage_3[pos + f] << 3; + f = code >> 5; + code ^= f << 5; + pos = (RE_UINT32)re_changes_when_casefolded_stage_4[pos + f] << 5; + pos += code; + value = (re_changes_when_casefolded_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Changes_When_Casemapped. 
*/ + +static RE_UINT8 re_changes_when_casemapped_stage_1[] = { + 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, +}; + +static RE_UINT8 re_changes_when_casemapped_stage_2[] = { + 0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 5, + 6, 7, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, +}; + +static RE_UINT8 re_changes_when_casemapped_stage_3[] = { + 0, 1, 2, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 7, 6, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 9, 10, 11, + 6, 12, 6, 6, 13, 6, 6, 6, 6, 6, 6, 6, 14, 15, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 16, 17, 6, 6, 6, 18, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 19, 6, 6, 6, 20, + 6, 6, 6, 6, 21, 6, 6, 6, 6, 6, 6, 6, 22, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 23, 6, 6, 6, 6, 6, 6, 6, +}; + +static RE_UINT8 re_changes_when_casemapped_stage_4[] = { + 0, 0, 1, 1, 0, 2, 3, 3, 4, 5, 4, 4, 6, 7, 8, 4, + 4, 9, 10, 11, 12, 0, 0, 0, 0, 0, 13, 14, 15, 16, 17, 18, + 4, 4, 4, 4, 19, 4, 4, 4, 4, 20, 21, 22, 23, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 24, 0, + 0, 0, 0, 0, 0, 4, 4, 25, 0, 0, 0, 26, 0, 0, 0, 0, + 4, 4, 4, 4, 27, 4, 4, 4, 25, 4, 28, 29, 4, 30, 31, 32, + 0, 33, 34, 4, 35, 0, 0, 0, 0, 0, 0, 0, 0, 36, 4, 37, + 4, 38, 39, 40, 4, 4, 4, 41, 4, 24, 0, 0, 0, 0, 0, 0, + 0, 0, 4, 42, 43, 0, 0, 0, 0, 44, 4, 45, 46, 47, 0, 0, + 0, 0, 48, 49, 4, 4, 0, 0, 50, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 1, 0, 0, 0, 0, 0, 4, 4, 51, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 4, 52, 4, 52, 0, 0, 0, 0, 0, 4, 4, 0, +}; + +static RE_UINT8 re_changes_when_casemapped_stage_5[] = { + 0, 0, 0, 0, 254, 255, 255, 7, 0, 0, 32, 0, 255, 255, 127, 255, + 255, 255, 255, 255, 255, 255, 255, 254, 255, 223, 255, 247, 255, 243, 255, 179, + 240, 255, 255, 255, 253, 255, 15, 252, 255, 255, 223, 26, 107, 155, 38, 32, + 137, 31, 4, 96, 32, 0, 0, 0, 0, 0, 207, 184, 64, 215, 255, 255, + 251, 255, 255, 255, 255, 255, 227, 255, 255, 255, 191, 239, 3, 252, 255, 255, + 255, 255, 254, 255, 255, 255, 127, 0, 254, 
255, 255, 255, 255, 0, 0, 0, + 191, 32, 0, 0, 255, 255, 63, 63, 0, 0, 0, 34, 255, 255, 255, 79, + 63, 63, 255, 170, 255, 255, 255, 63, 255, 255, 223, 95, 220, 31, 207, 15, + 255, 31, 220, 31, 64, 12, 4, 0, 0, 64, 0, 0, 24, 0, 0, 0, + 0, 0, 192, 255, 255, 3, 0, 0, 255, 127, 255, 255, 255, 255, 255, 127, + 255, 255, 109, 192, 15, 120, 12, 0, 255, 63, 0, 0, 255, 255, 255, 15, + 252, 255, 252, 255, 255, 255, 0, 254, 255, 56, 207, 255, 255, 63, 255, 0, + 0, 0, 8, 0, 0, 0, 255, 255, 127, 0, 248, 0, 255, 255, 0, 0, + 255, 255, 7, 0, +}; + +/* Changes_When_Casemapped: 597 bytes. */ + +RE_UINT32 re_get_changes_when_casemapped(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 16; + code = ch ^ (f << 16); + pos = (RE_UINT32)re_changes_when_casemapped_stage_1[f] << 4; + f = code >> 12; + code ^= f << 12; + pos = (RE_UINT32)re_changes_when_casemapped_stage_2[pos + f] << 4; + f = code >> 8; + code ^= f << 8; + pos = (RE_UINT32)re_changes_when_casemapped_stage_3[pos + f] << 3; + f = code >> 5; + code ^= f << 5; + pos = (RE_UINT32)re_changes_when_casemapped_stage_4[pos + f] << 5; + pos += code; + value = (re_changes_when_casemapped_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* ID_Start. 
*/ + +static RE_UINT8 re_id_start_stage_1[] = { + 0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, +}; + +static RE_UINT8 re_id_start_stage_2[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 9, 10, 11, 7, 7, 7, 7, 12, 13, 13, 13, 13, 14, + 15, 16, 17, 18, 19, 13, 20, 13, 21, 13, 13, 13, 13, 22, 13, 13, + 13, 13, 13, 13, 13, 13, 23, 24, 13, 13, 25, 13, 13, 26, 13, 13, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 27, 7, 28, 29, 7, 30, 13, 13, 13, 13, 13, 31, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, +}; + +static RE_UINT8 re_id_start_stage_3[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 1, 17, 18, 19, 1, 20, 21, 22, 23, 24, 25, 26, 27, 1, 28, + 29, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 31, 31, + 34, 35, 31, 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 36, 1, 1, 1, 1, 1, 1, 1, 1, 1, 37, + 1, 1, 1, 1, 38, 1, 39, 40, 41, 42, 43, 44, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 45, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 1, 46, 47, 1, 48, 49, 50, 51, 52, 53, 54, 55, 56, 1, 57, + 58, 59, 60, 61, 62, 31, 31, 31, 63, 64, 65, 66, 67, 68, 69, 70, + 71, 31, 72, 31, 31, 31, 31, 31, 1, 1, 1, 73, 74, 75, 31, 31, + 1, 1, 1, 1, 76, 31, 31, 31, 31, 31, 31, 31, 1, 1, 77, 31, + 1, 1, 78, 79, 31, 31, 31, 80, 81, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 82, 31, 31, 31, 31, 31, 31, 31, 83, 84, 85, 86, + 87, 31, 31, 31, 31, 31, 88, 31, 1, 1, 1, 1, 1, 1, 89, 1, + 1, 1, 1, 1, 1, 1, 1, 90, 91, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 92, 31, 1, 1, 93, 31, 31, 31, 31, 31, +}; + +static RE_UINT8 re_id_start_stage_4[] = { + 0, 0, 1, 1, 0, 2, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 5, 6, 0, 0, 0, 7, 8, 9, 4, 10, + 4, 4, 4, 4, 11, 4, 4, 4, 4, 12, 13, 14, 15, 0, 16, 17, + 0, 4, 18, 19, 4, 4, 20, 21, 22, 23, 24, 4, 4, 25, 26, 27, + 28, 29, 30, 0, 0, 31, 0, 0, 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 
47, 48, 45, 49, 50, 51, 52, 46, 0, + 53, 54, 55, 56, 53, 57, 58, 59, 53, 60, 61, 62, 63, 64, 65, 0, + 14, 66, 65, 0, 67, 68, 69, 0, 70, 0, 71, 72, 73, 0, 0, 0, + 4, 74, 75, 76, 77, 4, 78, 79, 4, 4, 80, 4, 81, 82, 83, 4, + 84, 4, 85, 0, 23, 4, 4, 86, 14, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 87, 1, 4, 4, 88, 89, 90, 90, 91, 4, 92, 93, 0, + 0, 4, 4, 94, 4, 95, 4, 96, 97, 0, 16, 98, 4, 99, 100, 0, + 101, 4, 31, 0, 0, 102, 0, 0, 103, 92, 104, 0, 105, 106, 4, 107, + 4, 108, 109, 110, 0, 0, 0, 111, 4, 4, 4, 4, 4, 4, 0, 0, + 86, 4, 112, 110, 4, 113, 114, 115, 0, 0, 0, 116, 117, 0, 0, 0, + 118, 119, 120, 4, 121, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 4, 122, 97, 4, 4, 4, 4, 123, 4, 78, 4, 124, 101, 125, 125, 0, + 126, 127, 14, 4, 128, 14, 4, 79, 103, 129, 4, 4, 130, 85, 0, 16, + 4, 4, 4, 4, 4, 96, 0, 0, 4, 4, 4, 4, 4, 4, 96, 0, + 4, 4, 4, 4, 72, 0, 16, 110, 131, 132, 4, 133, 110, 4, 4, 23, + 134, 135, 4, 4, 136, 137, 0, 134, 138, 139, 4, 92, 135, 92, 0, 140, + 26, 141, 65, 142, 32, 143, 144, 145, 4, 121, 146, 147, 4, 148, 149, 150, + 151, 152, 79, 141, 4, 4, 4, 139, 4, 4, 4, 4, 4, 153, 154, 155, + 4, 4, 4, 156, 4, 4, 157, 0, 158, 159, 160, 4, 4, 90, 161, 4, + 4, 110, 16, 4, 162, 4, 15, 163, 0, 0, 0, 164, 4, 4, 4, 142, + 0, 1, 1, 165, 4, 97, 166, 0, 167, 168, 169, 0, 4, 4, 4, 85, + 0, 0, 4, 31, 0, 0, 0, 0, 0, 0, 0, 0, 142, 4, 170, 0, + 4, 16, 171, 96, 110, 4, 172, 0, 4, 4, 4, 4, 110, 0, 0, 0, + 4, 173, 4, 108, 0, 0, 0, 0, 4, 101, 96, 15, 0, 0, 0, 0, + 174, 175, 96, 101, 97, 0, 0, 176, 96, 157, 0, 0, 4, 177, 0, 0, + 178, 92, 0, 142, 142, 0, 71, 179, 4, 96, 96, 143, 90, 0, 0, 0, + 4, 4, 121, 0, 4, 143, 4, 143, 105, 94, 0, 0, 105, 23, 16, 121, + 105, 65, 16, 180, 105, 143, 181, 0, 182, 183, 0, 0, 184, 185, 97, 0, + 48, 45, 186, 56, 0, 0, 0, 0, 0, 0, 0, 0, 4, 23, 187, 0, + 0, 0, 0, 0, 4, 130, 188, 0, 4, 23, 189, 0, 4, 18, 0, 0, + 157, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 190, + 0, 0, 0, 0, 0, 0, 4, 30, 4, 4, 4, 4, 157, 0, 0, 0, + 4, 4, 4, 130, 4, 4, 4, 4, 4, 4, 108, 0, 0, 0, 0, 0, + 
4, 130, 0, 0, 0, 0, 0, 0, 4, 4, 65, 0, 0, 0, 0, 0, + 4, 30, 97, 0, 0, 0, 16, 191, 4, 23, 108, 192, 23, 0, 0, 0, + 4, 4, 193, 0, 161, 0, 0, 0, 56, 0, 0, 0, 0, 0, 0, 0, + 4, 4, 4, 194, 195, 0, 0, 0, 4, 4, 196, 4, 197, 198, 199, 4, + 200, 201, 202, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 203, 204, 79, + 196, 196, 122, 122, 205, 205, 146, 0, 4, 4, 4, 4, 4, 4, 179, 0, + 199, 206, 207, 208, 209, 210, 0, 0, 4, 4, 4, 4, 4, 4, 101, 0, + 4, 31, 4, 4, 4, 4, 4, 4, 110, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 56, 0, 0, 110, 0, 0, 0, 0, 0, 0, 0, +}; + +static RE_UINT8 re_id_start_stage_5[] = { + 0, 0, 0, 0, 254, 255, 255, 7, 0, 4, 32, 4, 255, 255, 127, 255, + 255, 255, 255, 255, 195, 255, 3, 0, 31, 80, 0, 0, 0, 0, 223, 188, + 64, 215, 255, 255, 251, 255, 255, 255, 255, 255, 191, 255, 3, 252, 255, 255, + 255, 255, 254, 255, 255, 255, 127, 2, 254, 255, 255, 255, 255, 0, 0, 0, + 0, 0, 255, 255, 255, 7, 7, 0, 255, 7, 0, 0, 0, 192, 254, 255, + 255, 255, 47, 0, 96, 192, 0, 156, 0, 0, 253, 255, 255, 255, 0, 0, + 0, 224, 255, 255, 63, 0, 2, 0, 0, 252, 255, 255, 255, 7, 48, 4, + 255, 255, 63, 4, 16, 1, 0, 0, 255, 255, 255, 1, 255, 255, 31, 0, + 240, 255, 255, 255, 255, 255, 255, 35, 0, 0, 1, 255, 3, 0, 254, 255, + 225, 159, 249, 255, 255, 253, 197, 35, 0, 64, 0, 176, 3, 0, 3, 0, + 224, 135, 249, 255, 255, 253, 109, 3, 0, 0, 0, 94, 0, 0, 28, 0, + 224, 191, 251, 255, 255, 253, 237, 35, 0, 0, 1, 0, 3, 0, 0, 2, + 224, 159, 249, 255, 0, 0, 0, 176, 3, 0, 2, 0, 232, 199, 61, 214, + 24, 199, 255, 3, 224, 223, 253, 255, 255, 253, 255, 35, 0, 0, 0, 7, + 3, 0, 0, 0, 255, 253, 239, 35, 0, 0, 0, 64, 3, 0, 6, 0, + 255, 255, 255, 39, 0, 64, 0, 128, 3, 0, 0, 252, 224, 255, 127, 252, + 255, 255, 251, 47, 127, 0, 0, 0, 255, 255, 13, 0, 150, 37, 240, 254, + 174, 236, 13, 32, 95, 0, 0, 240, 1, 0, 0, 0, 255, 254, 255, 255, + 255, 31, 0, 0, 0, 31, 0, 0, 255, 7, 0, 128, 0, 0, 63, 60, + 98, 192, 225, 255, 3, 64, 0, 0, 191, 32, 255, 255, 255, 255, 255, 247, + 255, 61, 127, 61, 255, 61, 255, 255, 255, 255, 61, 127, 61, 255, 
127, 255, + 255, 255, 61, 255, 255, 255, 255, 7, 255, 255, 63, 63, 255, 159, 255, 255, + 255, 199, 255, 1, 255, 223, 3, 0, 255, 255, 3, 0, 255, 223, 1, 0, + 255, 255, 15, 0, 0, 0, 128, 16, 255, 255, 255, 0, 255, 5, 255, 255, + 255, 255, 63, 0, 255, 255, 255, 127, 255, 63, 31, 0, 255, 15, 255, 255, + 255, 3, 0, 0, 255, 255, 127, 0, 128, 0, 0, 0, 224, 255, 255, 255, + 224, 15, 0, 0, 248, 255, 255, 255, 1, 192, 0, 252, 63, 0, 0, 0, + 15, 0, 0, 0, 0, 224, 0, 252, 255, 255, 255, 63, 0, 222, 99, 0, + 63, 63, 255, 170, 255, 255, 223, 95, 220, 31, 207, 15, 255, 31, 220, 31, + 0, 0, 2, 128, 0, 0, 255, 31, 132, 252, 47, 63, 80, 253, 255, 243, + 224, 67, 0, 0, 255, 1, 0, 0, 255, 127, 255, 255, 31, 120, 12, 0, + 255, 128, 0, 0, 127, 127, 127, 127, 224, 0, 0, 0, 254, 3, 62, 31, + 255, 255, 127, 248, 255, 63, 254, 255, 255, 127, 0, 0, 255, 31, 255, 255, + 0, 12, 0, 0, 255, 127, 0, 128, 0, 0, 128, 255, 252, 255, 255, 255, + 255, 249, 255, 255, 255, 63, 255, 0, 187, 247, 255, 255, 7, 0, 0, 0, + 0, 0, 252, 40, 63, 0, 255, 255, 255, 255, 255, 31, 255, 255, 7, 0, + 0, 128, 0, 0, 223, 255, 0, 124, 247, 15, 0, 0, 255, 255, 127, 196, + 255, 255, 98, 62, 5, 0, 0, 56, 255, 7, 28, 0, 126, 126, 126, 0, + 127, 127, 255, 255, 15, 0, 255, 255, 127, 248, 255, 255, 255, 255, 255, 15, + 255, 63, 255, 255, 255, 255, 255, 3, 127, 0, 248, 160, 255, 253, 127, 95, + 219, 255, 255, 255, 0, 0, 248, 255, 255, 255, 252, 255, 0, 0, 255, 15, + 0, 0, 223, 255, 192, 255, 255, 255, 252, 252, 252, 28, 255, 239, 255, 255, + 127, 255, 255, 183, 255, 63, 255, 63, 255, 255, 1, 0, 255, 7, 255, 255, + 15, 255, 62, 0, 255, 0, 255, 255, 63, 253, 255, 255, 255, 255, 191, 145, + 255, 255, 55, 0, 255, 255, 255, 192, 1, 0, 239, 254, 31, 0, 0, 0, + 255, 255, 71, 0, 30, 0, 0, 20, 255, 255, 251, 255, 255, 15, 0, 0, + 127, 189, 255, 191, 255, 1, 255, 255, 0, 0, 1, 224, 176, 0, 0, 0, + 0, 0, 0, 15, 16, 0, 0, 0, 0, 0, 0, 128, 255, 63, 0, 0, + 248, 255, 255, 224, 31, 0, 1, 0, 255, 7, 255, 31, 255, 1, 255, 3, + 255, 255, 223, 255, 
255, 255, 255, 223, 100, 222, 255, 235, 239, 255, 255, 255, + 191, 231, 223, 223, 255, 255, 255, 123, 95, 252, 253, 255, 63, 255, 255, 255, + 253, 255, 255, 247, 255, 253, 255, 255, 150, 254, 247, 10, 132, 234, 150, 170, + 150, 247, 247, 94, 255, 251, 255, 15, 238, 251, 255, 15, +}; + +/* ID_Start: 1997 bytes. */ + +RE_UINT32 re_get_id_start(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 16; + code = ch ^ (f << 16); + pos = (RE_UINT32)re_id_start_stage_1[f] << 5; + f = code >> 11; + code ^= f << 11; + pos = (RE_UINT32)re_id_start_stage_2[pos + f] << 3; + f = code >> 8; + code ^= f << 8; + pos = (RE_UINT32)re_id_start_stage_3[pos + f] << 3; + f = code >> 5; + code ^= f << 5; + pos = (RE_UINT32)re_id_start_stage_4[pos + f] << 5; + pos += code; + value = (re_id_start_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* ID_Continue. */ + +static RE_UINT8 re_id_continue_stage_1[] = { + 0, 1, 2, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 6, 6, 6, + 6, 6, +}; + +static RE_UINT8 re_id_continue_stage_2[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 9, 10, 11, 7, 7, 7, 7, 12, 13, 13, 13, 13, 14, + 15, 16, 17, 18, 19, 13, 20, 13, 21, 13, 13, 13, 13, 22, 13, 13, + 13, 13, 13, 13, 13, 13, 23, 24, 13, 13, 25, 26, 13, 27, 13, 13, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 28, 7, 29, 30, 7, 31, 13, 13, 13, 13, 13, 32, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 33, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, +}; + +static RE_UINT8 re_id_continue_stage_3[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 1, 17, 18, 19, 1, 20, 21, 22, 23, 24, 25, 26, 27, 1, 28, + 29, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 31, 31, + 34, 35, 31, 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 36, 1, 1, 1, 1, 1, 1, 1, 1, 1, 37, + 1, 1, 1, 1, 38, 1, 39, 40, 41, 42, 43, 44, 1, 1, 1, 
1, + 1, 1, 1, 1, 1, 1, 1, 45, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 1, 46, 47, 1, 48, 49, 50, 51, 52, 53, 54, 55, 56, 1, 57, + 58, 59, 60, 61, 62, 31, 31, 31, 63, 64, 65, 66, 67, 68, 69, 70, + 71, 31, 72, 31, 31, 31, 31, 31, 1, 1, 1, 73, 74, 75, 31, 31, + 1, 1, 1, 1, 76, 31, 31, 31, 31, 31, 31, 31, 1, 1, 77, 31, + 1, 1, 78, 79, 31, 31, 31, 80, 81, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 82, 31, 31, 31, 31, 83, 84, 31, 85, 86, 87, 88, + 31, 31, 89, 31, 31, 31, 31, 31, 90, 31, 31, 31, 31, 31, 91, 31, + 1, 1, 1, 1, 1, 1, 92, 1, 1, 1, 1, 1, 1, 1, 1, 93, + 94, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 95, 31, + 1, 1, 96, 31, 31, 31, 31, 31, 31, 97, 31, 31, 31, 31, 31, 31, +}; + +static RE_UINT8 re_id_continue_stage_4[] = { + 0, 1, 2, 3, 0, 4, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 7, 8, 6, 6, 6, 9, 10, 11, 6, 12, + 6, 6, 6, 6, 13, 6, 6, 6, 6, 14, 15, 16, 17, 18, 19, 20, + 21, 6, 6, 22, 6, 6, 23, 24, 25, 6, 26, 6, 6, 27, 6, 28, + 6, 29, 30, 0, 0, 31, 0, 32, 6, 6, 6, 33, 34, 35, 36, 37, + 38, 39, 40, 41, 42, 43, 44, 45, 46, 43, 47, 48, 49, 50, 51, 52, + 53, 54, 55, 56, 57, 58, 59, 60, 57, 61, 62, 63, 64, 65, 66, 67, + 16, 68, 69, 0, 70, 71, 72, 0, 73, 74, 75, 76, 77, 78, 79, 0, + 6, 6, 80, 6, 81, 6, 82, 83, 6, 6, 84, 6, 85, 86, 87, 6, + 88, 6, 61, 89, 90, 6, 6, 91, 16, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 92, 3, 6, 6, 93, 94, 31, 95, 96, 6, 6, 97, 98, + 99, 6, 6, 100, 6, 101, 6, 102, 103, 104, 105, 106, 6, 107, 108, 0, + 30, 6, 103, 109, 110, 111, 0, 0, 6, 6, 112, 113, 6, 6, 6, 95, + 6, 100, 114, 81, 0, 0, 115, 116, 6, 6, 6, 6, 6, 6, 6, 117, + 91, 6, 118, 81, 6, 119, 120, 121, 0, 122, 123, 124, 125, 0, 125, 126, + 127, 128, 129, 6, 130, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 6, 131, 103, 6, 6, 6, 6, 132, 6, 82, 6, 133, 134, 135, 135, 6, + 136, 137, 16, 6, 138, 16, 6, 83, 139, 140, 6, 6, 141, 68, 0, 25, + 6, 6, 6, 6, 6, 102, 0, 0, 6, 6, 6, 6, 6, 6, 102, 0, + 6, 6, 6, 6, 142, 0, 25, 81, 143, 144, 6, 145, 6, 6, 6, 27, + 146, 147, 6, 6, 148, 149, 0, 146, 6, 150, 6, 95, 
6, 6, 151, 152, + 6, 153, 95, 78, 6, 6, 154, 103, 6, 134, 155, 156, 6, 6, 157, 158, + 159, 160, 83, 161, 6, 6, 6, 162, 6, 6, 6, 6, 6, 163, 164, 30, + 6, 6, 6, 153, 6, 6, 165, 0, 166, 167, 168, 6, 6, 27, 169, 6, + 6, 81, 25, 6, 170, 6, 150, 171, 90, 172, 173, 174, 6, 6, 6, 78, + 1, 2, 3, 105, 6, 103, 175, 0, 176, 177, 178, 0, 6, 6, 6, 68, + 0, 0, 6, 31, 0, 0, 0, 179, 0, 0, 0, 0, 78, 6, 180, 181, + 6, 25, 101, 68, 81, 6, 182, 0, 6, 6, 6, 6, 81, 98, 0, 0, + 6, 183, 6, 184, 0, 0, 0, 0, 6, 134, 102, 150, 0, 0, 0, 0, + 185, 186, 102, 134, 103, 0, 0, 187, 102, 165, 0, 0, 6, 188, 0, 0, + 189, 190, 0, 78, 78, 0, 75, 191, 6, 102, 102, 192, 27, 0, 0, 0, + 6, 6, 130, 0, 6, 192, 6, 192, 6, 6, 191, 193, 6, 68, 25, 194, + 6, 195, 25, 196, 6, 6, 197, 0, 198, 100, 0, 0, 199, 200, 6, 201, + 34, 43, 202, 203, 0, 0, 0, 0, 0, 0, 0, 0, 6, 6, 204, 0, + 0, 0, 0, 0, 6, 205, 206, 0, 6, 6, 207, 0, 6, 100, 98, 0, + 208, 112, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 6, 209, + 0, 0, 0, 0, 0, 0, 6, 210, 6, 6, 6, 6, 165, 0, 0, 0, + 6, 6, 6, 141, 6, 6, 6, 6, 6, 6, 184, 0, 0, 0, 0, 0, + 6, 141, 0, 0, 0, 0, 0, 0, 6, 6, 191, 0, 0, 0, 0, 0, + 6, 210, 103, 98, 0, 0, 25, 106, 6, 134, 211, 212, 90, 0, 0, 0, + 6, 6, 213, 103, 214, 0, 0, 0, 215, 0, 0, 0, 0, 0, 0, 0, + 6, 6, 6, 216, 217, 0, 0, 0, 0, 0, 0, 218, 219, 220, 0, 0, + 0, 0, 221, 0, 0, 0, 0, 0, 6, 6, 195, 6, 222, 223, 224, 6, + 225, 226, 227, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 228, 229, 83, + 195, 195, 131, 131, 230, 230, 231, 6, 6, 232, 6, 233, 234, 235, 0, 0, + 6, 6, 6, 6, 6, 6, 236, 0, 224, 237, 238, 239, 240, 241, 0, 0, + 6, 6, 6, 6, 6, 6, 134, 0, 6, 31, 6, 6, 6, 6, 6, 6, + 81, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 215, 0, 0, + 81, 0, 0, 0, 0, 0, 0, 0, 6, 6, 6, 6, 6, 6, 6, 90, +}; + +static RE_UINT8 re_id_continue_stage_5[] = { + 0, 0, 0, 0, 0, 0, 255, 3, 254, 255, 255, 135, 254, 255, 255, 7, + 0, 4, 160, 4, 255, 255, 127, 255, 255, 255, 255, 255, 195, 255, 3, 0, + 31, 80, 0, 0, 255, 255, 223, 188, 192, 215, 255, 255, 251, 255, 255, 255, + 255, 255, 191, 
255, 251, 252, 255, 255, 255, 255, 254, 255, 255, 255, 127, 2, + 254, 255, 255, 255, 255, 0, 254, 255, 255, 255, 255, 191, 182, 0, 255, 255, + 255, 7, 7, 0, 0, 0, 255, 7, 255, 195, 255, 255, 255, 255, 239, 159, + 255, 253, 255, 159, 0, 0, 255, 255, 255, 231, 255, 255, 255, 255, 3, 0, + 255, 255, 63, 4, 255, 63, 0, 0, 255, 255, 255, 15, 255, 255, 31, 0, + 248, 255, 255, 255, 207, 255, 254, 255, 239, 159, 249, 255, 255, 253, 197, 243, + 159, 121, 128, 176, 207, 255, 3, 0, 238, 135, 249, 255, 255, 253, 109, 211, + 135, 57, 2, 94, 192, 255, 63, 0, 238, 191, 251, 255, 255, 253, 237, 243, + 191, 59, 1, 0, 207, 255, 0, 2, 238, 159, 249, 255, 159, 57, 192, 176, + 207, 255, 2, 0, 236, 199, 61, 214, 24, 199, 255, 195, 199, 61, 129, 0, + 192, 255, 0, 0, 239, 223, 253, 255, 255, 253, 255, 227, 223, 61, 96, 7, + 207, 255, 0, 0, 238, 223, 253, 255, 255, 253, 239, 243, 223, 61, 96, 64, + 207, 255, 6, 0, 255, 255, 255, 231, 223, 125, 128, 128, 207, 255, 0, 252, + 236, 255, 127, 252, 255, 255, 251, 47, 127, 132, 95, 255, 192, 255, 12, 0, + 255, 255, 255, 7, 255, 127, 255, 3, 150, 37, 240, 254, 174, 236, 255, 59, + 95, 63, 255, 243, 1, 0, 0, 3, 255, 3, 160, 194, 255, 254, 255, 255, + 255, 31, 254, 255, 223, 255, 255, 254, 255, 255, 255, 31, 64, 0, 0, 0, + 255, 3, 255, 255, 255, 255, 255, 63, 191, 32, 255, 255, 255, 255, 255, 247, + 255, 61, 127, 61, 255, 61, 255, 255, 255, 255, 61, 127, 61, 255, 127, 255, + 255, 255, 61, 255, 0, 254, 3, 0, 255, 255, 0, 0, 255, 255, 63, 63, + 255, 159, 255, 255, 255, 199, 255, 1, 255, 223, 31, 0, 255, 255, 15, 0, + 255, 223, 13, 0, 255, 255, 143, 48, 255, 3, 0, 0, 0, 56, 255, 3, + 255, 255, 255, 0, 255, 7, 255, 255, 255, 255, 63, 0, 255, 255, 255, 127, + 255, 15, 255, 15, 192, 255, 255, 255, 255, 63, 31, 0, 255, 15, 255, 255, + 255, 3, 255, 7, 255, 255, 255, 159, 255, 3, 255, 3, 128, 0, 255, 63, + 255, 15, 255, 3, 0, 248, 15, 0, 255, 227, 255, 255, 0, 0, 247, 255, + 255, 255, 127, 3, 255, 255, 63, 240, 63, 63, 255, 170, 255, 255, 223, 95, + 220, 31, 
207, 15, 255, 31, 220, 31, 0, 0, 0, 128, 1, 0, 16, 0, + 0, 0, 2, 128, 0, 0, 255, 31, 226, 255, 1, 0, 132, 252, 47, 63, + 80, 253, 255, 243, 224, 67, 0, 0, 255, 1, 0, 0, 255, 127, 255, 255, + 31, 248, 15, 0, 255, 128, 0, 128, 255, 255, 127, 0, 127, 127, 127, 127, + 224, 0, 0, 0, 254, 255, 62, 31, 255, 255, 127, 254, 224, 255, 255, 255, + 255, 63, 254, 255, 255, 127, 0, 0, 255, 31, 0, 0, 255, 31, 255, 255, + 255, 15, 0, 0, 255, 255, 240, 191, 0, 0, 128, 255, 252, 255, 255, 255, + 255, 249, 255, 255, 255, 63, 255, 0, 255, 0, 0, 0, 31, 0, 255, 3, + 255, 255, 255, 40, 255, 63, 255, 255, 1, 128, 255, 3, 255, 63, 255, 3, + 255, 255, 127, 252, 7, 0, 0, 56, 255, 255, 124, 0, 126, 126, 126, 0, + 127, 127, 255, 255, 63, 0, 255, 255, 255, 55, 255, 3, 15, 0, 255, 255, + 127, 248, 255, 255, 255, 255, 255, 3, 127, 0, 248, 224, 255, 253, 127, 95, + 219, 255, 255, 255, 0, 0, 248, 255, 255, 255, 252, 255, 0, 0, 255, 15, + 255, 255, 24, 0, 0, 224, 0, 0, 0, 0, 223, 255, 252, 252, 252, 28, + 255, 239, 255, 255, 127, 255, 255, 183, 255, 63, 255, 63, 0, 0, 0, 32, + 255, 255, 1, 0, 1, 0, 0, 0, 15, 255, 62, 0, 255, 0, 255, 255, + 15, 0, 0, 0, 63, 253, 255, 255, 255, 255, 191, 145, 255, 255, 55, 0, + 255, 255, 255, 192, 111, 240, 239, 254, 255, 255, 15, 135, 127, 0, 0, 0, + 255, 255, 7, 0, 192, 255, 0, 128, 255, 1, 255, 3, 255, 255, 223, 255, + 255, 255, 79, 0, 31, 28, 255, 23, 255, 255, 251, 255, 127, 189, 255, 191, + 255, 1, 255, 255, 255, 7, 255, 3, 159, 57, 129, 224, 207, 31, 31, 0, + 191, 0, 255, 3, 255, 255, 63, 255, 1, 0, 0, 63, 17, 0, 255, 3, + 255, 255, 255, 227, 255, 3, 0, 128, 255, 255, 255, 1, 15, 0, 255, 3, + 248, 255, 255, 224, 31, 0, 255, 255, 0, 128, 255, 255, 3, 0, 0, 0, + 255, 7, 255, 31, 255, 1, 255, 99, 224, 227, 7, 248, 231, 15, 0, 0, + 0, 60, 0, 0, 28, 0, 0, 0, 255, 255, 255, 223, 100, 222, 255, 235, + 239, 255, 255, 255, 191, 231, 223, 223, 255, 255, 255, 123, 95, 252, 253, 255, + 63, 255, 255, 255, 253, 255, 255, 247, 255, 253, 255, 255, 247, 207, 255, 255, + 255, 
255, 127, 248, 255, 31, 32, 0, 16, 0, 0, 248, 254, 255, 0, 0, + 31, 0, 127, 0, 150, 254, 247, 10, 132, 234, 150, 170, 150, 247, 247, 94, + 255, 251, 255, 15, 238, 251, 255, 15, +}; + +/* ID_Continue: 2186 bytes. */ + +RE_UINT32 re_get_id_continue(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 15; + code = ch ^ (f << 15); + pos = (RE_UINT32)re_id_continue_stage_1[f] << 4; + f = code >> 11; + code ^= f << 11; + pos = (RE_UINT32)re_id_continue_stage_2[pos + f] << 3; + f = code >> 8; + code ^= f << 8; + pos = (RE_UINT32)re_id_continue_stage_3[pos + f] << 3; + f = code >> 5; + code ^= f << 5; + pos = (RE_UINT32)re_id_continue_stage_4[pos + f] << 5; + pos += code; + value = (re_id_continue_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* XID_Start. */ + +static RE_UINT8 re_xid_start_stage_1[] = { + 0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, +}; + +static RE_UINT8 re_xid_start_stage_2[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 9, 10, 11, 7, 7, 7, 7, 12, 13, 13, 13, 13, 14, + 15, 16, 17, 18, 19, 13, 20, 13, 21, 13, 13, 13, 13, 22, 13, 13, + 13, 13, 13, 13, 13, 13, 23, 24, 13, 13, 25, 13, 13, 26, 13, 13, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 27, 7, 28, 29, 7, 30, 13, 13, 13, 13, 13, 31, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, +}; + +static RE_UINT8 re_xid_start_stage_3[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 1, 17, 18, 19, 1, 20, 21, 22, 23, 24, 25, 26, 27, 1, 28, + 29, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 31, 31, + 34, 35, 31, 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 36, 1, 1, 1, 1, 1, 1, 1, 1, 1, 37, + 1, 1, 1, 1, 38, 1, 39, 40, 41, 42, 43, 44, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 45, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 1, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 1, 58, + 59, 60, 61, 62, 
63, 31, 31, 31, 64, 65, 66, 67, 68, 69, 70, 71, + 72, 31, 73, 31, 31, 31, 31, 31, 1, 1, 1, 74, 75, 76, 31, 31, + 1, 1, 1, 1, 77, 31, 31, 31, 31, 31, 31, 31, 1, 1, 78, 31, + 1, 1, 79, 80, 31, 31, 31, 81, 82, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 83, 31, 31, 31, 31, 31, 31, 31, 84, 85, 86, 87, + 88, 31, 31, 31, 31, 31, 89, 31, 1, 1, 1, 1, 1, 1, 90, 1, + 1, 1, 1, 1, 1, 1, 1, 91, 92, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 93, 31, 1, 1, 94, 31, 31, 31, 31, 31, +}; + +static RE_UINT8 re_xid_start_stage_4[] = { + 0, 0, 1, 1, 0, 2, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 5, 6, 0, 0, 0, 7, 8, 9, 4, 10, + 4, 4, 4, 4, 11, 4, 4, 4, 4, 12, 13, 14, 15, 0, 16, 17, + 0, 4, 18, 19, 4, 4, 20, 21, 22, 23, 24, 4, 4, 25, 26, 27, + 28, 29, 30, 0, 0, 31, 0, 0, 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, 48, 45, 49, 50, 51, 52, 46, 0, + 53, 54, 55, 56, 53, 57, 58, 59, 53, 60, 61, 62, 63, 64, 65, 0, + 14, 66, 65, 0, 67, 68, 69, 0, 70, 0, 71, 72, 73, 0, 0, 0, + 4, 74, 75, 76, 77, 4, 78, 79, 4, 4, 80, 4, 81, 82, 83, 4, + 84, 4, 85, 0, 23, 4, 4, 86, 14, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 87, 1, 4, 4, 88, 89, 90, 90, 91, 4, 92, 93, 0, + 0, 4, 4, 94, 4, 95, 4, 96, 97, 0, 16, 98, 4, 99, 100, 0, + 101, 4, 31, 0, 0, 102, 0, 0, 103, 92, 104, 0, 105, 106, 4, 107, + 4, 108, 109, 110, 0, 0, 0, 111, 4, 4, 4, 4, 4, 4, 0, 0, + 86, 4, 112, 110, 4, 113, 114, 115, 0, 0, 0, 116, 117, 0, 0, 0, + 118, 119, 120, 4, 121, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 4, 122, 97, 4, 4, 4, 4, 123, 4, 78, 4, 124, 101, 125, 125, 0, + 126, 127, 14, 4, 128, 14, 4, 79, 103, 129, 4, 4, 130, 85, 0, 16, + 4, 4, 4, 4, 4, 96, 0, 0, 4, 4, 4, 4, 4, 4, 96, 0, + 4, 4, 4, 4, 72, 0, 16, 110, 131, 132, 4, 133, 110, 4, 4, 23, + 134, 135, 4, 4, 136, 137, 0, 134, 138, 139, 4, 92, 135, 92, 0, 140, + 26, 141, 65, 142, 32, 143, 144, 145, 4, 121, 146, 147, 4, 148, 149, 150, + 151, 152, 79, 141, 4, 4, 4, 139, 4, 4, 4, 4, 4, 153, 154, 155, + 4, 4, 4, 156, 4, 4, 157, 0, 158, 159, 160, 4, 4, 90, 161, 4, + 4, 4, 
110, 32, 4, 4, 4, 4, 4, 110, 16, 4, 162, 4, 15, 163, + 0, 0, 0, 164, 4, 4, 4, 142, 0, 1, 1, 165, 110, 97, 166, 0, + 167, 168, 169, 0, 4, 4, 4, 85, 0, 0, 4, 31, 0, 0, 0, 0, + 0, 0, 0, 0, 142, 4, 170, 0, 4, 16, 171, 96, 110, 4, 172, 0, + 4, 4, 4, 4, 110, 0, 0, 0, 4, 173, 4, 108, 0, 0, 0, 0, + 4, 101, 96, 15, 0, 0, 0, 0, 174, 175, 96, 101, 97, 0, 0, 176, + 96, 157, 0, 0, 4, 177, 0, 0, 178, 92, 0, 142, 142, 0, 71, 179, + 4, 96, 96, 143, 90, 0, 0, 0, 4, 4, 121, 0, 4, 143, 4, 143, + 105, 94, 0, 0, 105, 23, 16, 121, 105, 65, 16, 180, 105, 143, 181, 0, + 182, 183, 0, 0, 184, 185, 97, 0, 48, 45, 186, 56, 0, 0, 0, 0, + 0, 0, 0, 0, 4, 23, 187, 0, 0, 0, 0, 0, 4, 130, 188, 0, + 4, 23, 189, 0, 4, 18, 0, 0, 157, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 4, 4, 190, 0, 0, 0, 0, 0, 0, 4, 30, + 4, 4, 4, 4, 157, 0, 0, 0, 4, 4, 4, 130, 4, 4, 4, 4, + 4, 4, 108, 0, 0, 0, 0, 0, 4, 130, 0, 0, 0, 0, 0, 0, + 4, 4, 65, 0, 0, 0, 0, 0, 4, 30, 97, 0, 0, 0, 16, 191, + 4, 23, 108, 192, 23, 0, 0, 0, 4, 4, 193, 0, 161, 0, 0, 0, + 56, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 194, 195, 0, 0, 0, + 4, 4, 196, 4, 197, 198, 199, 4, 200, 201, 202, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 203, 204, 79, 196, 196, 122, 122, 205, 205, 146, 0, + 4, 4, 4, 4, 4, 4, 179, 0, 199, 206, 207, 208, 209, 210, 0, 0, + 4, 4, 4, 4, 4, 4, 101, 0, 4, 31, 4, 4, 4, 4, 4, 4, + 110, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 56, 0, 0, + 110, 0, 0, 0, 0, 0, 0, 0, +}; + +static RE_UINT8 re_xid_start_stage_5[] = { + 0, 0, 0, 0, 254, 255, 255, 7, 0, 4, 32, 4, 255, 255, 127, 255, + 255, 255, 255, 255, 195, 255, 3, 0, 31, 80, 0, 0, 0, 0, 223, 184, + 64, 215, 255, 255, 251, 255, 255, 255, 255, 255, 191, 255, 3, 252, 255, 255, + 255, 255, 254, 255, 255, 255, 127, 2, 254, 255, 255, 255, 255, 0, 0, 0, + 0, 0, 255, 255, 255, 7, 7, 0, 255, 7, 0, 0, 0, 192, 254, 255, + 255, 255, 47, 0, 96, 192, 0, 156, 0, 0, 253, 255, 255, 255, 0, 0, + 0, 224, 255, 255, 63, 0, 2, 0, 0, 252, 255, 255, 255, 7, 48, 4, + 255, 255, 63, 4, 16, 1, 0, 0, 255, 255, 255, 1, 255, 255, 31, 0, + 240, 
255, 255, 255, 255, 255, 255, 35, 0, 0, 1, 255, 3, 0, 254, 255, + 225, 159, 249, 255, 255, 253, 197, 35, 0, 64, 0, 176, 3, 0, 3, 0, + 224, 135, 249, 255, 255, 253, 109, 3, 0, 0, 0, 94, 0, 0, 28, 0, + 224, 191, 251, 255, 255, 253, 237, 35, 0, 0, 1, 0, 3, 0, 0, 2, + 224, 159, 249, 255, 0, 0, 0, 176, 3, 0, 2, 0, 232, 199, 61, 214, + 24, 199, 255, 3, 224, 223, 253, 255, 255, 253, 255, 35, 0, 0, 0, 7, + 3, 0, 0, 0, 255, 253, 239, 35, 0, 0, 0, 64, 3, 0, 6, 0, + 255, 255, 255, 39, 0, 64, 0, 128, 3, 0, 0, 252, 224, 255, 127, 252, + 255, 255, 251, 47, 127, 0, 0, 0, 255, 255, 5, 0, 150, 37, 240, 254, + 174, 236, 5, 32, 95, 0, 0, 240, 1, 0, 0, 0, 255, 254, 255, 255, + 255, 31, 0, 0, 0, 31, 0, 0, 255, 7, 0, 128, 0, 0, 63, 60, + 98, 192, 225, 255, 3, 64, 0, 0, 191, 32, 255, 255, 255, 255, 255, 247, + 255, 61, 127, 61, 255, 61, 255, 255, 255, 255, 61, 127, 61, 255, 127, 255, + 255, 255, 61, 255, 255, 255, 255, 7, 255, 255, 63, 63, 255, 159, 255, 255, + 255, 199, 255, 1, 255, 223, 3, 0, 255, 255, 3, 0, 255, 223, 1, 0, + 255, 255, 15, 0, 0, 0, 128, 16, 255, 255, 255, 0, 255, 5, 255, 255, + 255, 255, 63, 0, 255, 255, 255, 127, 255, 63, 31, 0, 255, 15, 255, 255, + 255, 3, 0, 0, 255, 255, 127, 0, 128, 0, 0, 0, 224, 255, 255, 255, + 224, 15, 0, 0, 248, 255, 255, 255, 1, 192, 0, 252, 63, 0, 0, 0, + 15, 0, 0, 0, 0, 224, 0, 252, 255, 255, 255, 63, 0, 222, 99, 0, + 63, 63, 255, 170, 255, 255, 223, 95, 220, 31, 207, 15, 255, 31, 220, 31, + 0, 0, 2, 128, 0, 0, 255, 31, 132, 252, 47, 63, 80, 253, 255, 243, + 224, 67, 0, 0, 255, 1, 0, 0, 255, 127, 255, 255, 31, 120, 12, 0, + 255, 128, 0, 0, 127, 127, 127, 127, 224, 0, 0, 0, 254, 3, 62, 31, + 255, 255, 127, 224, 255, 63, 254, 255, 255, 127, 0, 0, 255, 31, 255, 255, + 0, 12, 0, 0, 255, 127, 0, 128, 0, 0, 128, 255, 252, 255, 255, 255, + 255, 249, 255, 255, 255, 63, 255, 0, 187, 247, 255, 255, 7, 0, 0, 0, + 0, 0, 252, 40, 63, 0, 255, 255, 255, 255, 255, 31, 255, 255, 7, 0, + 0, 128, 0, 0, 223, 255, 0, 124, 247, 15, 0, 0, 255, 255, 127, 196, + 
255, 255, 98, 62, 5, 0, 0, 56, 255, 7, 28, 0, 126, 126, 126, 0, + 127, 127, 255, 255, 15, 0, 255, 255, 127, 248, 255, 255, 255, 255, 255, 15, + 255, 63, 255, 255, 255, 255, 255, 3, 127, 0, 248, 160, 255, 253, 127, 95, + 219, 255, 255, 255, 0, 0, 248, 255, 255, 255, 252, 255, 0, 0, 255, 3, + 0, 0, 138, 170, 192, 255, 255, 255, 252, 252, 252, 28, 255, 239, 255, 255, + 127, 255, 255, 183, 255, 63, 255, 63, 255, 255, 1, 0, 255, 7, 255, 255, + 15, 255, 62, 0, 255, 0, 255, 255, 63, 253, 255, 255, 255, 255, 191, 145, + 255, 255, 55, 0, 255, 255, 255, 192, 1, 0, 239, 254, 31, 0, 0, 0, + 255, 255, 71, 0, 30, 0, 0, 20, 255, 255, 251, 255, 255, 15, 0, 0, + 127, 189, 255, 191, 255, 1, 255, 255, 0, 0, 1, 224, 176, 0, 0, 0, + 0, 0, 0, 15, 16, 0, 0, 0, 0, 0, 0, 128, 255, 63, 0, 0, + 248, 255, 255, 224, 31, 0, 1, 0, 255, 7, 255, 31, 255, 1, 255, 3, + 255, 255, 223, 255, 255, 255, 255, 223, 100, 222, 255, 235, 239, 255, 255, 255, + 191, 231, 223, 223, 255, 255, 255, 123, 95, 252, 253, 255, 63, 255, 255, 255, + 253, 255, 255, 247, 255, 253, 255, 255, 150, 254, 247, 10, 132, 234, 150, 170, + 150, 247, 247, 94, 255, 251, 255, 15, 238, 251, 255, 15, +}; + +/* XID_Start: 2005 bytes. */ + +RE_UINT32 re_get_xid_start(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 16; + code = ch ^ (f << 16); + pos = (RE_UINT32)re_xid_start_stage_1[f] << 5; + f = code >> 11; + code ^= f << 11; + pos = (RE_UINT32)re_xid_start_stage_2[pos + f] << 3; + f = code >> 8; + code ^= f << 8; + pos = (RE_UINT32)re_xid_start_stage_3[pos + f] << 3; + f = code >> 5; + code ^= f << 5; + pos = (RE_UINT32)re_xid_start_stage_4[pos + f] << 5; + pos += code; + value = (re_xid_start_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* XID_Continue. 
*/ + +static RE_UINT8 re_xid_continue_stage_1[] = { + 0, 1, 2, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 6, 6, 6, + 6, 6, +}; + +static RE_UINT8 re_xid_continue_stage_2[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 9, 10, 11, 7, 7, 7, 7, 12, 13, 13, 13, 13, 14, + 15, 16, 17, 18, 19, 13, 20, 13, 21, 13, 13, 13, 13, 22, 13, 13, + 13, 13, 13, 13, 13, 13, 23, 24, 13, 13, 25, 26, 13, 27, 13, 13, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 28, 7, 29, 30, 7, 31, 13, 13, 13, 13, 13, 32, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 33, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, +}; + +static RE_UINT8 re_xid_continue_stage_3[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 1, 17, 18, 19, 1, 20, 21, 22, 23, 24, 25, 26, 27, 1, 28, + 29, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 31, 31, + 34, 35, 31, 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 36, 1, 1, 1, 1, 1, 1, 1, 1, 1, 37, + 1, 1, 1, 1, 38, 1, 39, 40, 41, 42, 43, 44, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 45, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 1, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 1, 58, + 59, 60, 61, 62, 63, 31, 31, 31, 64, 65, 66, 67, 68, 69, 70, 71, + 72, 31, 73, 31, 31, 31, 31, 31, 1, 1, 1, 74, 75, 76, 31, 31, + 1, 1, 1, 1, 77, 31, 31, 31, 31, 31, 31, 31, 1, 1, 78, 31, + 1, 1, 79, 80, 31, 31, 31, 81, 82, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 83, 31, 31, 31, 31, 84, 85, 31, 86, 87, 88, 89, + 31, 31, 90, 31, 31, 31, 31, 31, 91, 31, 31, 31, 31, 31, 92, 31, + 1, 1, 1, 1, 1, 1, 93, 1, 1, 1, 1, 1, 1, 1, 1, 94, + 95, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 96, 31, + 1, 1, 97, 31, 31, 31, 31, 31, 31, 98, 31, 31, 31, 31, 31, 31, +}; + +static RE_UINT8 re_xid_continue_stage_4[] = { + 0, 1, 2, 3, 0, 4, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 7, 8, 6, 6, 6, 9, 10, 11, 6, 12, + 6, 6, 6, 6, 13, 6, 6, 6, 6, 14, 15, 16, 17, 18, 19, 20, + 21, 6, 6, 
22, 6, 6, 23, 24, 25, 6, 26, 6, 6, 27, 6, 28, + 6, 29, 30, 0, 0, 31, 0, 32, 6, 6, 6, 33, 34, 35, 36, 37, + 38, 39, 40, 41, 42, 43, 44, 45, 46, 43, 47, 48, 49, 50, 51, 52, + 53, 54, 55, 56, 57, 58, 59, 60, 57, 61, 62, 63, 64, 65, 66, 67, + 16, 68, 69, 0, 70, 71, 72, 0, 73, 74, 75, 76, 77, 78, 79, 0, + 6, 6, 80, 6, 81, 6, 82, 83, 6, 6, 84, 6, 85, 86, 87, 6, + 88, 6, 61, 89, 90, 6, 6, 91, 16, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 92, 3, 6, 6, 93, 94, 31, 95, 96, 6, 6, 97, 98, + 99, 6, 6, 100, 6, 101, 6, 102, 103, 104, 105, 106, 6, 107, 108, 0, + 30, 6, 103, 109, 110, 111, 0, 0, 6, 6, 112, 113, 6, 6, 6, 95, + 6, 100, 114, 81, 0, 0, 115, 116, 6, 6, 6, 6, 6, 6, 6, 117, + 91, 6, 118, 81, 6, 119, 120, 121, 0, 122, 123, 124, 125, 0, 125, 126, + 127, 128, 129, 6, 130, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 6, 131, 103, 6, 6, 6, 6, 132, 6, 82, 6, 133, 134, 135, 135, 6, + 136, 137, 16, 6, 138, 16, 6, 83, 139, 140, 6, 6, 141, 68, 0, 25, + 6, 6, 6, 6, 6, 102, 0, 0, 6, 6, 6, 6, 6, 6, 102, 0, + 6, 6, 6, 6, 142, 0, 25, 81, 143, 144, 6, 145, 6, 6, 6, 27, + 146, 147, 6, 6, 148, 149, 0, 146, 6, 150, 6, 95, 6, 6, 151, 152, + 6, 153, 95, 78, 6, 6, 154, 103, 6, 134, 155, 156, 6, 6, 157, 158, + 159, 160, 83, 161, 6, 6, 6, 162, 6, 6, 6, 6, 6, 163, 164, 30, + 6, 6, 6, 153, 6, 6, 165, 0, 166, 167, 168, 6, 6, 27, 169, 6, + 6, 6, 81, 170, 6, 6, 6, 6, 6, 81, 25, 6, 171, 6, 150, 1, + 90, 172, 173, 174, 6, 6, 6, 78, 1, 2, 3, 105, 6, 103, 175, 0, + 176, 177, 178, 0, 6, 6, 6, 68, 0, 0, 6, 31, 0, 0, 0, 179, + 0, 0, 0, 0, 78, 6, 180, 181, 6, 25, 101, 68, 81, 6, 182, 0, + 6, 6, 6, 6, 81, 98, 0, 0, 6, 183, 6, 184, 0, 0, 0, 0, + 6, 134, 102, 150, 0, 0, 0, 0, 185, 186, 102, 134, 103, 0, 0, 187, + 102, 165, 0, 0, 6, 188, 0, 0, 189, 190, 0, 78, 78, 0, 75, 191, + 6, 102, 102, 192, 27, 0, 0, 0, 6, 6, 130, 0, 6, 192, 6, 192, + 6, 6, 191, 193, 6, 68, 25, 194, 6, 195, 25, 196, 6, 6, 197, 0, + 198, 100, 0, 0, 199, 200, 6, 201, 34, 43, 202, 203, 0, 0, 0, 0, + 0, 0, 0, 0, 6, 6, 204, 0, 0, 0, 0, 0, 6, 205, 206, 0, + 6, 6, 
207, 0, 6, 100, 98, 0, 208, 112, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 6, 6, 209, 0, 0, 0, 0, 0, 0, 6, 210, + 6, 6, 6, 6, 165, 0, 0, 0, 6, 6, 6, 141, 6, 6, 6, 6, + 6, 6, 184, 0, 0, 0, 0, 0, 6, 141, 0, 0, 0, 0, 0, 0, + 6, 6, 191, 0, 0, 0, 0, 0, 6, 210, 103, 98, 0, 0, 25, 106, + 6, 134, 211, 212, 90, 0, 0, 0, 6, 6, 213, 103, 214, 0, 0, 0, + 215, 0, 0, 0, 0, 0, 0, 0, 6, 6, 6, 216, 217, 0, 0, 0, + 0, 0, 0, 218, 219, 220, 0, 0, 0, 0, 221, 0, 0, 0, 0, 0, + 6, 6, 195, 6, 222, 223, 224, 6, 225, 226, 227, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 228, 229, 83, 195, 195, 131, 131, 230, 230, 231, 6, + 6, 232, 6, 233, 234, 235, 0, 0, 6, 6, 6, 6, 6, 6, 236, 0, + 224, 237, 238, 239, 240, 241, 0, 0, 6, 6, 6, 6, 6, 6, 134, 0, + 6, 31, 6, 6, 6, 6, 6, 6, 81, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 215, 0, 0, 81, 0, 0, 0, 0, 0, 0, 0, + 6, 6, 6, 6, 6, 6, 6, 90, +}; + +static RE_UINT8 re_xid_continue_stage_5[] = { + 0, 0, 0, 0, 0, 0, 255, 3, 254, 255, 255, 135, 254, 255, 255, 7, + 0, 4, 160, 4, 255, 255, 127, 255, 255, 255, 255, 255, 195, 255, 3, 0, + 31, 80, 0, 0, 255, 255, 223, 184, 192, 215, 255, 255, 251, 255, 255, 255, + 255, 255, 191, 255, 251, 252, 255, 255, 255, 255, 254, 255, 255, 255, 127, 2, + 254, 255, 255, 255, 255, 0, 254, 255, 255, 255, 255, 191, 182, 0, 255, 255, + 255, 7, 7, 0, 0, 0, 255, 7, 255, 195, 255, 255, 255, 255, 239, 159, + 255, 253, 255, 159, 0, 0, 255, 255, 255, 231, 255, 255, 255, 255, 3, 0, + 255, 255, 63, 4, 255, 63, 0, 0, 255, 255, 255, 15, 255, 255, 31, 0, + 248, 255, 255, 255, 207, 255, 254, 255, 239, 159, 249, 255, 255, 253, 197, 243, + 159, 121, 128, 176, 207, 255, 3, 0, 238, 135, 249, 255, 255, 253, 109, 211, + 135, 57, 2, 94, 192, 255, 63, 0, 238, 191, 251, 255, 255, 253, 237, 243, + 191, 59, 1, 0, 207, 255, 0, 2, 238, 159, 249, 255, 159, 57, 192, 176, + 207, 255, 2, 0, 236, 199, 61, 214, 24, 199, 255, 195, 199, 61, 129, 0, + 192, 255, 0, 0, 239, 223, 253, 255, 255, 253, 255, 227, 223, 61, 96, 7, + 207, 255, 0, 0, 238, 223, 253, 255, 255, 253, 239, 243, 223, 61, 
96, 64, + 207, 255, 6, 0, 255, 255, 255, 231, 223, 125, 128, 128, 207, 255, 0, 252, + 236, 255, 127, 252, 255, 255, 251, 47, 127, 132, 95, 255, 192, 255, 12, 0, + 255, 255, 255, 7, 255, 127, 255, 3, 150, 37, 240, 254, 174, 236, 255, 59, + 95, 63, 255, 243, 1, 0, 0, 3, 255, 3, 160, 194, 255, 254, 255, 255, + 255, 31, 254, 255, 223, 255, 255, 254, 255, 255, 255, 31, 64, 0, 0, 0, + 255, 3, 255, 255, 255, 255, 255, 63, 191, 32, 255, 255, 255, 255, 255, 247, + 255, 61, 127, 61, 255, 61, 255, 255, 255, 255, 61, 127, 61, 255, 127, 255, + 255, 255, 61, 255, 0, 254, 3, 0, 255, 255, 0, 0, 255, 255, 63, 63, + 255, 159, 255, 255, 255, 199, 255, 1, 255, 223, 31, 0, 255, 255, 15, 0, + 255, 223, 13, 0, 255, 255, 143, 48, 255, 3, 0, 0, 0, 56, 255, 3, + 255, 255, 255, 0, 255, 7, 255, 255, 255, 255, 63, 0, 255, 255, 255, 127, + 255, 15, 255, 15, 192, 255, 255, 255, 255, 63, 31, 0, 255, 15, 255, 255, + 255, 3, 255, 7, 255, 255, 255, 159, 255, 3, 255, 3, 128, 0, 255, 63, + 255, 15, 255, 3, 0, 248, 15, 0, 255, 227, 255, 255, 0, 0, 247, 255, + 255, 255, 127, 3, 255, 255, 63, 240, 63, 63, 255, 170, 255, 255, 223, 95, + 220, 31, 207, 15, 255, 31, 220, 31, 0, 0, 0, 128, 1, 0, 16, 0, + 0, 0, 2, 128, 0, 0, 255, 31, 226, 255, 1, 0, 132, 252, 47, 63, + 80, 253, 255, 243, 224, 67, 0, 0, 255, 1, 0, 0, 255, 127, 255, 255, + 31, 248, 15, 0, 255, 128, 0, 128, 255, 255, 127, 0, 127, 127, 127, 127, + 224, 0, 0, 0, 254, 255, 62, 31, 255, 255, 127, 230, 224, 255, 255, 255, + 255, 63, 254, 255, 255, 127, 0, 0, 255, 31, 0, 0, 255, 31, 255, 255, + 255, 15, 0, 0, 255, 255, 240, 191, 0, 0, 128, 255, 252, 255, 255, 255, + 255, 249, 255, 255, 255, 63, 255, 0, 255, 0, 0, 0, 31, 0, 255, 3, + 255, 255, 255, 40, 255, 63, 255, 255, 1, 128, 255, 3, 255, 63, 255, 3, + 255, 255, 127, 252, 7, 0, 0, 56, 255, 255, 124, 0, 126, 126, 126, 0, + 127, 127, 255, 255, 63, 0, 255, 255, 255, 55, 255, 3, 15, 0, 255, 255, + 127, 248, 255, 255, 255, 255, 255, 3, 127, 0, 248, 224, 255, 253, 127, 95, + 219, 255, 255, 255, 0, 0, 248, 
255, 240, 255, 255, 255, 255, 255, 252, 255, + 255, 255, 24, 0, 0, 224, 0, 0, 0, 0, 138, 170, 252, 252, 252, 28, + 255, 239, 255, 255, 127, 255, 255, 183, 255, 63, 255, 63, 0, 0, 0, 32, + 255, 255, 1, 0, 1, 0, 0, 0, 15, 255, 62, 0, 255, 0, 255, 255, + 15, 0, 0, 0, 63, 253, 255, 255, 255, 255, 191, 145, 255, 255, 55, 0, + 255, 255, 255, 192, 111, 240, 239, 254, 255, 255, 15, 135, 127, 0, 0, 0, + 255, 255, 7, 0, 192, 255, 0, 128, 255, 1, 255, 3, 255, 255, 223, 255, + 255, 255, 79, 0, 31, 28, 255, 23, 255, 255, 251, 255, 127, 189, 255, 191, + 255, 1, 255, 255, 255, 7, 255, 3, 159, 57, 129, 224, 207, 31, 31, 0, + 191, 0, 255, 3, 255, 255, 63, 255, 1, 0, 0, 63, 17, 0, 255, 3, + 255, 255, 255, 227, 255, 3, 0, 128, 255, 255, 255, 1, 15, 0, 255, 3, + 248, 255, 255, 224, 31, 0, 255, 255, 0, 128, 255, 255, 3, 0, 0, 0, + 255, 7, 255, 31, 255, 1, 255, 99, 224, 227, 7, 248, 231, 15, 0, 0, + 0, 60, 0, 0, 28, 0, 0, 0, 255, 255, 255, 223, 100, 222, 255, 235, + 239, 255, 255, 255, 191, 231, 223, 223, 255, 255, 255, 123, 95, 252, 253, 255, + 63, 255, 255, 255, 253, 255, 255, 247, 255, 253, 255, 255, 247, 207, 255, 255, + 255, 255, 127, 248, 255, 31, 32, 0, 16, 0, 0, 248, 254, 255, 0, 0, + 31, 0, 127, 0, 150, 254, 247, 10, 132, 234, 150, 170, 150, 247, 247, 94, + 255, 251, 255, 15, 238, 251, 255, 15, +}; + +/* XID_Continue: 2194 bytes. */ + +RE_UINT32 re_get_xid_continue(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 15; + code = ch ^ (f << 15); + pos = (RE_UINT32)re_xid_continue_stage_1[f] << 4; + f = code >> 11; + code ^= f << 11; + pos = (RE_UINT32)re_xid_continue_stage_2[pos + f] << 3; + f = code >> 8; + code ^= f << 8; + pos = (RE_UINT32)re_xid_continue_stage_3[pos + f] << 3; + f = code >> 5; + code ^= f << 5; + pos = (RE_UINT32)re_xid_continue_stage_4[pos + f] << 5; + pos += code; + value = (re_xid_continue_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Default_Ignorable_Code_Point. 
*/ + +static RE_UINT8 re_default_ignorable_code_point_stage_1[] = { + 0, 1, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, + 2, 2, +}; + +static RE_UINT8 re_default_ignorable_code_point_stage_2[] = { + 0, 1, 2, 3, 4, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 7, 1, 1, 8, 1, 1, 1, 1, 1, + 9, 9, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +}; + +static RE_UINT8 re_default_ignorable_code_point_stage_3[] = { + 0, 1, 1, 2, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 4, 1, 1, 1, 1, 1, 5, 6, 1, 1, 1, 1, 1, 1, 1, + 7, 1, 1, 1, 1, 1, 1, 1, 1, 8, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 9, 10, 1, 1, 1, 1, 11, 1, 1, 1, + 1, 12, 1, 1, 1, 1, 1, 1, 13, 13, 13, 13, 13, 13, 13, 13, +}; + +static RE_UINT8 re_default_ignorable_code_point_stage_4[] = { + 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 2, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 4, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, + 7, 0, 0, 0, 0, 0, 0, 0, 8, 9, 0, 10, 0, 0, 0, 0, + 0, 0, 0, 11, 0, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 4, + 0, 0, 0, 0, 0, 5, 0, 12, 0, 0, 0, 0, 0, 13, 0, 0, + 0, 0, 0, 14, 0, 0, 0, 0, 15, 15, 15, 15, 15, 15, 15, 15, +}; + +static RE_UINT8 re_default_ignorable_code_point_stage_5[] = { + 0, 0, 0, 0, 0, 32, 0, 0, 0, 128, 0, 0, 0, 0, 0, 16, + 0, 0, 0, 128, 1, 0, 0, 0, 0, 0, 48, 0, 0, 120, 0, 0, + 0, 248, 0, 0, 0, 124, 0, 0, 255, 255, 0, 0, 16, 0, 0, 0, + 0, 0, 255, 1, 15, 0, 0, 0, 0, 0, 248, 7, 255, 255, 255, 255, +}; + +/* Default_Ignorable_Code_Point: 370 bytes. 
*/ + +RE_UINT32 re_get_default_ignorable_code_point(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 15; + code = ch ^ (f << 15); + pos = (RE_UINT32)re_default_ignorable_code_point_stage_1[f] << 4; + f = code >> 11; + code ^= f << 11; + pos = (RE_UINT32)re_default_ignorable_code_point_stage_2[pos + f] << 3; + f = code >> 8; + code ^= f << 8; + pos = (RE_UINT32)re_default_ignorable_code_point_stage_3[pos + f] << 3; + f = code >> 5; + code ^= f << 5; + pos = (RE_UINT32)re_default_ignorable_code_point_stage_4[pos + f] << 5; + pos += code; + value = (re_default_ignorable_code_point_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Grapheme_Extend. */ + +static RE_UINT8 re_grapheme_extend_stage_1[] = { + 0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, + 4, 4, +}; + +static RE_UINT8 re_grapheme_extend_stage_2[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 8, 9, 7, 7, 7, 7, 7, 7, 7, 7, 7, 10, + 11, 12, 13, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 14, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 15, 7, 7, 16, 17, 7, 18, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 19, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, +}; + +static RE_UINT8 re_grapheme_extend_stage_3[] = { + 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 0, 0, 15, 0, 0, 0, 16, 17, 18, 19, 20, 21, 22, 0, 0, + 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 24, 25, 0, 0, + 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 27, 0, 28, 29, 30, 31, 0, 0, 0, 0, + 0, 0, 0, 32, 0, 0, 33, 34, 0, 35, 36, 37, 0, 0, 0, 0, + 0, 0, 38, 0, 0, 0, 0, 0, 39, 40, 41, 42, 43, 44, 45, 46, + 0, 0, 47, 48, 0, 0, 0, 49, 0, 0, 0, 0, 50, 0, 0, 0, + 0, 51, 52, 0, 0, 0, 0, 0, 0, 0, 53, 0, 0, 0, 0, 0, + 54, 0, 0, 0, 0, 0, 0, 0, 0, 55, 0, 0, 0, 0, 0, 0, +}; + +static RE_UINT8 re_grapheme_extend_stage_4[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 0, 0, 0, 0, + 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 
5, 6, 0, + 7, 0, 8, 9, 0, 0, 10, 11, 12, 13, 14, 0, 0, 15, 0, 16, + 17, 18, 19, 0, 0, 0, 0, 20, 21, 22, 23, 24, 25, 26, 27, 24, + 28, 29, 30, 31, 28, 29, 32, 24, 25, 33, 34, 24, 35, 36, 37, 0, + 38, 39, 40, 24, 25, 41, 42, 24, 25, 36, 27, 24, 0, 0, 43, 0, + 0, 44, 45, 0, 0, 46, 47, 0, 48, 49, 0, 50, 51, 52, 53, 0, + 0, 54, 55, 56, 57, 0, 0, 0, 0, 0, 58, 0, 0, 0, 0, 0, + 59, 59, 60, 60, 0, 61, 62, 0, 63, 0, 0, 0, 0, 64, 0, 0, + 0, 65, 0, 0, 0, 0, 0, 0, 66, 0, 67, 68, 0, 69, 0, 0, + 70, 71, 35, 16, 72, 73, 0, 74, 0, 75, 0, 0, 0, 0, 76, 77, + 0, 0, 0, 0, 0, 0, 1, 78, 79, 0, 0, 0, 0, 0, 13, 80, + 0, 0, 0, 0, 0, 0, 0, 81, 0, 0, 0, 82, 0, 0, 0, 1, + 0, 83, 0, 0, 84, 0, 0, 0, 0, 0, 0, 85, 39, 0, 0, 86, + 87, 88, 0, 0, 0, 0, 89, 90, 0, 91, 92, 0, 21, 93, 0, 94, + 0, 95, 96, 29, 0, 97, 25, 98, 0, 0, 0, 0, 0, 0, 0, 99, + 36, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 39, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 100, + 0, 0, 0, 0, 0, 0, 0, 38, 0, 0, 0, 101, 0, 0, 0, 0, + 102, 103, 0, 0, 0, 0, 0, 88, 25, 104, 105, 82, 72, 106, 0, 0, + 21, 107, 0, 108, 72, 109, 110, 0, 0, 111, 0, 0, 0, 0, 82, 112, + 72, 26, 113, 114, 0, 0, 0, 0, 0, 0, 0, 0, 0, 115, 116, 0, + 0, 0, 0, 0, 0, 117, 118, 0, 0, 119, 38, 0, 0, 120, 0, 0, + 58, 121, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 122, + 0, 123, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 124, 0, 0, 0, + 0, 0, 0, 0, 125, 0, 0, 0, 0, 0, 0, 126, 127, 128, 0, 0, + 0, 0, 129, 0, 0, 0, 0, 0, 1, 130, 1, 131, 132, 133, 0, 0, + 0, 0, 0, 0, 0, 0, 123, 0, 1, 1, 1, 1, 1, 1, 1, 2, +}; + +static RE_UINT8 re_grapheme_extend_stage_5[] = { + 0, 0, 0, 0, 255, 255, 255, 255, 255, 255, 0, 0, 248, 3, 0, 0, + 0, 0, 254, 255, 255, 255, 255, 191, 182, 0, 0, 0, 0, 0, 255, 7, + 0, 248, 255, 255, 0, 0, 1, 0, 0, 0, 192, 159, 159, 61, 0, 0, + 0, 0, 2, 0, 0, 0, 255, 255, 255, 7, 0, 0, 192, 255, 1, 0, + 0, 248, 15, 0, 0, 0, 192, 251, 239, 62, 0, 0, 0, 0, 0, 14, + 248, 255, 255, 255, 7, 0, 0, 0, 0, 0, 0, 20, 254, 33, 254, 0, + 12, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 80, 30, 32, 128, 0, + 6, 
0, 0, 0, 0, 0, 0, 16, 134, 57, 2, 0, 0, 0, 35, 0, + 190, 33, 0, 0, 0, 0, 0, 208, 30, 32, 192, 0, 4, 0, 0, 0, + 0, 0, 0, 64, 1, 32, 128, 0, 1, 0, 0, 0, 0, 0, 0, 192, + 193, 61, 96, 0, 0, 0, 0, 144, 68, 48, 96, 0, 0, 132, 92, 128, + 0, 0, 242, 7, 128, 127, 0, 0, 0, 0, 242, 27, 0, 63, 0, 0, + 0, 0, 0, 3, 0, 0, 160, 2, 0, 0, 254, 127, 223, 224, 255, 254, + 255, 255, 255, 31, 64, 0, 0, 0, 0, 224, 253, 102, 0, 0, 0, 195, + 1, 0, 30, 0, 100, 32, 0, 32, 0, 0, 0, 224, 0, 0, 28, 0, + 0, 0, 12, 0, 0, 0, 176, 63, 64, 254, 15, 32, 0, 56, 0, 0, + 0, 2, 0, 0, 135, 1, 4, 14, 0, 0, 128, 9, 0, 0, 64, 127, + 229, 31, 248, 159, 0, 0, 255, 127, 15, 0, 0, 0, 0, 0, 208, 23, + 3, 0, 0, 0, 60, 59, 0, 0, 64, 163, 3, 0, 0, 240, 207, 0, + 0, 0, 247, 255, 253, 33, 16, 3, 255, 255, 63, 240, 0, 48, 0, 0, + 255, 255, 1, 0, 0, 128, 3, 0, 0, 0, 0, 128, 0, 252, 0, 0, + 0, 0, 0, 6, 0, 128, 247, 63, 0, 0, 3, 0, 68, 8, 0, 0, + 96, 0, 0, 0, 16, 0, 0, 0, 255, 255, 3, 0, 192, 63, 0, 0, + 128, 255, 3, 0, 0, 0, 200, 19, 32, 0, 0, 0, 0, 126, 102, 0, + 8, 16, 0, 0, 0, 0, 157, 193, 0, 48, 64, 0, 32, 33, 0, 0, + 0, 0, 0, 32, 0, 0, 192, 7, 110, 240, 0, 0, 0, 0, 0, 135, + 0, 0, 0, 255, 127, 0, 0, 0, 0, 0, 120, 6, 128, 239, 31, 0, + 0, 0, 8, 0, 0, 0, 192, 127, 0, 28, 0, 0, 0, 128, 211, 0, + 248, 7, 0, 0, 1, 0, 128, 0, 192, 31, 31, 0, 0, 0, 249, 165, + 13, 0, 0, 0, 0, 128, 60, 176, 1, 0, 0, 48, 0, 0, 248, 167, + 0, 40, 191, 0, 188, 15, 0, 0, 0, 0, 31, 0, 0, 0, 127, 0, + 0, 128, 7, 0, 0, 0, 0, 96, 160, 195, 7, 248, 231, 15, 0, 0, + 0, 60, 0, 0, 28, 0, 0, 0, 255, 255, 127, 248, 255, 31, 32, 0, + 16, 0, 0, 248, 254, 255, 0, 0, +}; + +/* Grapheme_Extend: 1274 bytes. 
*/ + +RE_UINT32 re_get_grapheme_extend(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 15; + code = ch ^ (f << 15); + pos = (RE_UINT32)re_grapheme_extend_stage_1[f] << 4; + f = code >> 11; + code ^= f << 11; + pos = (RE_UINT32)re_grapheme_extend_stage_2[pos + f] << 3; + f = code >> 8; + code ^= f << 8; + pos = (RE_UINT32)re_grapheme_extend_stage_3[pos + f] << 3; + f = code >> 5; + code ^= f << 5; + pos = (RE_UINT32)re_grapheme_extend_stage_4[pos + f] << 5; + pos += code; + value = (re_grapheme_extend_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Grapheme_Base. */ + +static RE_UINT8 re_grapheme_base_stage_1[] = { + 0, 1, 2, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, +}; + +static RE_UINT8 re_grapheme_base_stage_2[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13, + 13, 13, 13, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 15, 13, 16, 17, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 19, 29, 30, 19, 19, 13, 31, 19, 19, + 19, 32, 19, 19, 19, 19, 19, 19, 19, 19, 33, 34, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 35, 19, 19, 36, + 19, 19, 19, 19, 37, 38, 39, 19, 19, 19, 40, 41, 42, 43, 44, 19, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 45, 13, 13, 13, 46, 47, 13, + 13, 13, 13, 48, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 49, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, +}; + +static RE_UINT8 re_grapheme_base_stage_3[] = { + 0, 1, 2, 2, 2, 2, 3, 4, 2, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, + 28, 29, 2, 2, 30, 31, 32, 33, 2, 2, 2, 2, 2, 34, 35, 36, 
+ 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 2, 47, 2, 2, 48, 49, + 50, 51, 2, 52, 2, 2, 2, 53, 54, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 55, 56, 57, 58, 59, 60, 61, 62, 2, 63, + 64, 65, 66, 67, 68, 69, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 70, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 71, + 2, 72, 2, 2, 73, 74, 2, 75, 76, 77, 78, 79, 80, 81, 82, 83, + 2, 2, 2, 2, 2, 2, 2, 84, 85, 85, 85, 85, 85, 85, 85, 85, + 85, 85, 2, 2, 86, 87, 88, 89, 2, 2, 90, 91, 92, 93, 94, 95, + 96, 53, 97, 98, 85, 99, 100, 101, 2, 102, 103, 85, 2, 2, 104, 85, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 85, 85, 115, 85, 85, 85, + 116, 117, 118, 119, 120, 121, 122, 85, 85, 123, 85, 124, 125, 126, 127, 85, + 85, 128, 85, 85, 85, 129, 85, 85, 2, 2, 2, 2, 2, 2, 2, 130, + 131, 2, 132, 85, 85, 85, 85, 85, 133, 85, 85, 85, 85, 85, 85, 85, + 2, 2, 2, 2, 134, 85, 85, 85, 2, 2, 2, 2, 135, 136, 137, 138, + 85, 85, 85, 85, 85, 85, 139, 140, 141, 85, 85, 85, 85, 85, 85, 85, + 142, 143, 85, 85, 85, 85, 85, 85, 2, 144, 145, 146, 147, 85, 148, 85, + 149, 150, 151, 2, 2, 152, 2, 153, 2, 2, 2, 2, 154, 155, 85, 85, + 2, 156, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 157, 158, 85, 85, + 159, 160, 161, 162, 163, 85, 2, 2, 2, 2, 164, 165, 2, 166, 167, 168, + 169, 170, 171, 172, 85, 85, 85, 85, 2, 2, 2, 2, 2, 173, 2, 2, + 2, 2, 2, 2, 2, 2, 174, 2, 175, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 176, 85, 85, 2, 2, 2, 2, 177, 85, 85, 85, +}; + +static RE_UINT8 re_grapheme_base_stage_4[] = { + 0, 0, 1, 1, 1, 1, 1, 2, 0, 0, 3, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 4, + 5, 1, 6, 1, 1, 1, 1, 1, 7, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 8, 1, 9, 8, 1, 10, 0, 0, 11, 12, 1, 13, 14, + 15, 16, 1, 1, 13, 0, 1, 8, 1, 1, 1, 1, 1, 17, 18, 1, + 19, 20, 1, 0, 21, 1, 1, 1, 1, 1, 22, 23, 1, 1, 13, 24, + 1, 25, 26, 2, 1, 27, 0, 0, 0, 0, 1, 14, 0, 0, 0, 0, + 28, 1, 1, 29, 30, 31, 32, 1, 33, 34, 35, 36, 37, 38, 39, 40, + 41, 34, 35, 42, 43, 44, 15, 45, 46, 6, 35, 47, 48, 43, 39, 49, + 50, 34, 35, 51, 52, 38, 39, 53, 54, 
55, 56, 57, 58, 43, 15, 13, + 59, 20, 35, 60, 61, 62, 39, 63, 64, 20, 35, 65, 66, 11, 39, 67, + 64, 20, 1, 68, 69, 70, 39, 71, 72, 73, 1, 74, 75, 76, 15, 45, + 8, 1, 1, 77, 78, 40, 0, 0, 79, 80, 81, 82, 83, 84, 0, 0, + 1, 4, 1, 85, 86, 1, 87, 70, 88, 0, 0, 89, 90, 13, 0, 0, + 1, 1, 87, 91, 1, 92, 8, 93, 94, 3, 1, 1, 95, 1, 1, 1, + 1, 1, 1, 1, 96, 97, 1, 1, 96, 1, 1, 98, 99, 100, 1, 1, + 1, 99, 1, 1, 1, 13, 1, 87, 1, 101, 1, 1, 1, 1, 1, 102, + 1, 87, 1, 1, 1, 1, 1, 103, 3, 104, 1, 105, 1, 104, 3, 43, + 1, 1, 1, 106, 107, 108, 101, 101, 13, 101, 1, 1, 1, 1, 1, 53, + 1, 1, 109, 1, 1, 1, 1, 22, 1, 2, 110, 111, 112, 1, 19, 14, + 1, 1, 40, 1, 101, 113, 1, 1, 1, 114, 1, 1, 1, 115, 116, 117, + 101, 101, 19, 0, 0, 0, 0, 0, 118, 1, 1, 119, 120, 1, 13, 108, + 121, 1, 122, 1, 1, 1, 123, 124, 1, 1, 40, 125, 126, 1, 1, 1, + 0, 0, 0, 0, 53, 127, 128, 129, 1, 1, 1, 1, 0, 0, 0, 0, + 1, 102, 1, 1, 102, 130, 1, 19, 1, 1, 1, 131, 131, 132, 1, 133, + 13, 1, 134, 1, 1, 1, 0, 32, 2, 87, 1, 2, 0, 0, 0, 0, + 40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 13, + 1, 1, 75, 0, 13, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 135, + 1, 136, 1, 126, 35, 104, 137, 0, 1, 1, 2, 1, 1, 2, 1, 1, + 1, 1, 1, 1, 1, 1, 2, 138, 1, 1, 95, 1, 1, 1, 134, 43, + 1, 75, 139, 139, 139, 139, 0, 0, 1, 1, 1, 1, 117, 0, 0, 0, + 1, 140, 1, 1, 1, 1, 1, 141, 1, 1, 1, 1, 1, 22, 0, 40, + 1, 1, 101, 1, 8, 1, 1, 1, 1, 142, 1, 1, 1, 1, 1, 1, + 143, 1, 19, 8, 1, 1, 1, 1, 2, 1, 1, 13, 1, 1, 141, 1, + 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, + 1, 1, 1, 22, 1, 1, 1, 1, 1, 1, 1, 1, 1, 22, 0, 0, + 87, 1, 1, 1, 75, 1, 1, 1, 1, 1, 40, 0, 1, 1, 2, 144, + 1, 19, 1, 1, 1, 1, 1, 145, 1, 1, 19, 53, 0, 0, 0, 146, + 147, 1, 148, 101, 1, 1, 1, 53, 1, 1, 1, 1, 149, 101, 0, 150, + 1, 1, 151, 1, 75, 152, 1, 87, 28, 1, 1, 153, 154, 155, 131, 2, + 1, 1, 156, 157, 158, 84, 1, 159, 1, 1, 1, 160, 161, 162, 163, 22, + 164, 165, 139, 1, 1, 1, 22, 1, 1, 1, 1, 1, 1, 1, 166, 101, + 1, 1, 141, 1, 142, 1, 1, 40, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 19, 1, 
1, 1, 1, 1, 1, 101, 0, 0, + 75, 167, 1, 168, 169, 1, 1, 1, 1, 1, 1, 1, 104, 28, 1, 1, + 1, 1, 1, 1, 0, 1, 1, 1, 1, 121, 1, 1, 53, 0, 0, 19, + 0, 101, 0, 1, 1, 170, 171, 131, 1, 1, 1, 1, 1, 1, 1, 87, + 8, 1, 1, 1, 1, 1, 1, 1, 1, 19, 1, 2, 172, 173, 139, 174, + 159, 1, 100, 175, 19, 19, 0, 0, 176, 1, 1, 177, 1, 1, 1, 1, + 87, 40, 43, 0, 0, 1, 1, 87, 1, 87, 1, 1, 1, 43, 8, 40, + 1, 1, 141, 1, 13, 1, 1, 22, 1, 154, 1, 1, 178, 22, 0, 0, + 1, 19, 101, 0, 0, 0, 0, 0, 1, 1, 53, 1, 1, 1, 179, 0, + 1, 1, 1, 75, 1, 22, 53, 0, 180, 1, 1, 181, 1, 182, 1, 1, + 1, 2, 146, 0, 0, 0, 1, 183, 1, 184, 1, 57, 0, 0, 0, 0, + 1, 1, 1, 185, 1, 121, 1, 1, 43, 186, 1, 141, 53, 103, 1, 1, + 1, 1, 0, 0, 1, 1, 187, 75, 1, 1, 1, 71, 1, 136, 1, 188, + 1, 189, 190, 0, 0, 0, 0, 0, 1, 1, 1, 1, 103, 0, 0, 0, + 1, 1, 1, 117, 1, 1, 1, 7, 0, 0, 0, 0, 0, 0, 1, 2, + 20, 1, 1, 53, 191, 121, 1, 0, 121, 1, 1, 192, 104, 1, 103, 101, + 28, 1, 193, 15, 141, 1, 1, 194, 121, 1, 1, 195, 60, 1, 8, 14, + 1, 6, 2, 196, 0, 0, 0, 0, 197, 154, 101, 1, 1, 2, 117, 101, + 50, 34, 35, 198, 199, 200, 141, 0, 1, 1, 1, 201, 202, 101, 0, 0, + 1, 1, 2, 203, 8, 40, 0, 0, 1, 1, 1, 204, 61, 101, 0, 0, + 1, 1, 205, 206, 101, 0, 0, 0, 1, 101, 207, 1, 0, 0, 0, 0, + 0, 0, 1, 1, 1, 1, 1, 208, 0, 0, 0, 0, 1, 1, 1, 103, + 1, 101, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 14, + 1, 1, 1, 1, 141, 0, 0, 0, 1, 1, 2, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 75, 0, 0, 0, 1, 1, 1, 103, 1, 2, 155, 0, + 0, 0, 0, 0, 0, 1, 19, 209, 1, 1, 1, 146, 22, 140, 6, 210, + 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 14, 1, 1, 2, + 0, 28, 0, 0, 0, 0, 0, 0, 104, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 13, 87, 103, 211, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 22, 1, 1, 9, 1, 1, 1, 212, 0, + 213, 1, 155, 1, 1, 1, 103, 0, 1, 1, 1, 1, 214, 0, 0, 0, + 1, 1, 1, 1, 1, 75, 1, 104, 1, 1, 1, 1, 1, 131, 1, 1, + 1, 3, 215, 29, 216, 1, 1, 1, 217, 218, 1, 219, 220, 20, 1, 1, + 1, 1, 136, 1, 1, 1, 1, 1, 1, 1, 1, 1, 163, 1, 1, 1, + 0, 0, 0, 221, 0, 0, 21, 131, 222, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 
1, 223, 0, 0, 0, 216, 1, 224, 225, 226, 227, 228, 229, + 140, 40, 230, 40, 0, 0, 0, 104, 1, 1, 40, 1, 1, 1, 1, 1, + 1, 141, 2, 8, 8, 8, 1, 22, 87, 1, 2, 1, 1, 1, 40, 1, + 1, 13, 0, 0, 0, 0, 15, 1, 117, 1, 1, 13, 103, 104, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 140, 1, 1, 216, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 43, 87, 141, 1, 1, 1, 1, 1, 1, 1, 141, + 1, 1, 1, 1, 1, 14, 0, 0, 40, 1, 1, 1, 53, 101, 1, 1, + 53, 1, 19, 0, 0, 0, 0, 0, 0, 103, 0, 0, 0, 0, 0, 0, + 14, 0, 0, 0, 43, 0, 0, 0, 1, 1, 1, 1, 1, 75, 0, 0, + 1, 1, 1, 14, 1, 1, 1, 1, 1, 19, 1, 1, 1, 1, 1, 1, + 1, 1, 104, 0, 0, 0, 0, 0, 1, 19, 0, 0, 0, 0, 0, 0, +}; + +static RE_UINT8 re_grapheme_base_stage_5[] = { + 0, 0, 255, 255, 255, 127, 255, 223, 255, 252, 240, 215, 251, 255, 7, 252, + 254, 255, 127, 254, 255, 230, 0, 64, 73, 0, 255, 7, 31, 0, 192, 255, + 0, 200, 63, 64, 96, 194, 255, 63, 253, 255, 0, 224, 63, 0, 2, 0, + 240, 7, 63, 4, 16, 1, 255, 65, 248, 255, 255, 235, 1, 222, 1, 255, + 243, 255, 237, 159, 249, 255, 255, 253, 197, 163, 129, 89, 0, 176, 195, 255, + 255, 15, 232, 135, 109, 195, 1, 0, 0, 94, 28, 0, 232, 191, 237, 227, + 1, 26, 3, 2, 236, 159, 237, 35, 129, 25, 255, 0, 232, 199, 61, 214, + 24, 199, 255, 131, 198, 29, 238, 223, 255, 35, 30, 0, 0, 7, 0, 255, + 236, 223, 239, 99, 155, 13, 6, 0, 255, 167, 193, 93, 0, 128, 63, 254, + 236, 255, 127, 252, 251, 47, 127, 0, 3, 127, 13, 128, 127, 128, 150, 37, + 240, 254, 174, 236, 13, 32, 95, 0, 255, 243, 95, 253, 255, 254, 255, 31, + 32, 31, 0, 192, 191, 223, 2, 153, 255, 60, 225, 255, 155, 223, 191, 32, + 255, 61, 127, 61, 61, 127, 61, 255, 127, 255, 255, 3, 63, 63, 255, 1, + 3, 0, 99, 0, 79, 192, 191, 1, 240, 31, 255, 5, 120, 14, 251, 1, + 241, 255, 255, 199, 127, 198, 191, 0, 26, 224, 7, 0, 240, 255, 47, 232, + 251, 15, 252, 255, 195, 196, 191, 92, 12, 240, 48, 248, 255, 227, 8, 0, + 2, 222, 111, 0, 255, 170, 223, 255, 207, 239, 220, 127, 255, 128, 207, 255, + 63, 255, 0, 240, 12, 254, 127, 127, 255, 251, 15, 0, 127, 248, 224, 255, + 8, 192, 252, 0, 128, 255, 187, 
247, 159, 15, 15, 192, 252, 63, 63, 192, + 12, 128, 55, 236, 255, 191, 255, 195, 255, 129, 25, 0, 247, 47, 255, 239, + 98, 62, 5, 0, 0, 248, 255, 207, 126, 126, 126, 0, 223, 30, 248, 160, + 127, 95, 219, 255, 247, 255, 127, 15, 252, 252, 252, 28, 0, 48, 255, 183, + 135, 255, 143, 255, 15, 255, 15, 128, 63, 253, 191, 145, 191, 255, 55, 248, + 255, 143, 255, 240, 239, 254, 31, 248, 7, 255, 3, 30, 0, 254, 128, 63, + 135, 217, 127, 16, 119, 0, 63, 128, 44, 63, 127, 189, 237, 163, 158, 57, + 1, 224, 6, 90, 242, 0, 3, 79, 7, 88, 255, 215, 64, 0, 67, 0, + 7, 128, 32, 0, 255, 224, 255, 147, 95, 60, 24, 240, 35, 0, 100, 222, + 239, 255, 191, 231, 223, 223, 255, 123, 95, 252, 128, 7, 239, 15, 159, 255, + 150, 254, 247, 10, 132, 234, 150, 170, 150, 247, 247, 94, 238, 251, +}; + +/* Grapheme_Base: 2544 bytes. */ + +RE_UINT32 re_get_grapheme_base(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 15; + code = ch ^ (f << 15); + pos = (RE_UINT32)re_grapheme_base_stage_1[f] << 5; + f = code >> 10; + code ^= f << 10; + pos = (RE_UINT32)re_grapheme_base_stage_2[pos + f] << 3; + f = code >> 7; + code ^= f << 7; + pos = (RE_UINT32)re_grapheme_base_stage_3[pos + f] << 3; + f = code >> 4; + code ^= f << 4; + pos = (RE_UINT32)re_grapheme_base_stage_4[pos + f] << 4; + pos += code; + value = (re_grapheme_base_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Grapheme_Link. 
*/ + +static RE_UINT8 re_grapheme_link_stage_1[] = { + 0, 1, 2, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, +}; + +static RE_UINT8 re_grapheme_link_stage_2[] = { + 0, 0, 1, 2, 3, 4, 5, 0, 0, 0, 0, 6, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, + 0, 0, 8, 0, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +static RE_UINT8 re_grapheme_link_stage_3[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 2, 3, 0, 0, 4, 5, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 6, 7, 0, 0, 0, 0, 8, 0, 9, 10, + 0, 0, 11, 0, 0, 0, 0, 0, 12, 9, 13, 14, 0, 15, 0, 16, + 0, 0, 0, 0, 17, 0, 0, 0, 18, 19, 20, 14, 21, 22, 1, 0, + 0, 23, 0, 17, 17, 24, 25, 0, +}; + +static RE_UINT8 re_grapheme_link_stage_4[] = { + 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 3, 0, 0, + 4, 0, 0, 0, 0, 5, 0, 0, 6, 6, 0, 0, 0, 0, 7, 0, + 0, 0, 0, 8, 0, 0, 4, 0, 0, 9, 0, 10, 0, 0, 0, 11, + 12, 0, 0, 0, 0, 0, 13, 0, 0, 0, 8, 0, 0, 0, 0, 14, + 0, 0, 0, 1, 0, 11, 0, 0, 0, 0, 12, 11, 0, 15, 0, 0, + 0, 16, 0, 0, 0, 17, 0, 0, 0, 0, 0, 2, 0, 0, 18, 0, + 0, 14, 0, 0, 0, 19, 0, 0, +}; + +static RE_UINT8 re_grapheme_link_stage_5[] = { + 0, 0, 0, 0, 0, 32, 0, 0, 0, 4, 0, 0, 0, 0, 0, 4, + 16, 0, 0, 0, 0, 0, 0, 6, 0, 0, 16, 0, 0, 0, 4, 0, + 1, 0, 0, 0, 0, 12, 0, 0, 0, 0, 12, 0, 0, 0, 0, 128, + 64, 0, 0, 0, 0, 0, 8, 0, 0, 0, 64, 0, 0, 0, 0, 2, + 0, 0, 24, 0, 0, 0, 32, 0, 4, 0, 0, 0, 0, 8, 0, 0, +}; + +/* Grapheme_Link: 404 bytes. 
*/ + +RE_UINT32 re_get_grapheme_link(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 14; + code = ch ^ (f << 14); + pos = (RE_UINT32)re_grapheme_link_stage_1[f] << 4; + f = code >> 10; + code ^= f << 10; + pos = (RE_UINT32)re_grapheme_link_stage_2[pos + f] << 3; + f = code >> 7; + code ^= f << 7; + pos = (RE_UINT32)re_grapheme_link_stage_3[pos + f] << 2; + f = code >> 5; + code ^= f << 5; + pos = (RE_UINT32)re_grapheme_link_stage_4[pos + f] << 5; + pos += code; + value = (re_grapheme_link_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* White_Space. */ + +static RE_UINT8 re_white_space_stage_1[] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, +}; + +static RE_UINT8 re_white_space_stage_2[] = { + 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +}; + +static RE_UINT8 re_white_space_stage_3[] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, + 3, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +}; + +static RE_UINT8 re_white_space_stage_4[] = { + 0, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 3, 1, 1, 1, 1, 1, 4, 5, 1, 1, 1, 1, 1, 1, + 3, 1, 1, 1, 1, 1, 1, 1, +}; + +static RE_UINT8 re_white_space_stage_5[] = { + 0, 62, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 32, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, + 255, 7, 0, 0, 0, 131, 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, +}; + +/* White_Space: 169 bytes. 
*/ + +RE_UINT32 re_get_white_space(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 16; + code = ch ^ (f << 16); + pos = (RE_UINT32)re_white_space_stage_1[f] << 3; + f = code >> 13; + code ^= f << 13; + pos = (RE_UINT32)re_white_space_stage_2[pos + f] << 4; + f = code >> 9; + code ^= f << 9; + pos = (RE_UINT32)re_white_space_stage_3[pos + f] << 3; + f = code >> 6; + code ^= f << 6; + pos = (RE_UINT32)re_white_space_stage_4[pos + f] << 6; + pos += code; + value = (re_white_space_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Bidi_Control. */ + +static RE_UINT8 re_bidi_control_stage_1[] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, +}; + +static RE_UINT8 re_bidi_control_stage_2[] = { + 0, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +}; + +static RE_UINT8 re_bidi_control_stage_3[] = { + 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 2, 0, 0, 0, 0, 0, 0, 0, +}; + +static RE_UINT8 re_bidi_control_stage_4[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, + 2, 3, 0, 0, 0, 0, 0, 0, +}; + +static RE_UINT8 re_bidi_control_stage_5[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, + 0, 192, 0, 0, 0, 124, 0, 0, 0, 0, 0, 0, 192, 3, 0, 0, +}; + +/* Bidi_Control: 129 bytes. */ + +RE_UINT32 re_get_bidi_control(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 16; + code = ch ^ (f << 16); + pos = (RE_UINT32)re_bidi_control_stage_1[f] << 4; + f = code >> 12; + code ^= f << 12; + pos = (RE_UINT32)re_bidi_control_stage_2[pos + f] << 3; + f = code >> 9; + code ^= f << 9; + pos = (RE_UINT32)re_bidi_control_stage_3[pos + f] << 3; + f = code >> 6; + code ^= f << 6; + pos = (RE_UINT32)re_bidi_control_stage_4[pos + f] << 6; + pos += code; + value = (re_bidi_control_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Join_Control. 
*/ + +static RE_UINT8 re_join_control_stage_1[] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, +}; + +static RE_UINT8 re_join_control_stage_2[] = { + 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +static RE_UINT8 re_join_control_stage_3[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, +}; + +static RE_UINT8 re_join_control_stage_4[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, +}; + +static RE_UINT8 re_join_control_stage_5[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 48, 0, 0, 0, 0, 0, 0, +}; + +/* Join_Control: 97 bytes. */ + +RE_UINT32 re_get_join_control(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 16; + code = ch ^ (f << 16); + pos = (RE_UINT32)re_join_control_stage_1[f] << 4; + f = code >> 12; + code ^= f << 12; + pos = (RE_UINT32)re_join_control_stage_2[pos + f] << 3; + f = code >> 9; + code ^= f << 9; + pos = (RE_UINT32)re_join_control_stage_3[pos + f] << 3; + f = code >> 6; + code ^= f << 6; + pos = (RE_UINT32)re_join_control_stage_4[pos + f] << 6; + pos += code; + value = (re_join_control_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Dash. 
*/ + +static RE_UINT8 re_dash_stage_1[] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, +}; + +static RE_UINT8 re_dash_stage_2[] = { + 0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, +}; + +static RE_UINT8 re_dash_stage_3[] = { + 0, 1, 2, 1, 1, 1, 1, 1, 1, 1, 3, 1, 4, 1, 1, 1, + 5, 6, 1, 1, 1, 1, 1, 7, 8, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, +}; + +static RE_UINT8 re_dash_stage_4[] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 2, 1, 3, 1, 1, 1, 1, 1, 1, 1, + 4, 1, 1, 1, 1, 1, 1, 1, 5, 6, 7, 1, 1, 1, 1, 1, + 8, 1, 1, 1, 1, 1, 1, 1, 9, 3, 1, 1, 1, 1, 1, 1, + 10, 1, 11, 1, 1, 1, 1, 1, 12, 13, 1, 1, 14, 1, 1, 1, +}; + +static RE_UINT8 re_dash_stage_5[] = { + 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 0, 0, 0, 0, 0, 64, 1, 0, 0, 0, 0, 0, 0, 0, + 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 63, 0, 0, 0, 0, 0, + 0, 0, 8, 0, 0, 0, 0, 8, 0, 8, 0, 0, 0, 0, 0, 0, + 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 128, 4, 0, 0, 0, 12, + 0, 0, 0, 16, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 1, 8, 0, 0, 0, + 0, 32, 0, 0, 0, 0, 0, 0, +}; + +/* Dash: 297 bytes. */ + +RE_UINT32 re_get_dash(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 16; + code = ch ^ (f << 16); + pos = (RE_UINT32)re_dash_stage_1[f] << 4; + f = code >> 12; + code ^= f << 12; + pos = (RE_UINT32)re_dash_stage_2[pos + f] << 3; + f = code >> 9; + code ^= f << 9; + pos = (RE_UINT32)re_dash_stage_3[pos + f] << 3; + f = code >> 6; + code ^= f << 6; + pos = (RE_UINT32)re_dash_stage_4[pos + f] << 6; + pos += code; + value = (re_dash_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Hyphen. 
*/ + +static RE_UINT8 re_hyphen_stage_1[] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, +}; + +static RE_UINT8 re_hyphen_stage_2[] = { + 0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, +}; + +static RE_UINT8 re_hyphen_stage_3[] = { + 0, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, + 4, 1, 1, 1, 1, 1, 1, 5, 6, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 7, +}; + +static RE_UINT8 re_hyphen_stage_4[] = { + 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 2, 1, 3, 1, 1, 1, 1, 1, 1, 1, + 4, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 6, 1, 1, 1, 1, 1, 7, 1, 1, 8, 9, 1, 1, +}; + +static RE_UINT8 re_hyphen_stage_5[] = { + 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 0, 0, 0, 0, 0, 0, 64, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 8, 0, 0, 0, + 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, +}; + +/* Hyphen: 241 bytes. */ + +RE_UINT32 re_get_hyphen(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 16; + code = ch ^ (f << 16); + pos = (RE_UINT32)re_hyphen_stage_1[f] << 4; + f = code >> 12; + code ^= f << 12; + pos = (RE_UINT32)re_hyphen_stage_2[pos + f] << 3; + f = code >> 9; + code ^= f << 9; + pos = (RE_UINT32)re_hyphen_stage_3[pos + f] << 3; + f = code >> 6; + code ^= f << 6; + pos = (RE_UINT32)re_hyphen_stage_4[pos + f] << 6; + pos += code; + value = (re_hyphen_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Quotation_Mark. 
*/ + +static RE_UINT8 re_quotation_mark_stage_1[] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, +}; + +static RE_UINT8 re_quotation_mark_stage_2[] = { + 0, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +}; + +static RE_UINT8 re_quotation_mark_stage_3[] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 1, 1, 1, 1, 1, 1, 3, 4, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 5, +}; + +static RE_UINT8 re_quotation_mark_stage_4[] = { + 0, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 3, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, + 5, 1, 1, 1, 1, 1, 1, 1, 1, 6, 1, 1, 7, 8, 1, 1, +}; + +static RE_UINT8 re_quotation_mark_stage_5[] = { + 0, 0, 0, 0, 132, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 8, 0, 8, 0, 0, 0, 255, 0, 0, 0, 6, + 4, 0, 0, 0, 0, 0, 0, 0, 0, 240, 0, 224, 0, 0, 0, 0, + 30, 0, 0, 0, 0, 0, 0, 0, 132, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 12, 0, 0, 0, +}; + +/* Quotation_Mark: 209 bytes. */ + +RE_UINT32 re_get_quotation_mark(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 16; + code = ch ^ (f << 16); + pos = (RE_UINT32)re_quotation_mark_stage_1[f] << 4; + f = code >> 12; + code ^= f << 12; + pos = (RE_UINT32)re_quotation_mark_stage_2[pos + f] << 3; + f = code >> 9; + code ^= f << 9; + pos = (RE_UINT32)re_quotation_mark_stage_3[pos + f] << 3; + f = code >> 6; + code ^= f << 6; + pos = (RE_UINT32)re_quotation_mark_stage_4[pos + f] << 6; + pos += code; + value = (re_quotation_mark_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Terminal_Punctuation. 
*/ + +static RE_UINT8 re_terminal_punctuation_stage_1[] = { + 0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, +}; + +static RE_UINT8 re_terminal_punctuation_stage_2[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 10, 11, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 12, 13, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 14, + 15, 9, 16, 9, 17, 18, 9, 9, 9, 19, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 20, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 21, + 9, 9, 9, 9, 9, 9, 22, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, +}; + +static RE_UINT8 re_terminal_punctuation_stage_3[] = { + 0, 1, 1, 1, 1, 1, 2, 3, 1, 1, 1, 4, 5, 6, 7, 8, + 9, 1, 10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 11, 1, 12, 1, + 13, 1, 1, 1, 1, 1, 14, 1, 1, 1, 1, 1, 15, 16, 17, 18, + 19, 1, 20, 1, 1, 21, 22, 1, 23, 1, 1, 1, 1, 1, 1, 1, + 24, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 25, 1, 1, 1, 26, 1, 1, 1, 1, 1, 1, 1, + 1, 27, 1, 1, 28, 29, 1, 1, 30, 31, 32, 33, 34, 35, 1, 36, + 1, 1, 1, 1, 37, 1, 38, 1, 1, 1, 1, 1, 1, 1, 1, 39, + 40, 1, 41, 1, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 1, 1, + 1, 1, 1, 52, 53, 1, 54, 1, 55, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 56, 57, 58, 1, 1, 41, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 59, 1, 1, +}; + +static RE_UINT8 re_terminal_punctuation_stage_4[] = { + 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 0, 0, 0, + 4, 0, 5, 0, 6, 0, 0, 0, 0, 0, 7, 0, 8, 0, 0, 0, + 0, 0, 0, 9, 0, 10, 2, 0, 0, 0, 0, 11, 0, 0, 12, 0, + 13, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 15, 0, 0, 0, 16, + 0, 0, 0, 17, 0, 18, 0, 0, 0, 0, 19, 0, 20, 0, 0, 0, + 0, 0, 11, 0, 0, 21, 0, 0, 0, 0, 22, 0, 0, 23, 0, 24, + 0, 25, 26, 0, 0, 27, 28, 0, 29, 0, 0, 0, 0, 0, 0, 24, + 30, 0, 0, 0, 0, 0, 0, 31, 0, 0, 0, 32, 0, 0, 33, 0, + 0, 34, 0, 0, 0, 0, 26, 0, 0, 0, 35, 0, 0, 0, 36, 37, + 0, 0, 0, 38, 0, 0, 39, 0, 
1, 0, 0, 40, 36, 0, 41, 0, + 0, 0, 42, 0, 36, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 43, + 0, 44, 0, 0, 45, 0, 0, 0, 0, 0, 46, 0, 0, 24, 47, 0, + 0, 0, 48, 0, 0, 0, 49, 0, 0, 50, 0, 0, 0, 4, 0, 0, + 0, 0, 51, 0, 0, 0, 29, 0, 0, 52, 0, 0, 0, 0, 0, 53, + 0, 0, 0, 33, 0, 0, 0, 54, 0, 55, 56, 0, 57, 0, 0, 0, +}; + +static RE_UINT8 re_terminal_punctuation_stage_5[] = { + 0, 0, 0, 0, 2, 80, 0, 140, 0, 0, 0, 64, 128, 0, 0, 0, + 0, 2, 0, 0, 8, 0, 0, 0, 0, 16, 0, 136, 0, 0, 16, 0, + 255, 23, 0, 0, 0, 0, 0, 3, 0, 0, 255, 127, 48, 0, 0, 0, + 0, 0, 0, 12, 0, 225, 7, 0, 0, 12, 0, 0, 254, 1, 0, 0, + 0, 96, 0, 0, 0, 56, 0, 0, 0, 0, 96, 0, 0, 0, 112, 4, + 60, 3, 0, 0, 0, 15, 0, 0, 0, 0, 0, 236, 0, 0, 0, 248, + 0, 0, 0, 192, 0, 0, 0, 48, 128, 3, 0, 0, 0, 64, 0, 16, + 2, 0, 0, 0, 6, 0, 0, 0, 0, 224, 0, 0, 0, 0, 248, 0, + 0, 0, 192, 0, 0, 192, 0, 0, 0, 128, 0, 0, 0, 0, 0, 224, + 0, 0, 0, 128, 0, 0, 3, 0, 0, 8, 0, 0, 0, 0, 247, 0, + 18, 0, 0, 0, 0, 0, 1, 0, 0, 0, 128, 0, 0, 0, 63, 0, + 0, 0, 0, 252, 0, 0, 0, 30, 128, 63, 0, 0, 3, 0, 0, 0, + 14, 0, 0, 0, 96, 32, 0, 192, 0, 0, 0, 31, 60, 254, 255, 0, + 0, 0, 0, 112, 0, 0, 31, 0, 0, 0, 32, 0, 0, 0, 128, 3, + 16, 0, 0, 0, 128, 7, 0, 0, +}; + +/* Terminal_Punctuation: 850 bytes. */ + +RE_UINT32 re_get_terminal_punctuation(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 15; + code = ch ^ (f << 15); + pos = (RE_UINT32)re_terminal_punctuation_stage_1[f] << 5; + f = code >> 10; + code ^= f << 10; + pos = (RE_UINT32)re_terminal_punctuation_stage_2[pos + f] << 3; + f = code >> 7; + code ^= f << 7; + pos = (RE_UINT32)re_terminal_punctuation_stage_3[pos + f] << 2; + f = code >> 5; + code ^= f << 5; + pos = (RE_UINT32)re_terminal_punctuation_stage_4[pos + f] << 5; + pos += code; + value = (re_terminal_punctuation_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Other_Math. 
*/ + +static RE_UINT8 re_other_math_stage_1[] = { + 0, 1, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, +}; + +static RE_UINT8 re_other_math_stage_2[] = { + 0, 1, 1, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 6, 1, 1, +}; + +static RE_UINT8 re_other_math_stage_3[] = { + 0, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 3, 4, 1, 5, 1, 6, 7, 8, 1, 9, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 10, 11, 1, 1, 1, 1, 12, 13, 14, 15, + 1, 1, 1, 1, 1, 1, 16, 1, +}; + +static RE_UINT8 re_other_math_stage_4[] = { + 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 2, 3, 4, 5, 6, 7, 8, 0, 9, 10, + 11, 12, 13, 0, 14, 15, 16, 17, 18, 0, 0, 0, 0, 19, 20, 21, + 0, 0, 0, 0, 0, 22, 23, 24, 25, 0, 26, 27, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 25, 28, 0, 0, 0, 0, 29, 0, 30, 31, + 0, 0, 0, 32, 0, 0, 0, 0, 0, 33, 0, 0, 0, 0, 0, 0, + 34, 34, 35, 34, 36, 37, 38, 34, 39, 40, 41, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 42, 43, 44, 35, 35, 45, 45, 46, 46, 47, 34, + 38, 48, 49, 50, 51, 52, 0, 0, +}; + +static RE_UINT8 re_other_math_stage_5[] = { + 0, 0, 0, 0, 0, 0, 0, 64, 0, 0, 39, 0, 0, 0, 51, 0, + 0, 0, 64, 0, 0, 0, 28, 0, 1, 0, 0, 0, 30, 0, 0, 96, + 0, 96, 0, 0, 0, 0, 255, 31, 98, 248, 0, 0, 132, 252, 47, 62, + 16, 179, 251, 241, 224, 3, 0, 0, 0, 0, 224, 243, 182, 62, 195, 240, + 255, 63, 235, 47, 48, 0, 0, 0, 0, 15, 0, 0, 0, 0, 176, 0, + 0, 0, 1, 0, 4, 0, 0, 0, 3, 192, 127, 240, 193, 140, 15, 0, + 148, 31, 0, 0, 96, 0, 0, 0, 5, 0, 0, 0, 15, 96, 0, 0, + 192, 255, 0, 0, 248, 255, 255, 1, 0, 0, 0, 15, 0, 0, 0, 48, + 10, 1, 0, 0, 0, 0, 0, 80, 255, 255, 255, 255, 255, 255, 223, 255, + 255, 255, 255, 223, 100, 222, 255, 235, 239, 255, 255, 255, 191, 231, 223, 223, + 255, 255, 255, 123, 95, 252, 253, 255, 63, 255, 255, 255, 253, 255, 255, 247, + 255, 255, 255, 247, 255, 127, 255, 255, 255, 253, 255, 
255, 247, 207, 255, 255, + 150, 254, 247, 10, 132, 234, 150, 170, 150, 247, 247, 94, 255, 251, 255, 15, + 238, 251, 255, 15, +}; + +/* Other_Math: 502 bytes. */ + +RE_UINT32 re_get_other_math(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 15; + code = ch ^ (f << 15); + pos = (RE_UINT32)re_other_math_stage_1[f] << 4; + f = code >> 11; + code ^= f << 11; + pos = (RE_UINT32)re_other_math_stage_2[pos + f] << 3; + f = code >> 8; + code ^= f << 8; + pos = (RE_UINT32)re_other_math_stage_3[pos + f] << 3; + f = code >> 5; + code ^= f << 5; + pos = (RE_UINT32)re_other_math_stage_4[pos + f] << 5; + pos += code; + value = (re_other_math_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Hex_Digit. */ + +static RE_UINT8 re_hex_digit_stage_1[] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, +}; + +static RE_UINT8 re_hex_digit_stage_2[] = { + 0, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, +}; + +static RE_UINT8 re_hex_digit_stage_3[] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 2, +}; + +static RE_UINT8 re_hex_digit_stage_4[] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 2, 1, +}; + +static RE_UINT8 re_hex_digit_stage_5[] = { + 0, 0, 0, 0, 0, 0, 255, 3, 126, 0, 0, 0, 126, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 255, 3, 126, 0, 0, 0, 126, 0, 0, 0, 0, 0, 0, 0, +}; + +/* Hex_Digit: 129 bytes. 
*/ + +RE_UINT32 re_get_hex_digit(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 16; + code = ch ^ (f << 16); + pos = (RE_UINT32)re_hex_digit_stage_1[f] << 3; + f = code >> 13; + code ^= f << 13; + pos = (RE_UINT32)re_hex_digit_stage_2[pos + f] << 3; + f = code >> 10; + code ^= f << 10; + pos = (RE_UINT32)re_hex_digit_stage_3[pos + f] << 3; + f = code >> 7; + code ^= f << 7; + pos = (RE_UINT32)re_hex_digit_stage_4[pos + f] << 7; + pos += code; + value = (re_hex_digit_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* ASCII_Hex_Digit. */ + +static RE_UINT8 re_ascii_hex_digit_stage_1[] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, +}; + +static RE_UINT8 re_ascii_hex_digit_stage_2[] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +}; + +static RE_UINT8 re_ascii_hex_digit_stage_3[] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +}; + +static RE_UINT8 re_ascii_hex_digit_stage_4[] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +}; + +static RE_UINT8 re_ascii_hex_digit_stage_5[] = { + 0, 0, 0, 0, 0, 0, 255, 3, 126, 0, 0, 0, 126, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +/* ASCII_Hex_Digit: 97 bytes. */ + +RE_UINT32 re_get_ascii_hex_digit(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 16; + code = ch ^ (f << 16); + pos = (RE_UINT32)re_ascii_hex_digit_stage_1[f] << 3; + f = code >> 13; + code ^= f << 13; + pos = (RE_UINT32)re_ascii_hex_digit_stage_2[pos + f] << 3; + f = code >> 10; + code ^= f << 10; + pos = (RE_UINT32)re_ascii_hex_digit_stage_3[pos + f] << 3; + f = code >> 7; + code ^= f << 7; + pos = (RE_UINT32)re_ascii_hex_digit_stage_4[pos + f] << 7; + pos += code; + value = (re_ascii_hex_digit_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Other_Alphabetic. 
*/ + +static RE_UINT8 re_other_alphabetic_stage_1[] = { + 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, +}; + +static RE_UINT8 re_other_alphabetic_stage_2[] = { + 0, 1, 2, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 7, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 9, + 10, 11, 12, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 13, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 14, 6, 6, 6, 6, 6, 6, 15, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, +}; + +static RE_UINT8 re_other_alphabetic_stage_3[] = { + 0, 0, 0, 1, 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 0, 0, 14, 0, 0, 0, 15, 16, 17, 18, 19, 20, 21, 0, 0, + 0, 0, 0, 0, 22, 0, 0, 0, 0, 0, 0, 0, 0, 23, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 24, 0, + 25, 26, 27, 28, 0, 0, 0, 0, 0, 0, 0, 29, 0, 0, 0, 0, + 0, 0, 0, 30, 0, 0, 0, 0, 0, 0, 31, 0, 0, 0, 0, 0, + 32, 33, 34, 35, 36, 37, 38, 39, 0, 0, 0, 40, 0, 0, 0, 41, + 0, 0, 0, 0, 42, 0, 0, 0, 0, 43, 0, 0, 0, 0, 0, 0, +}; + +static RE_UINT8 re_other_alphabetic_stage_4[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 2, 3, 0, 4, 0, 5, 6, 0, 0, 7, 8, + 9, 10, 0, 0, 0, 11, 0, 0, 12, 13, 0, 0, 0, 0, 0, 14, + 15, 16, 17, 18, 19, 20, 21, 18, 19, 20, 22, 23, 19, 20, 24, 18, + 19, 20, 25, 18, 26, 20, 27, 0, 15, 20, 28, 18, 19, 20, 28, 18, + 19, 20, 29, 18, 18, 0, 30, 31, 0, 32, 33, 0, 0, 34, 33, 0, + 0, 0, 0, 35, 36, 37, 0, 0, 0, 38, 39, 40, 41, 0, 0, 0, + 0, 0, 42, 0, 0, 0, 0, 0, 31, 31, 31, 31, 0, 43, 44, 0, + 0, 0, 0, 0, 0, 45, 0, 0, 0, 46, 0, 0, 0, 0, 0, 0, + 47, 0, 48, 49, 0, 0, 0, 0, 50, 51, 15, 0, 52, 53, 0, 54, + 0, 55, 0, 0, 0, 0, 0, 31, 0, 0, 0, 0, 0, 0, 0, 56, + 0, 0, 0, 0, 0, 43, 57, 58, 0, 0, 0, 0, 0, 0, 0, 57, + 0, 0, 0, 59, 20, 0, 0, 0, 0, 60, 0, 0, 61, 62, 15, 0, + 0, 63, 64, 0, 15, 62, 0, 0, 0, 65, 66, 0, 0, 67, 0, 68, + 0, 0, 0, 0, 0, 0, 0, 69, 70, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 71, 0, 0, 0, 0, 72, 0, 0, 0, 0, 0, 0, 0, + 52, 73, 74, 0, 26, 75, 0, 0, 52, 64, 0, 0, 52, 76, 0, 0, + 0, 77, 0, 0, 0, 0, 
42, 44, 15, 20, 21, 18, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 10, 61, 0, 0, 0, 0, 0, 0, 78, 79, 0, + 0, 80, 81, 0, 0, 82, 0, 0, 83, 84, 0, 0, 0, 0, 0, 0, + 0, 85, 0, 0, 0, 0, 0, 0, 0, 0, 35, 86, 0, 0, 0, 0, + 0, 0, 0, 0, 70, 0, 0, 0, 0, 10, 87, 87, 58, 0, 0, 0, +}; + +static RE_UINT8 re_other_alphabetic_stage_5[] = { + 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 255, 191, 182, 0, 0, 0, + 0, 0, 255, 7, 0, 248, 255, 254, 0, 0, 1, 0, 0, 0, 192, 31, + 158, 33, 0, 0, 0, 0, 2, 0, 0, 0, 255, 255, 192, 255, 1, 0, + 0, 0, 192, 248, 239, 30, 0, 0, 248, 3, 255, 255, 15, 0, 0, 0, + 0, 0, 0, 204, 255, 223, 224, 0, 12, 0, 0, 0, 14, 0, 0, 0, + 0, 0, 0, 192, 159, 25, 128, 0, 135, 25, 2, 0, 0, 0, 35, 0, + 191, 27, 0, 0, 159, 25, 192, 0, 4, 0, 0, 0, 199, 29, 128, 0, + 223, 29, 96, 0, 223, 29, 128, 0, 0, 128, 95, 255, 0, 0, 12, 0, + 0, 0, 242, 7, 0, 32, 0, 0, 0, 0, 242, 27, 0, 0, 254, 255, + 3, 224, 255, 254, 255, 255, 255, 31, 0, 248, 127, 121, 0, 0, 192, 195, + 133, 1, 30, 0, 124, 0, 0, 48, 0, 0, 0, 128, 0, 0, 192, 255, + 255, 1, 0, 0, 0, 2, 0, 0, 255, 15, 255, 1, 0, 0, 128, 15, + 0, 0, 224, 127, 254, 255, 31, 0, 31, 0, 0, 0, 0, 0, 224, 255, + 7, 0, 0, 0, 254, 51, 0, 0, 128, 255, 3, 0, 240, 255, 63, 0, + 128, 255, 31, 0, 255, 255, 255, 255, 255, 3, 0, 0, 0, 0, 240, 15, + 248, 0, 0, 0, 3, 0, 0, 0, 0, 0, 240, 255, 192, 7, 0, 0, + 128, 255, 7, 0, 0, 254, 127, 0, 8, 48, 0, 0, 0, 0, 157, 65, + 0, 248, 32, 0, 248, 7, 0, 0, 0, 0, 0, 64, 0, 0, 192, 7, + 110, 240, 0, 0, 0, 0, 0, 255, 63, 0, 0, 0, 0, 0, 255, 1, + 0, 0, 248, 255, 0, 240, 159, 0, 0, 128, 63, 127, 0, 0, 0, 48, + 0, 0, 255, 127, 1, 0, 0, 0, 0, 248, 63, 0, 0, 0, 0, 224, + 255, 7, 0, 0, 0, 0, 127, 0, 255, 255, 255, 127, 255, 3, 255, 255, +}; + +/* Other_Alphabetic: 945 bytes. 
*/ + +RE_UINT32 re_get_other_alphabetic(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 16; + code = ch ^ (f << 16); + pos = (RE_UINT32)re_other_alphabetic_stage_1[f] << 5; + f = code >> 11; + code ^= f << 11; + pos = (RE_UINT32)re_other_alphabetic_stage_2[pos + f] << 3; + f = code >> 8; + code ^= f << 8; + pos = (RE_UINT32)re_other_alphabetic_stage_3[pos + f] << 3; + f = code >> 5; + code ^= f << 5; + pos = (RE_UINT32)re_other_alphabetic_stage_4[pos + f] << 5; + pos += code; + value = (re_other_alphabetic_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Ideographic. */ + +static RE_UINT8 re_ideographic_stage_1[] = { + 0, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, +}; + +static RE_UINT8 re_ideographic_stage_2[] = { + 0, 0, 0, 0, 0, 0, 1, 2, 2, 3, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 6, 2, 7, 8, 2, 9, 0, 0, 0, 0, 0, 10, +}; + +static RE_UINT8 re_ideographic_stage_3[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 4, 0, 2, 5, 0, 0, 0, 0, 0, + 2, 2, 2, 2, 2, 2, 6, 2, 2, 2, 2, 2, 2, 2, 2, 7, + 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 9, 0, + 2, 2, 10, 0, 0, 0, 0, 0, +}; + +static RE_UINT8 re_ideographic_stage_4[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 0, 0, + 3, 3, 3, 3, 3, 3, 4, 0, 3, 3, 3, 5, 3, 3, 6, 0, + 3, 3, 3, 3, 3, 3, 7, 0, 3, 8, 3, 3, 3, 3, 3, 3, + 9, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 10, 0, 0, + 9, 0, 0, 0, 0, 0, 0, 0, +}; + +static RE_UINT8 re_ideographic_stage_5[] = { + 0, 0, 0, 0, 192, 0, 0, 0, 254, 3, 0, 7, 255, 255, 255, 255, + 255, 255, 63, 0, 255, 63, 255, 255, 255, 255, 255, 3, 255, 255, 127, 0, + 255, 255, 31, 0, 255, 255, 255, 63, 3, 0, 0, 0, 
+}; + +/* Ideographic: 333 bytes. */ + +RE_UINT32 re_get_ideographic(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 16; + code = ch ^ (f << 16); + pos = (RE_UINT32)re_ideographic_stage_1[f] << 5; + f = code >> 11; + code ^= f << 11; + pos = (RE_UINT32)re_ideographic_stage_2[pos + f] << 3; + f = code >> 8; + code ^= f << 8; + pos = (RE_UINT32)re_ideographic_stage_3[pos + f] << 3; + f = code >> 5; + code ^= f << 5; + pos = (RE_UINT32)re_ideographic_stage_4[pos + f] << 5; + pos += code; + value = (re_ideographic_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Diacritic. */ + +static RE_UINT8 re_diacritic_stage_1[] = { + 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, +}; + +static RE_UINT8 re_diacritic_stage_2[] = { + 0, 1, 2, 3, 4, 5, 6, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 7, 8, 4, 4, 4, 4, 4, 4, 4, 4, 4, 9, + 10, 11, 12, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 13, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 14, 4, 4, 15, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, +}; + +static RE_UINT8 re_diacritic_stage_3[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 1, 1, 1, 1, 1, 1, 17, 1, 18, 19, 20, 21, 22, 1, 23, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 24, 1, 25, 1, + 26, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 27, 28, + 29, 30, 31, 32, 1, 1, 1, 1, 1, 1, 1, 33, 1, 1, 34, 35, + 1, 1, 36, 1, 1, 1, 1, 1, 1, 1, 37, 1, 1, 1, 1, 1, + 38, 39, 40, 41, 42, 43, 44, 45, 1, 1, 46, 1, 1, 1, 1, 47, + 1, 48, 1, 1, 1, 1, 1, 1, 49, 1, 1, 1, 1, 1, 1, 1, +}; + +static RE_UINT8 re_diacritic_stage_4[] = { + 0, 0, 1, 2, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 4, 5, 5, 5, 5, 6, 7, 8, 0, 0, 0, + 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 10, 0, 11, 12, 13, 0, + 0, 0, 14, 0, 0, 0, 15, 16, 0, 4, 17, 0, 0, 18, 0, 19, + 20, 0, 0, 0, 0, 0, 0, 21, 0, 22, 23, 24, 0, 22, 25, 0, + 0, 22, 25, 0, 0, 22, 25, 0, 0, 22, 25, 0, 0, 0, 25, 0, + 0, 0, 25, 0, 0, 22, 25, 0, 
0, 0, 25, 0, 0, 0, 26, 0, + 0, 0, 27, 0, 0, 0, 28, 0, 20, 29, 0, 0, 30, 0, 31, 0, + 0, 32, 0, 0, 33, 0, 0, 0, 0, 0, 0, 0, 0, 0, 34, 0, + 0, 35, 0, 0, 0, 0, 0, 0, 0, 0, 0, 36, 0, 37, 0, 0, + 0, 38, 39, 40, 0, 41, 0, 0, 0, 42, 0, 43, 0, 0, 4, 44, + 0, 45, 5, 17, 0, 0, 46, 47, 0, 0, 0, 0, 0, 48, 49, 50, + 0, 0, 0, 0, 0, 0, 0, 51, 0, 52, 0, 0, 0, 0, 0, 0, + 0, 53, 0, 0, 54, 0, 0, 22, 0, 0, 0, 55, 56, 0, 0, 57, + 58, 59, 0, 0, 60, 0, 0, 20, 0, 0, 0, 0, 0, 0, 39, 61, + 0, 62, 63, 0, 0, 63, 2, 64, 0, 0, 0, 65, 0, 15, 66, 67, + 0, 0, 68, 0, 0, 0, 0, 69, 1, 0, 0, 0, 0, 0, 0, 0, + 0, 70, 0, 0, 0, 0, 0, 0, 0, 1, 2, 71, 72, 0, 0, 73, + 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 74, + 0, 0, 0, 0, 0, 75, 0, 0, 0, 76, 0, 63, 0, 0, 77, 0, + 0, 78, 0, 0, 0, 0, 0, 79, 0, 22, 25, 80, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 81, 0, 0, 0, 0, 0, 0, 15, 2, 0, + 0, 15, 0, 0, 0, 42, 0, 0, 0, 82, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 83, 0, 0, 0, 0, 84, 0, 0, 0, + 0, 0, 0, 85, 86, 87, 0, 0, 0, 0, 0, 0, 0, 0, 88, 0, +}; + +static RE_UINT8 re_diacritic_stage_5[] = { + 0, 0, 0, 0, 0, 0, 0, 64, 1, 0, 0, 0, 0, 129, 144, 1, + 0, 0, 255, 255, 255, 255, 255, 255, 255, 127, 255, 224, 7, 0, 48, 4, + 48, 0, 0, 0, 248, 0, 0, 0, 0, 0, 0, 2, 0, 0, 254, 255, + 251, 255, 255, 191, 22, 0, 0, 0, 0, 248, 135, 1, 0, 0, 0, 128, + 97, 28, 0, 0, 255, 7, 0, 0, 192, 255, 1, 0, 0, 248, 63, 0, + 0, 0, 0, 3, 248, 255, 255, 127, 0, 0, 0, 16, 0, 32, 30, 0, + 0, 0, 2, 0, 0, 32, 0, 0, 0, 4, 0, 0, 128, 95, 0, 0, + 0, 31, 0, 0, 0, 0, 160, 194, 220, 0, 0, 0, 64, 0, 0, 0, + 0, 0, 128, 6, 128, 191, 0, 12, 0, 254, 15, 32, 0, 0, 0, 14, + 0, 0, 224, 159, 0, 0, 255, 63, 0, 0, 16, 0, 16, 0, 0, 0, + 0, 248, 15, 0, 0, 12, 0, 0, 0, 0, 192, 0, 0, 0, 0, 63, + 255, 33, 16, 3, 0, 240, 255, 255, 240, 255, 0, 0, 0, 0, 32, 224, + 0, 0, 0, 160, 3, 224, 0, 224, 0, 224, 0, 96, 0, 128, 3, 0, + 0, 128, 0, 0, 0, 252, 0, 0, 0, 0, 0, 30, 0, 128, 0, 176, + 0, 0, 0, 48, 0, 0, 3, 0, 0, 0, 128, 255, 3, 0, 0, 0, + 0, 1, 0, 0, 255, 255, 3, 0, 0, 120, 0, 0, 0, 
0, 8, 0, + 32, 0, 0, 0, 0, 0, 0, 56, 7, 0, 0, 0, 0, 0, 64, 0, + 0, 0, 0, 248, 0, 48, 0, 0, 255, 255, 0, 0, 0, 0, 1, 0, + 0, 0, 0, 192, 8, 0, 0, 0, 96, 0, 0, 0, 0, 0, 0, 6, + 0, 0, 24, 0, 1, 28, 0, 0, 0, 0, 96, 0, 0, 6, 0, 0, + 192, 31, 31, 0, 12, 0, 0, 0, 0, 8, 0, 0, 0, 0, 31, 0, + 0, 128, 255, 255, 128, 227, 7, 248, 231, 15, 0, 0, 0, 60, 0, 0, + 0, 0, 127, 0, +}; + +/* Diacritic: 997 bytes. */ + +RE_UINT32 re_get_diacritic(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 16; + code = ch ^ (f << 16); + pos = (RE_UINT32)re_diacritic_stage_1[f] << 5; + f = code >> 11; + code ^= f << 11; + pos = (RE_UINT32)re_diacritic_stage_2[pos + f] << 3; + f = code >> 8; + code ^= f << 8; + pos = (RE_UINT32)re_diacritic_stage_3[pos + f] << 3; + f = code >> 5; + code ^= f << 5; + pos = (RE_UINT32)re_diacritic_stage_4[pos + f] << 5; + pos += code; + value = (re_diacritic_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Extender. */ + +static RE_UINT8 re_extender_stage_1[] = { + 0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, +}; + +static RE_UINT8 re_extender_stage_2[] = { + 0, 1, 2, 3, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 5, 6, 2, 2, 2, 2, 2, 2, 2, 2, 2, 7, + 2, 2, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 9, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +}; + +static RE_UINT8 re_extender_stage_3[] = { + 0, 1, 2, 1, 1, 1, 3, 4, 1, 1, 1, 1, 1, 1, 5, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 6, 1, 7, 1, 8, 1, 1, 1, + 9, 1, 1, 1, 1, 1, 1, 1, 10, 1, 1, 1, 1, 1, 11, 1, + 1, 12, 13, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 14, + 1, 1, 1, 15, 1, 16, 1, 1, 1, 1, 1, 17, 1, 1, 1, 1, +}; + +static RE_UINT8 re_extender_stage_4[] = { + 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 5, 0, 0, 0, 5, 0, + 6, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, + 0, 9, 0, 10, 0, 0, 0, 0, 11, 12, 0, 0, 13, 
0, 0, 14, + 15, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 17, 5, 0, 0, 0, 18, 0, 0, 19, 20, + 0, 0, 0, 18, 0, 0, 0, 0, 0, 0, 19, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 21, 0, 0, 0, 22, 0, 0, 0, 0, 0, +}; + +static RE_UINT8 re_extender_stage_5[] = { + 0, 0, 0, 0, 0, 0, 128, 0, 0, 0, 3, 0, 1, 0, 0, 0, + 0, 0, 0, 4, 64, 0, 0, 0, 0, 4, 0, 0, 8, 0, 0, 0, + 128, 0, 0, 0, 0, 0, 64, 0, 0, 0, 0, 8, 32, 0, 0, 0, + 0, 0, 62, 0, 0, 0, 0, 96, 0, 0, 0, 112, 0, 0, 32, 0, + 0, 16, 0, 0, 0, 128, 0, 0, 0, 0, 1, 0, 0, 0, 0, 32, + 0, 0, 24, 0, 192, 1, 0, 0, 12, 0, 0, 0, +}; + +/* Extender: 414 bytes. */ + +RE_UINT32 re_get_extender(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 15; + code = ch ^ (f << 15); + pos = (RE_UINT32)re_extender_stage_1[f] << 4; + f = code >> 11; + code ^= f << 11; + pos = (RE_UINT32)re_extender_stage_2[pos + f] << 3; + f = code >> 8; + code ^= f << 8; + pos = (RE_UINT32)re_extender_stage_3[pos + f] << 3; + f = code >> 5; + code ^= f << 5; + pos = (RE_UINT32)re_extender_stage_4[pos + f] << 5; + pos += code; + value = (re_extender_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Other_Lowercase. 
*/ + +static RE_UINT8 re_other_lowercase_stage_1[] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, +}; + +static RE_UINT8 re_other_lowercase_stage_2[] = { + 0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, +}; + +static RE_UINT8 re_other_lowercase_stage_3[] = { + 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, + 4, 2, 5, 2, 2, 2, 6, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 7, 2, 8, 2, 2, +}; + +static RE_UINT8 re_other_lowercase_stage_4[] = { + 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 3, 0, 4, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 7, 0, + 0, 8, 9, 0, 0, 10, 0, 0, 0, 0, 0, 11, 0, 0, 0, 0, + 0, 12, 0, 0, 0, 0, 0, 0, 0, 0, 13, 0, 0, 14, 0, 15, + 0, 0, 0, 0, 0, 16, 0, 0, +}; + +static RE_UINT8 re_other_lowercase_stage_5[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 4, + 0, 0, 0, 0, 0, 0, 255, 1, 3, 0, 0, 0, 31, 0, 0, 0, + 32, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 240, 255, 255, + 255, 255, 255, 255, 255, 7, 0, 1, 0, 0, 0, 248, 255, 255, 255, 255, + 0, 0, 0, 0, 0, 0, 2, 128, 0, 0, 255, 31, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 255, 255, 0, 0, 255, 255, 255, 3, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 48, 0, 0, 0, 48, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 3, + 0, 0, 0, 240, 0, 0, 0, 0, +}; + +/* Other_Lowercase: 297 bytes. */ + +RE_UINT32 re_get_other_lowercase(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 16; + code = ch ^ (f << 16); + pos = (RE_UINT32)re_other_lowercase_stage_1[f] << 4; + f = code >> 12; + code ^= f << 12; + pos = (RE_UINT32)re_other_lowercase_stage_2[pos + f] << 3; + f = code >> 9; + code ^= f << 9; + pos = (RE_UINT32)re_other_lowercase_stage_3[pos + f] << 3; + f = code >> 6; + code ^= f << 6; + pos = (RE_UINT32)re_other_lowercase_stage_4[pos + f] << 6; + pos += code; + value = (re_other_lowercase_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Other_Uppercase. 
*/ + +static RE_UINT8 re_other_uppercase_stage_1[] = { + 0, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, +}; + +static RE_UINT8 re_other_uppercase_stage_2[] = { + 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, +}; + +static RE_UINT8 re_other_uppercase_stage_3[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 0, + 0, 3, 0, 0, 0, 0, 0, 0, +}; + +static RE_UINT8 re_other_uppercase_stage_4[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 2, 1, 0, 0, 3, 4, 4, 5, 0, 0, 0, +}; + +static RE_UINT8 re_other_uppercase_stage_5[] = { + 0, 0, 0, 0, 255, 255, 0, 0, 0, 0, 192, 255, 0, 0, 255, 255, + 255, 3, 255, 255, 255, 3, 0, 0, +}; + +/* Other_Uppercase: 162 bytes. */ + +RE_UINT32 re_get_other_uppercase(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 15; + code = ch ^ (f << 15); + pos = (RE_UINT32)re_other_uppercase_stage_1[f] << 4; + f = code >> 11; + code ^= f << 11; + pos = (RE_UINT32)re_other_uppercase_stage_2[pos + f] << 3; + f = code >> 8; + code ^= f << 8; + pos = (RE_UINT32)re_other_uppercase_stage_3[pos + f] << 3; + f = code >> 5; + code ^= f << 5; + pos = (RE_UINT32)re_other_uppercase_stage_4[pos + f] << 5; + pos += code; + value = (re_other_uppercase_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Noncharacter_Code_Point. 
*/ + +static RE_UINT8 re_noncharacter_code_point_stage_1[] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, +}; + +static RE_UINT8 re_noncharacter_code_point_stage_2[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, +}; + +static RE_UINT8 re_noncharacter_code_point_stage_3[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, + 0, 0, 0, 0, 0, 0, 0, 2, +}; + +static RE_UINT8 re_noncharacter_code_point_stage_4[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, + 0, 0, 0, 0, 0, 0, 0, 2, +}; + +static RE_UINT8 re_noncharacter_code_point_stage_5[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 255, 255, 255, 255, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 192, +}; + +/* Noncharacter_Code_Point: 121 bytes. */ + +RE_UINT32 re_get_noncharacter_code_point(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 16; + code = ch ^ (f << 16); + pos = (RE_UINT32)re_noncharacter_code_point_stage_1[f] << 4; + f = code >> 12; + code ^= f << 12; + pos = (RE_UINT32)re_noncharacter_code_point_stage_2[pos + f] << 3; + f = code >> 9; + code ^= f << 9; + pos = (RE_UINT32)re_noncharacter_code_point_stage_3[pos + f] << 3; + f = code >> 6; + code ^= f << 6; + pos = (RE_UINT32)re_noncharacter_code_point_stage_4[pos + f] << 6; + pos += code; + value = (re_noncharacter_code_point_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Other_Grapheme_Extend. 
*/ + +static RE_UINT8 re_other_grapheme_extend_stage_1[] = { + 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, +}; + +static RE_UINT8 re_other_grapheme_extend_stage_2[] = { + 0, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, + 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +}; + +static RE_UINT8 re_other_grapheme_extend_stage_3[] = { + 0, 0, 0, 0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 4, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 6, 0, 7, 8, 0, 0, 0, 0, 0, + 9, 0, 0, 0, 0, 0, 0, 0, +}; + +static RE_UINT8 re_other_grapheme_extend_stage_4[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, + 0, 0, 0, 0, 1, 2, 1, 2, 0, 0, 0, 3, 1, 2, 0, 4, + 5, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 1, 2, 0, 0, + 0, 0, 8, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 10, 0, 0, +}; + +static RE_UINT8 re_other_grapheme_extend_stage_5[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 64, + 0, 0, 128, 0, 0, 0, 0, 0, 4, 0, 96, 0, 0, 0, 0, 0, + 0, 128, 0, 128, 0, 0, 0, 0, 0, 48, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 192, 0, 0, 0, 0, 0, 192, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 1, 32, 0, 0, 0, 0, 0, 128, 0, 0, + 0, 0, 0, 0, 32, 192, 7, 0, +}; + +/* Other_Grapheme_Extend: 289 bytes. */ + +RE_UINT32 re_get_other_grapheme_extend(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 16; + code = ch ^ (f << 16); + pos = (RE_UINT32)re_other_grapheme_extend_stage_1[f] << 4; + f = code >> 12; + code ^= f << 12; + pos = (RE_UINT32)re_other_grapheme_extend_stage_2[pos + f] << 3; + f = code >> 9; + code ^= f << 9; + pos = (RE_UINT32)re_other_grapheme_extend_stage_3[pos + f] << 3; + f = code >> 6; + code ^= f << 6; + pos = (RE_UINT32)re_other_grapheme_extend_stage_4[pos + f] << 6; + pos += code; + value = (re_other_grapheme_extend_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* IDS_Binary_Operator. 
*/ + +static RE_UINT8 re_ids_binary_operator_stage_1[] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, +}; + +static RE_UINT8 re_ids_binary_operator_stage_2[] = { + 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +static RE_UINT8 re_ids_binary_operator_stage_3[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, +}; + +static RE_UINT8 re_ids_binary_operator_stage_4[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, +}; + +static RE_UINT8 re_ids_binary_operator_stage_5[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 243, 15, +}; + +/* IDS_Binary_Operator: 97 bytes. */ + +RE_UINT32 re_get_ids_binary_operator(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 16; + code = ch ^ (f << 16); + pos = (RE_UINT32)re_ids_binary_operator_stage_1[f] << 4; + f = code >> 12; + code ^= f << 12; + pos = (RE_UINT32)re_ids_binary_operator_stage_2[pos + f] << 3; + f = code >> 9; + code ^= f << 9; + pos = (RE_UINT32)re_ids_binary_operator_stage_3[pos + f] << 3; + f = code >> 6; + code ^= f << 6; + pos = (RE_UINT32)re_ids_binary_operator_stage_4[pos + f] << 6; + pos += code; + value = (re_ids_binary_operator_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* IDS_Trinary_Operator. */ + +static RE_UINT8 re_ids_trinary_operator_stage_1[] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, +}; + +static RE_UINT8 re_ids_trinary_operator_stage_2[] = { + 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +static RE_UINT8 re_ids_trinary_operator_stage_3[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, +}; + +static RE_UINT8 re_ids_trinary_operator_stage_4[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, +}; + +static RE_UINT8 re_ids_trinary_operator_stage_5[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 0, +}; + +/* IDS_Trinary_Operator: 97 bytes. 
*/ + +RE_UINT32 re_get_ids_trinary_operator(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 16; + code = ch ^ (f << 16); + pos = (RE_UINT32)re_ids_trinary_operator_stage_1[f] << 4; + f = code >> 12; + code ^= f << 12; + pos = (RE_UINT32)re_ids_trinary_operator_stage_2[pos + f] << 3; + f = code >> 9; + code ^= f << 9; + pos = (RE_UINT32)re_ids_trinary_operator_stage_3[pos + f] << 3; + f = code >> 6; + code ^= f << 6; + pos = (RE_UINT32)re_ids_trinary_operator_stage_4[pos + f] << 6; + pos += code; + value = (re_ids_trinary_operator_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Radical. */ + +static RE_UINT8 re_radical_stage_1[] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, +}; + +static RE_UINT8 re_radical_stage_2[] = { + 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +static RE_UINT8 re_radical_stage_3[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, +}; + +static RE_UINT8 re_radical_stage_4[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 2, 2, 3, 2, 2, 2, 2, 2, 2, 4, 0, +}; + +static RE_UINT8 re_radical_stage_5[] = { + 0, 0, 0, 0, 255, 255, 255, 251, 255, 255, 255, 255, 255, 255, 15, 0, + 255, 255, 63, 0, +}; + +/* Radical: 117 bytes. */ + +RE_UINT32 re_get_radical(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 16; + code = ch ^ (f << 16); + pos = (RE_UINT32)re_radical_stage_1[f] << 4; + f = code >> 12; + code ^= f << 12; + pos = (RE_UINT32)re_radical_stage_2[pos + f] << 3; + f = code >> 9; + code ^= f << 9; + pos = (RE_UINT32)re_radical_stage_3[pos + f] << 4; + f = code >> 5; + code ^= f << 5; + pos = (RE_UINT32)re_radical_stage_4[pos + f] << 5; + pos += code; + value = (re_radical_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Unified_Ideograph. 
*/ + +static RE_UINT8 re_unified_ideograph_stage_1[] = { + 0, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, +}; + +static RE_UINT8 re_unified_ideograph_stage_2[] = { + 0, 0, 0, 1, 2, 3, 3, 3, 3, 4, 0, 0, 0, 0, 0, 5, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 6, 7, 8, 0, 0, 0, +}; + +static RE_UINT8 re_unified_ideograph_stage_3[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 3, 0, 0, 0, 0, 0, 4, 0, 0, + 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 6, 7, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 8, +}; + +static RE_UINT8 re_unified_ideograph_stage_4[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 2, 0, 1, 1, 1, 1, 1, 1, 1, 3, + 4, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 5, 1, 1, 1, 1, + 1, 1, 1, 1, 6, 1, 1, 1, 7, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 8, 0, 0, 0, 0, 0, +}; + +static RE_UINT8 re_unified_ideograph_stage_5[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 63, 0, 255, 255, 63, 0, 0, 0, 0, 0, + 0, 192, 26, 128, 154, 3, 0, 0, 255, 255, 127, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 31, 0, 255, 255, 255, 63, 255, 255, 255, 255, + 255, 255, 255, 255, 3, 0, 0, 0, +}; + +/* Unified_Ideograph: 281 bytes. */ + +RE_UINT32 re_get_unified_ideograph(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 16; + code = ch ^ (f << 16); + pos = (RE_UINT32)re_unified_ideograph_stage_1[f] << 4; + f = code >> 12; + code ^= f << 12; + pos = (RE_UINT32)re_unified_ideograph_stage_2[pos + f] << 3; + f = code >> 9; + code ^= f << 9; + pos = (RE_UINT32)re_unified_ideograph_stage_3[pos + f] << 3; + f = code >> 6; + code ^= f << 6; + pos = (RE_UINT32)re_unified_ideograph_stage_4[pos + f] << 6; + pos += code; + value = (re_unified_ideograph_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Other_Default_Ignorable_Code_Point. 
*/ + +static RE_UINT8 re_other_default_ignorable_code_point_stage_1[] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, + 1, +}; + +static RE_UINT8 re_other_default_ignorable_code_point_stage_2[] = { + 0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 6, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, +}; + +static RE_UINT8 re_other_default_ignorable_code_point_stage_3[] = { + 0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 3, 0, 0, 0, 0, + 4, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, + 7, 8, 8, 8, 8, 8, 8, 8, +}; + +static RE_UINT8 re_other_default_ignorable_code_point_stage_4[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, + 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, + 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, + 0, 0, 0, 0, 0, 0, 6, 7, 8, 0, 9, 9, 0, 0, 0, 10, + 9, 9, 9, 9, 9, 9, 9, 9, +}; + +static RE_UINT8 re_other_default_ignorable_code_point_stage_5[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 48, 0, + 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, + 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 255, 1, + 253, 255, 255, 255, 0, 0, 0, 0, 255, 255, 255, 255, 255, 255, 255, 255, + 0, 0, 0, 0, 0, 0, 255, 255, +}; + +/* Other_Default_Ignorable_Code_Point: 281 bytes. 
*/ + +RE_UINT32 re_get_other_default_ignorable_code_point(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 16; + code = ch ^ (f << 16); + pos = (RE_UINT32)re_other_default_ignorable_code_point_stage_1[f] << 4; + f = code >> 12; + code ^= f << 12; + pos = (RE_UINT32)re_other_default_ignorable_code_point_stage_2[pos + f] << 3; + f = code >> 9; + code ^= f << 9; + pos = (RE_UINT32)re_other_default_ignorable_code_point_stage_3[pos + f] << 3; + f = code >> 6; + code ^= f << 6; + pos = (RE_UINT32)re_other_default_ignorable_code_point_stage_4[pos + f] << 6; + pos += code; + value = (re_other_default_ignorable_code_point_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Deprecated. */ + +static RE_UINT8 re_deprecated_stage_1[] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, + 1, 1, +}; + +static RE_UINT8 re_deprecated_stage_2[] = { + 0, 1, 2, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, +}; + +static RE_UINT8 re_deprecated_stage_3[] = { + 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 3, + 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, + 5, 0, 0, 6, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, +}; + +static RE_UINT8 re_deprecated_stage_4[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, + 0, 6, 0, 0, 0, 0, 0, 0, 7, 0, 0, 8, 0, 0, 0, 0, +}; + +static RE_UINT8 re_deprecated_stage_5[] = { + 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 8, 0, 0, 0, 128, 2, + 24, 0, 0, 0, 0, 252, 0, 0, 0, 6, 0, 0, 2, 0, 0, 0, + 0, 0, 0, 128, +}; + +/* Deprecated: 230 bytes. 
*/ + +RE_UINT32 re_get_deprecated(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 15; + code = ch ^ (f << 15); + pos = (RE_UINT32)re_deprecated_stage_1[f] << 4; + f = code >> 11; + code ^= f << 11; + pos = (RE_UINT32)re_deprecated_stage_2[pos + f] << 3; + f = code >> 8; + code ^= f << 8; + pos = (RE_UINT32)re_deprecated_stage_3[pos + f] << 3; + f = code >> 5; + code ^= f << 5; + pos = (RE_UINT32)re_deprecated_stage_4[pos + f] << 5; + pos += code; + value = (re_deprecated_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Soft_Dotted. */ + +static RE_UINT8 re_soft_dotted_stage_1[] = { + 0, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, +}; + +static RE_UINT8 re_soft_dotted_stage_2[] = { + 0, 1, 1, 2, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, +}; + +static RE_UINT8 re_soft_dotted_stage_3[] = { + 0, 1, 2, 3, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 6, 7, 5, 8, 9, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 10, 5, 5, 5, 5, 5, 5, 5, 11, 12, 13, 5, +}; + +static RE_UINT8 re_soft_dotted_stage_4[] = { + 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, + 0, 0, 3, 4, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, + 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 9, 10, 11, 0, 0, 0, 12, 0, 0, 0, 0, 13, 0, + 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 15, 0, 0, 0, 0, 0, + 0, 0, 0, 16, 0, 0, 0, 0, 0, 17, 18, 0, 19, 20, 0, 21, + 0, 22, 23, 0, 24, 0, 17, 18, 0, 19, 20, 0, 21, 0, 0, 0, +}; + +static RE_UINT8 re_soft_dotted_stage_5[] = { + 0, 0, 0, 0, 0, 6, 0, 0, 0, 128, 0, 0, 0, 2, 0, 0, + 0, 1, 0, 0, 0, 0, 0, 32, 0, 0, 4, 0, 0, 0, 8, 0, + 0, 0, 64, 1, 4, 0, 0, 0, 0, 0, 64, 0, 16, 1, 0, 0, + 0, 32, 0, 0, 0, 8, 0, 0, 0, 0, 2, 0, 0, 3, 0, 0, + 0, 0, 0, 16, 12, 0, 0, 0, 0, 0, 192, 0, 0, 12, 0, 0, + 0, 0, 0, 192, 0, 0, 12, 0, 192, 0, 0, 0, 0, 0, 0, 12, + 0, 192, 0, 0, +}; + +/* 
Soft_Dotted: 342 bytes. */ + +RE_UINT32 re_get_soft_dotted(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 15; + code = ch ^ (f << 15); + pos = (RE_UINT32)re_soft_dotted_stage_1[f] << 4; + f = code >> 11; + code ^= f << 11; + pos = (RE_UINT32)re_soft_dotted_stage_2[pos + f] << 3; + f = code >> 8; + code ^= f << 8; + pos = (RE_UINT32)re_soft_dotted_stage_3[pos + f] << 3; + f = code >> 5; + code ^= f << 5; + pos = (RE_UINT32)re_soft_dotted_stage_4[pos + f] << 5; + pos += code; + value = (re_soft_dotted_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Logical_Order_Exception. */ + +static RE_UINT8 re_logical_order_exception_stage_1[] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, +}; + +static RE_UINT8 re_logical_order_exception_stage_2[] = { + 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +}; + +static RE_UINT8 re_logical_order_exception_stage_3[] = { + 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, +}; + +static RE_UINT8 re_logical_order_exception_stage_4[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0, 0, +}; + +static RE_UINT8 re_logical_order_exception_stage_5[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 31, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 224, 4, 0, 0, 0, 0, 0, 0, 96, 26, +}; + +/* Logical_Order_Exception: 145 bytes. 
*/ + +RE_UINT32 re_get_logical_order_exception(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 16; + code = ch ^ (f << 16); + pos = (RE_UINT32)re_logical_order_exception_stage_1[f] << 4; + f = code >> 12; + code ^= f << 12; + pos = (RE_UINT32)re_logical_order_exception_stage_2[pos + f] << 3; + f = code >> 9; + code ^= f << 9; + pos = (RE_UINT32)re_logical_order_exception_stage_3[pos + f] << 3; + f = code >> 6; + code ^= f << 6; + pos = (RE_UINT32)re_logical_order_exception_stage_4[pos + f] << 6; + pos += code; + value = (re_logical_order_exception_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Other_ID_Start. */ + +static RE_UINT8 re_other_id_start_stage_1[] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, +}; + +static RE_UINT8 re_other_id_start_stage_2[] = { + 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +static RE_UINT8 re_other_id_start_stage_3[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, +}; + +static RE_UINT8 re_other_id_start_stage_4[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 2, 0, 0, 0, 0, 0, +}; + +static RE_UINT8 re_other_id_start_stage_5[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 64, 0, 0, + 0, 0, 0, 24, 0, 0, 0, 0, +}; + +/* Other_ID_Start: 113 bytes. */ + +RE_UINT32 re_get_other_id_start(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 16; + code = ch ^ (f << 16); + pos = (RE_UINT32)re_other_id_start_stage_1[f] << 3; + f = code >> 13; + code ^= f << 13; + pos = (RE_UINT32)re_other_id_start_stage_2[pos + f] << 4; + f = code >> 9; + code ^= f << 9; + pos = (RE_UINT32)re_other_id_start_stage_3[pos + f] << 3; + f = code >> 6; + code ^= f << 6; + pos = (RE_UINT32)re_other_id_start_stage_4[pos + f] << 6; + pos += code; + value = (re_other_id_start_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Other_ID_Continue. 
*/ + +static RE_UINT8 re_other_id_continue_stage_1[] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, +}; + +static RE_UINT8 re_other_id_continue_stage_2[] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +}; + +static RE_UINT8 re_other_id_continue_stage_3[] = { + 0, 1, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 4, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +}; + +static RE_UINT8 re_other_id_continue_stage_4[] = { + 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 4, +}; + +static RE_UINT8 re_other_id_continue_stage_5[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128, 0, + 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 254, 3, 0, + 0, 0, 0, 4, 0, 0, 0, 0, +}; + +/* Other_ID_Continue: 145 bytes. */ + +RE_UINT32 re_get_other_id_continue(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 16; + code = ch ^ (f << 16); + pos = (RE_UINT32)re_other_id_continue_stage_1[f] << 3; + f = code >> 13; + code ^= f << 13; + pos = (RE_UINT32)re_other_id_continue_stage_2[pos + f] << 4; + f = code >> 9; + code ^= f << 9; + pos = (RE_UINT32)re_other_id_continue_stage_3[pos + f] << 3; + f = code >> 6; + code ^= f << 6; + pos = (RE_UINT32)re_other_id_continue_stage_4[pos + f] << 6; + pos += code; + value = (re_other_id_continue_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* STerm. 
*/ + +static RE_UINT8 re_sterm_stage_1[] = { + 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, +}; + +static RE_UINT8 re_sterm_stage_2[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 8, 9, 7, 7, 7, 7, 7, 7, 7, 7, 7, 10, + 7, 11, 12, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 13, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 14, 7, 7, 7, 15, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, +}; + +static RE_UINT8 re_sterm_stage_3[] = { + 0, 1, 1, 1, 1, 2, 3, 4, 1, 5, 1, 1, 1, 1, 1, 1, + 6, 1, 1, 7, 1, 1, 8, 9, 10, 11, 12, 13, 14, 1, 1, 1, + 15, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 16, 1, + 17, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 18, 1, 19, 1, 20, 21, 22, 23, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 24, 25, 1, 1, 26, 1, 1, 1, 1, 1, + 27, 28, 29, 1, 1, 30, 31, 32, 1, 1, 33, 34, 1, 1, 1, 1, + 1, 1, 1, 1, 35, 1, 1, 1, 1, 1, 36, 1, 1, 1, 1, 1, +}; + +static RE_UINT8 re_sterm_stage_4[] = { + 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0, 0, 4, 0, + 5, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, 0, + 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, + 0, 0, 0, 10, 0, 0, 0, 0, 0, 11, 0, 0, 0, 0, 0, 0, + 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, + 0, 15, 0, 16, 0, 0, 0, 0, 0, 17, 18, 0, 0, 0, 0, 0, + 0, 19, 0, 0, 0, 0, 0, 0, 20, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 3, 21, 0, 0, 0, 0, 0, 0, 22, + 0, 0, 0, 23, 0, 0, 21, 0, 0, 24, 0, 0, 0, 0, 25, 0, + 0, 0, 26, 0, 0, 0, 0, 27, 0, 0, 0, 0, 0, 0, 0, 28, + 0, 0, 29, 0, 0, 0, 0, 0, 1, 0, 0, 30, 0, 0, 0, 0, + 0, 0, 23, 0, 0, 0, 0, 0, 0, 0, 31, 0, 0, 16, 32, 0, + 0, 0, 33, 0, 0, 0, 34, 0, 0, 35, 0, 0, 0, 2, 0, 0, + 0, 0, 0, 0, 0, 0, 36, 0, 0, 0, 37, 0, 0, 0, 0, 0, + 0, 38, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 0, 39, + 0, 40, 41, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, + 0, 0, 0, 0, 42, 0, 0, 0, +}; + +static RE_UINT8 re_sterm_stage_5[] = { + 0, 0, 0, 0, 2, 
64, 0, 128, 0, 2, 0, 0, 0, 0, 0, 128, + 0, 0, 16, 0, 7, 0, 0, 0, 0, 0, 0, 2, 48, 0, 0, 0, + 0, 12, 0, 0, 132, 1, 0, 0, 0, 64, 0, 0, 0, 0, 96, 0, + 8, 2, 0, 0, 0, 15, 0, 0, 0, 0, 0, 204, 0, 0, 0, 24, + 0, 0, 0, 192, 0, 0, 0, 48, 128, 3, 0, 0, 0, 64, 0, 16, + 4, 0, 0, 0, 0, 192, 0, 0, 0, 0, 136, 0, 0, 0, 192, 0, + 0, 128, 0, 0, 0, 3, 0, 0, 0, 0, 0, 224, 0, 0, 3, 0, + 0, 8, 0, 0, 0, 0, 196, 0, 2, 0, 0, 0, 128, 1, 0, 0, + 3, 0, 0, 0, 14, 0, 0, 0, 96, 32, 0, 192, 0, 0, 0, 27, + 12, 254, 255, 0, 6, 0, 0, 0, 0, 0, 0, 112, 0, 0, 32, 0, + 0, 0, 128, 1, 16, 0, 0, 0, 0, 1, 0, 0, +}; + +/* STerm: 709 bytes. */ + +RE_UINT32 re_get_sterm(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 16; + code = ch ^ (f << 16); + pos = (RE_UINT32)re_sterm_stage_1[f] << 5; + f = code >> 11; + code ^= f << 11; + pos = (RE_UINT32)re_sterm_stage_2[pos + f] << 3; + f = code >> 8; + code ^= f << 8; + pos = (RE_UINT32)re_sterm_stage_3[pos + f] << 3; + f = code >> 5; + code ^= f << 5; + pos = (RE_UINT32)re_sterm_stage_4[pos + f] << 5; + pos += code; + value = (re_sterm_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Variation_Selector. 
*/ + +static RE_UINT8 re_variation_selector_stage_1[] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, + 1, +}; + +static RE_UINT8 re_variation_selector_stage_2[] = { + 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +static RE_UINT8 re_variation_selector_stage_3[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 2, 3, 0, 0, 0, 0, 0, 0, 0, +}; + +static RE_UINT8 re_variation_selector_stage_4[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, + 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 4, +}; + +static RE_UINT8 re_variation_selector_stage_5[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 56, 0, 0, 0, 0, 0, 0, + 255, 255, 0, 0, 0, 0, 0, 0, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 0, 0, +}; + +/* Variation_Selector: 169 bytes. */ + +RE_UINT32 re_get_variation_selector(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 16; + code = ch ^ (f << 16); + pos = (RE_UINT32)re_variation_selector_stage_1[f] << 4; + f = code >> 12; + code ^= f << 12; + pos = (RE_UINT32)re_variation_selector_stage_2[pos + f] << 3; + f = code >> 9; + code ^= f << 9; + pos = (RE_UINT32)re_variation_selector_stage_3[pos + f] << 3; + f = code >> 6; + code ^= f << 6; + pos = (RE_UINT32)re_variation_selector_stage_4[pos + f] << 6; + pos += code; + value = (re_variation_selector_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Pattern_White_Space. 
*/ + +static RE_UINT8 re_pattern_white_space_stage_1[] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, +}; + +static RE_UINT8 re_pattern_white_space_stage_2[] = { + 0, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +}; + +static RE_UINT8 re_pattern_white_space_stage_3[] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 1, 1, 1, 1, 1, 1, 1, +}; + +static RE_UINT8 re_pattern_white_space_stage_4[] = { + 0, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 3, 1, 1, 1, 1, 1, 1, 1, +}; + +static RE_UINT8 re_pattern_white_space_stage_5[] = { + 0, 62, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 32, 0, 0, 0, 0, 0, 0, 0, 0, 192, 0, 0, 0, 3, 0, 0, +}; + +/* Pattern_White_Space: 129 bytes. */ + +RE_UINT32 re_get_pattern_white_space(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 16; + code = ch ^ (f << 16); + pos = (RE_UINT32)re_pattern_white_space_stage_1[f] << 4; + f = code >> 12; + code ^= f << 12; + pos = (RE_UINT32)re_pattern_white_space_stage_2[pos + f] << 3; + f = code >> 9; + code ^= f << 9; + pos = (RE_UINT32)re_pattern_white_space_stage_3[pos + f] << 3; + f = code >> 6; + code ^= f << 6; + pos = (RE_UINT32)re_pattern_white_space_stage_4[pos + f] << 6; + pos += code; + value = (re_pattern_white_space_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Pattern_Syntax. 
*/ + +static RE_UINT8 re_pattern_syntax_stage_1[] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, +}; + +static RE_UINT8 re_pattern_syntax_stage_2[] = { + 0, 1, 1, 1, 2, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +}; + +static RE_UINT8 re_pattern_syntax_stage_3[] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 3, 4, 4, 5, 4, 4, 6, 4, 4, 4, 4, 1, 1, 7, 1, + 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 10, 1, +}; + +static RE_UINT8 re_pattern_syntax_stage_4[] = { + 0, 1, 2, 2, 0, 3, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, + 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, + 8, 8, 8, 9, 10, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, + 11, 12, 0, 0, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0, + 0, 0, 14, 0, 0, 0, 0, 0, +}; + +static RE_UINT8 re_pattern_syntax_stage_5[] = { + 0, 0, 0, 0, 254, 255, 0, 252, 1, 0, 0, 120, 254, 90, 67, 136, + 0, 0, 128, 0, 0, 0, 255, 255, 255, 0, 255, 127, 254, 255, 239, 127, + 255, 255, 255, 255, 255, 255, 63, 0, 0, 0, 240, 255, 14, 255, 255, 255, + 1, 0, 1, 0, 0, 0, 0, 192, 96, 0, 0, 0, +}; + +/* Pattern_Syntax: 277 bytes. */ + +RE_UINT32 re_get_pattern_syntax(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 16; + code = ch ^ (f << 16); + pos = (RE_UINT32)re_pattern_syntax_stage_1[f] << 5; + f = code >> 11; + code ^= f << 11; + pos = (RE_UINT32)re_pattern_syntax_stage_2[pos + f] << 3; + f = code >> 8; + code ^= f << 8; + pos = (RE_UINT32)re_pattern_syntax_stage_3[pos + f] << 3; + f = code >> 5; + code ^= f << 5; + pos = (RE_UINT32)re_pattern_syntax_stage_4[pos + f] << 5; + pos += code; + value = (re_pattern_syntax_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Hangul_Syllable_Type. 
*/ + +static RE_UINT8 re_hangul_syllable_type_stage_1[] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, +}; + +static RE_UINT8 re_hangul_syllable_type_stage_2[] = { + 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +static RE_UINT8 re_hangul_syllable_type_stage_3[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 3, 0, 0, 0, 0, 0, 4, 5, 6, 7, 8, 9, 10, 4, + 5, 6, 7, 8, 9, 10, 4, 5, 6, 7, 8, 9, 10, 4, 5, 6, + 7, 8, 9, 10, 4, 5, 6, 7, 8, 9, 10, 4, 5, 6, 7, 8, + 9, 10, 4, 5, 6, 7, 8, 9, 10, 4, 5, 6, 7, 8, 9, 10, + 4, 5, 6, 7, 8, 9, 10, 4, 5, 6, 7, 8, 9, 10, 4, 5, + 6, 7, 8, 9, 10, 4, 5, 6, 7, 8, 9, 10, 4, 5, 6, 11, +}; + +static RE_UINT8 re_hangul_syllable_type_stage_4[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 4, + 5, 6, 6, 7, 6, 6, 6, 5, 6, 6, 7, 6, 6, 6, 5, 6, + 6, 7, 6, 6, 6, 5, 6, 6, 7, 6, 6, 6, 5, 6, 6, 7, + 6, 6, 6, 5, 6, 6, 7, 6, 6, 6, 5, 6, 6, 7, 6, 6, + 6, 5, 6, 6, 7, 6, 6, 6, 5, 6, 6, 7, 6, 6, 6, 5, + 6, 6, 7, 6, 6, 6, 5, 6, 6, 7, 6, 6, 6, 5, 6, 6, + 7, 6, 6, 6, 5, 6, 6, 7, 6, 6, 6, 5, 6, 6, 7, 6, + 6, 6, 5, 6, 6, 7, 6, 6, 6, 5, 6, 6, 7, 6, 6, 6, + 6, 5, 6, 6, 8, 0, 2, 2, 9, 10, 3, 3, 3, 3, 3, 11, +}; + +static RE_UINT8 re_hangul_syllable_type_stage_5[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, + 1, 1, 1, 1, 1, 0, 0, 0, 4, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 5, 5, 5, + 5, 5, 5, 5, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 0, + 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, +}; + +/* Hangul_Syllable_Type: 497 bytes. 
*/ + +RE_UINT32 re_get_hangul_syllable_type(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 16; + code = ch ^ (f << 16); + pos = (RE_UINT32)re_hangul_syllable_type_stage_1[f] << 5; + f = code >> 11; + code ^= f << 11; + pos = (RE_UINT32)re_hangul_syllable_type_stage_2[pos + f] << 4; + f = code >> 7; + code ^= f << 7; + pos = (RE_UINT32)re_hangul_syllable_type_stage_3[pos + f] << 4; + f = code >> 3; + code ^= f << 3; + pos = (RE_UINT32)re_hangul_syllable_type_stage_4[pos + f] << 3; + value = re_hangul_syllable_type_stage_5[pos + code]; + + return value; +} + +/* Bidi_Class. */ + +static RE_UINT8 re_bidi_class_stage_1[] = { + 0, 1, 2, 3, 4, 5, 5, 5, 5, 5, 6, 5, 5, 5, 5, 7, + 8, 9, 5, 5, 5, 5, 10, 5, 5, 5, 5, 11, 5, 12, 13, 14, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 15, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 15, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 15, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 15, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 15, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 15, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 15, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 15, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 15, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 15, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 15, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 15, + 16, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 15, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 15, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 15, +}; + +static RE_UINT8 re_bidi_class_stage_2[] = { + 0, 1, 2, 2, 2, 3, 4, 5, 2, 6, 2, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, + 28, 29, 2, 2, 2, 2, 30, 31, 32, 2, 2, 2, 2, 33, 34, 35, + 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 2, 46, 2, 2, 2, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 53, 53, 53, 58, 53, 53, + 2, 2, 53, 53, 53, 53, 59, 60, 2, 61, 62, 63, 64, 65, 53, 66, + 67, 68, 2, 69, 70, 71, 72, 
73, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 74, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 75, 2, 2, 76, 77, 78, 79, + 80, 81, 82, 83, 84, 85, 2, 86, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 87, 88, 88, 88, 89, 90, 91, 92, 93, 94, + 2, 2, 95, 96, 2, 97, 98, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 99, 99, 100, 99, 101, 102, 103, 99, 99, 99, 99, 99, 104, 99, 99, 99, + 105, 106, 107, 108, 109, 110, 111, 2, 2, 112, 2, 113, 114, 115, 116, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 117, 118, 2, 2, 2, 2, 2, 2, 2, 2, 119, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 120, 2, 2, 2, 2, 2, 2, + 2, 2, 121, 122, 123, 2, 124, 2, 2, 2, 2, 2, 2, 125, 126, 127, + 2, 2, 2, 2, 128, 129, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 99, 130, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 88, 131, 99, 99, + 132, 133, 134, 2, 2, 2, 53, 53, 53, 53, 135, 136, 53, 137, 138, 139, + 140, 141, 142, 143, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 144, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 144, + 145, 145, 146, 147, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, + 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, +}; + +static RE_UINT8 re_bidi_class_stage_3[] = { + 0, 1, 2, 3, 4, 5, 4, 6, 7, 8, 9, 10, 11, 12, 11, 12, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 13, 14, 14, 15, 16, + 17, 17, 17, 17, 17, 17, 17, 18, 19, 11, 11, 11, 11, 11, 11, 20, + 21, 11, 11, 11, 11, 11, 11, 11, 22, 23, 17, 24, 25, 26, 26, 26, + 27, 28, 29, 29, 30, 17, 31, 32, 29, 29, 29, 29, 29, 33, 34, 35, + 29, 36, 29, 17, 28, 29, 29, 29, 29, 
29, 37, 32, 26, 26, 38, 39, + 26, 40, 41, 26, 26, 42, 26, 26, 26, 26, 29, 29, 29, 29, 43, 17, + 44, 11, 11, 45, 46, 47, 48, 11, 49, 11, 11, 50, 51, 11, 48, 52, + 53, 11, 11, 50, 54, 49, 11, 55, 53, 11, 11, 50, 56, 11, 48, 57, + 49, 11, 11, 58, 51, 59, 48, 11, 60, 11, 11, 11, 61, 11, 11, 62, + 63, 11, 11, 64, 65, 66, 48, 67, 49, 11, 11, 50, 68, 11, 48, 11, + 49, 11, 11, 11, 51, 11, 48, 11, 11, 11, 11, 11, 69, 70, 11, 11, + 11, 11, 11, 71, 72, 11, 11, 11, 11, 11, 11, 73, 74, 11, 11, 11, + 11, 75, 11, 76, 11, 11, 11, 77, 78, 79, 17, 80, 59, 11, 11, 11, + 11, 11, 81, 82, 11, 83, 63, 84, 85, 86, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 81, 11, 11, 11, 87, 11, 11, 11, 11, 11, 11, + 4, 11, 11, 11, 11, 11, 11, 11, 88, 89, 11, 11, 11, 11, 11, 11, + 11, 90, 11, 90, 11, 48, 11, 48, 11, 11, 11, 91, 92, 93, 11, 87, + 94, 11, 11, 11, 11, 11, 11, 11, 11, 11, 95, 11, 11, 11, 11, 11, + 11, 11, 96, 97, 98, 11, 11, 11, 11, 11, 11, 11, 11, 99, 16, 16, + 11, 100, 11, 11, 11, 101, 102, 103, 11, 11, 11, 104, 11, 11, 11, 11, + 105, 11, 11, 106, 60, 11, 107, 105, 108, 11, 109, 11, 11, 11, 110, 108, + 11, 11, 111, 112, 11, 11, 11, 11, 11, 11, 11, 11, 11, 113, 114, 115, + 11, 11, 11, 11, 17, 17, 17, 116, 11, 11, 11, 117, 118, 119, 119, 120, + 121, 16, 122, 123, 124, 125, 126, 127, 128, 11, 129, 129, 129, 17, 17, 63, + 130, 131, 132, 133, 134, 16, 11, 11, 135, 16, 16, 16, 16, 16, 16, 16, + 16, 136, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 137, 11, 11, 11, 5, 16, 138, 16, 16, 16, 16, 16, 139, + 16, 16, 140, 11, 139, 11, 16, 16, 141, 142, 11, 11, 11, 11, 143, 16, + 16, 16, 144, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 145, + 16, 146, 16, 147, 148, 149, 150, 11, 11, 11, 11, 11, 11, 11, 151, 152, + 11, 11, 11, 11, 11, 11, 11, 153, 11, 11, 11, 11, 11, 11, 17, 17, + 16, 16, 16, 16, 154, 11, 11, 11, 16, 155, 16, 16, 16, 16, 16, 156, + 16, 16, 16, 16, 16, 137, 11, 157, 158, 16, 159, 160, 11, 11, 11, 11, + 11, 161, 4, 11, 11, 11, 11, 162, 11, 11, 11, 11, 16, 
16, 156, 11, + 11, 120, 11, 11, 11, 16, 11, 163, 11, 11, 11, 164, 150, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 165, 11, 11, 11, 11, 11, 99, 11, 166, + 11, 11, 11, 11, 16, 16, 16, 16, 11, 16, 16, 16, 140, 11, 11, 11, + 119, 11, 11, 11, 11, 11, 153, 167, 11, 64, 11, 11, 11, 11, 11, 108, + 16, 16, 149, 11, 11, 11, 11, 11, 168, 11, 11, 11, 11, 11, 11, 11, + 169, 11, 170, 171, 11, 11, 11, 172, 11, 11, 11, 11, 173, 11, 17, 108, + 11, 11, 174, 11, 175, 108, 11, 11, 44, 11, 11, 176, 11, 11, 177, 11, + 11, 11, 178, 179, 180, 11, 11, 50, 11, 11, 11, 181, 49, 11, 68, 59, + 11, 11, 11, 11, 11, 11, 182, 11, 11, 183, 184, 26, 26, 29, 29, 29, + 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 185, 29, 29, 29, 29, + 29, 29, 29, 29, 29, 8, 8, 186, 17, 87, 17, 16, 16, 187, 188, 29, + 29, 29, 29, 29, 29, 29, 29, 189, 190, 3, 4, 5, 4, 5, 137, 11, + 11, 11, 11, 11, 11, 11, 191, 192, 193, 11, 11, 11, 16, 16, 16, 16, + 194, 157, 4, 11, 11, 11, 11, 86, 11, 11, 11, 11, 11, 11, 195, 142, + 11, 11, 11, 11, 11, 11, 11, 196, 26, 26, 26, 26, 26, 26, 26, 26, + 26, 197, 26, 26, 26, 26, 26, 26, 198, 26, 26, 199, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 200, 26, 26, 26, 26, 201, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 202, 203, 49, 11, 11, 204, 205, 14, 137, 153, + 108, 11, 11, 206, 11, 11, 11, 11, 44, 11, 207, 208, 11, 11, 11, 209, + 108, 11, 11, 210, 211, 11, 11, 11, 11, 11, 153, 212, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 153, 213, 11, 108, 11, 11, 50, 63, 11, 214, 208, + 11, 11, 11, 215, 216, 11, 11, 11, 11, 11, 11, 217, 63, 68, 11, 11, + 11, 11, 11, 218, 63, 11, 11, 11, 11, 11, 219, 220, 11, 11, 11, 11, + 11, 81, 221, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 208, + 11, 11, 11, 205, 11, 11, 11, 11, 153, 44, 11, 11, 11, 11, 11, 11, + 11, 222, 223, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 224, 225, + 226, 11, 227, 11, 11, 11, 11, 11, 16, 16, 16, 16, 228, 11, 11, 11, + 16, 16, 16, 16, 16, 140, 11, 11, 11, 11, 11, 11, 11, 162, 11, 11, + 11, 229, 11, 11, 166, 11, 11, 11, 230, 11, 11, 11, 
231, 232, 232, 232, + 17, 17, 17, 233, 17, 17, 80, 177, 173, 107, 234, 11, 11, 11, 11, 11, + 26, 26, 26, 26, 26, 235, 26, 26, 29, 29, 29, 29, 29, 29, 29, 236, + 16, 16, 157, 16, 16, 16, 16, 16, 16, 156, 237, 164, 164, 164, 16, 137, + 238, 11, 11, 11, 11, 11, 133, 11, 16, 16, 16, 16, 16, 16, 16, 155, + 16, 16, 239, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 4, 194, 156, + 16, 16, 16, 16, 16, 16, 16, 156, 16, 16, 16, 16, 16, 240, 11, 11, + 157, 16, 16, 16, 241, 87, 16, 16, 241, 16, 242, 11, 11, 11, 11, 11, + 11, 243, 11, 11, 11, 11, 11, 11, 240, 11, 11, 11, 4, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 244, 8, 8, 8, 8, 8, 8, 8, 8, + 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 8, +}; + +static RE_UINT8 re_bidi_class_stage_4[] = { + 0, 0, 1, 2, 0, 0, 0, 3, 4, 5, 6, 7, 8, 8, 9, 10, + 11, 12, 12, 12, 12, 12, 13, 10, 12, 12, 13, 14, 0, 15, 0, 0, + 0, 0, 0, 0, 16, 5, 17, 18, 19, 20, 21, 10, 12, 12, 12, 12, + 12, 13, 12, 12, 12, 12, 22, 12, 23, 10, 10, 10, 12, 24, 10, 17, + 10, 10, 10, 10, 25, 25, 25, 25, 12, 26, 12, 27, 12, 17, 12, 12, + 12, 27, 12, 12, 28, 25, 29, 12, 12, 12, 27, 30, 31, 25, 25, 25, + 25, 25, 25, 32, 33, 32, 34, 34, 34, 34, 34, 34, 35, 36, 37, 38, + 25, 25, 39, 40, 40, 40, 40, 40, 40, 40, 41, 25, 35, 35, 42, 43, + 44, 40, 40, 40, 40, 45, 25, 46, 25, 47, 48, 49, 8, 8, 50, 40, + 51, 40, 40, 40, 40, 45, 25, 25, 34, 34, 52, 25, 25, 53, 54, 34, + 34, 55, 32, 25, 25, 31, 31, 56, 34, 34, 31, 34, 41, 25, 25, 25, + 57, 12, 12, 12, 12, 12, 58, 59, 60, 25, 59, 61, 60, 25, 12, 12, + 62, 12, 12, 12, 61, 12, 12, 12, 12, 12, 12, 59, 60, 59, 12, 61, + 63, 12, 64, 12, 65, 12, 12, 12, 65, 28, 66, 29, 29, 61, 12, 12, + 60, 67, 59, 61, 68, 12, 12, 12, 12, 12, 12, 66, 12, 58, 12, 12, + 58, 12, 12, 12, 59, 12, 12, 61, 13, 10, 69, 12, 59, 12, 12, 12, + 12, 12, 12, 62, 59, 62, 70, 29, 12, 65, 12, 12, 12, 12, 10, 71, + 12, 12, 12, 29, 12, 12, 58, 12, 62, 72, 12, 12, 61, 25, 57, 64, + 12, 28, 25, 57, 61, 25, 67, 59, 12, 12, 25, 29, 12, 12, 29, 12, + 12, 73, 74, 26, 60, 
25, 25, 57, 25, 70, 12, 60, 25, 25, 60, 25, + 25, 25, 25, 59, 12, 12, 12, 60, 70, 25, 65, 65, 12, 12, 29, 62, + 60, 59, 12, 12, 58, 65, 12, 61, 12, 12, 12, 61, 10, 10, 26, 12, + 75, 12, 12, 12, 12, 12, 13, 11, 62, 59, 12, 12, 12, 67, 25, 29, + 12, 58, 60, 25, 25, 12, 64, 61, 10, 10, 76, 77, 12, 12, 61, 12, + 57, 28, 59, 12, 58, 12, 60, 12, 11, 26, 12, 12, 12, 12, 12, 23, + 12, 28, 66, 12, 12, 58, 25, 57, 72, 60, 25, 59, 28, 25, 25, 66, + 25, 25, 25, 57, 25, 12, 12, 12, 12, 70, 57, 59, 12, 12, 28, 25, + 29, 12, 12, 12, 62, 29, 67, 29, 12, 58, 29, 73, 12, 12, 12, 25, + 25, 62, 12, 12, 57, 25, 25, 25, 70, 25, 59, 61, 12, 59, 29, 12, + 25, 29, 12, 25, 12, 12, 12, 78, 26, 12, 12, 24, 12, 12, 12, 24, + 12, 12, 12, 22, 79, 79, 80, 81, 10, 10, 82, 83, 84, 85, 10, 10, + 10, 86, 10, 10, 10, 10, 10, 87, 0, 88, 89, 0, 90, 8, 91, 71, + 8, 8, 91, 71, 84, 84, 84, 84, 17, 71, 26, 12, 12, 20, 11, 23, + 10, 78, 92, 93, 12, 12, 23, 12, 10, 11, 23, 26, 12, 12, 24, 12, + 94, 10, 10, 10, 10, 26, 12, 12, 10, 20, 10, 10, 10, 10, 71, 12, + 10, 71, 12, 12, 10, 10, 8, 8, 8, 8, 8, 12, 12, 12, 23, 10, + 10, 10, 10, 24, 10, 23, 10, 10, 10, 26, 10, 10, 10, 10, 26, 24, + 10, 10, 20, 10, 26, 12, 12, 12, 12, 12, 12, 10, 12, 24, 71, 28, + 29, 12, 24, 10, 12, 12, 12, 28, 71, 12, 12, 12, 10, 10, 17, 10, + 10, 12, 12, 12, 10, 10, 10, 12, 95, 11, 10, 10, 11, 12, 62, 29, + 11, 23, 12, 24, 12, 12, 96, 11, 12, 12, 13, 12, 12, 12, 12, 71, + 24, 10, 10, 10, 12, 13, 71, 12, 12, 12, 12, 13, 97, 25, 25, 98, + 12, 12, 11, 12, 58, 58, 28, 12, 12, 65, 10, 12, 12, 12, 99, 12, + 12, 10, 12, 12, 12, 59, 12, 12, 12, 62, 25, 29, 12, 28, 25, 25, + 28, 62, 29, 59, 12, 61, 12, 12, 12, 12, 60, 57, 65, 65, 12, 12, + 28, 12, 12, 59, 70, 66, 59, 62, 12, 61, 59, 61, 12, 12, 12, 100, + 34, 34, 101, 34, 40, 40, 40, 102, 40, 40, 40, 103, 104, 105, 10, 106, + 107, 71, 108, 12, 40, 40, 40, 109, 30, 5, 6, 7, 5, 110, 10, 71, + 0, 0, 111, 112, 92, 12, 12, 12, 10, 10, 10, 11, 113, 8, 8, 8, + 12, 62, 57, 12, 34, 34, 34, 114, 31, 33, 34, 
25, 34, 34, 115, 52, + 34, 33, 34, 34, 34, 34, 116, 10, 35, 35, 35, 35, 35, 35, 35, 117, + 12, 12, 25, 25, 25, 57, 12, 12, 28, 57, 65, 12, 12, 28, 25, 60, + 25, 59, 12, 12, 28, 12, 12, 12, 12, 62, 25, 57, 12, 12, 62, 59, + 29, 70, 12, 12, 28, 25, 57, 12, 12, 62, 25, 59, 28, 25, 72, 28, + 70, 12, 12, 12, 62, 29, 12, 67, 28, 25, 57, 73, 12, 12, 28, 61, + 25, 67, 12, 12, 62, 67, 25, 12, 12, 12, 12, 65, 0, 12, 12, 12, + 12, 28, 29, 12, 118, 0, 119, 25, 57, 60, 25, 12, 12, 12, 62, 29, + 120, 121, 12, 12, 12, 92, 12, 12, 12, 12, 92, 12, 13, 12, 12, 122, + 8, 8, 8, 8, 25, 57, 28, 25, 60, 25, 25, 25, 25, 115, 34, 34, + 123, 40, 40, 40, 10, 10, 10, 71, 8, 8, 124, 11, 10, 24, 10, 10, + 10, 11, 12, 12, 10, 10, 12, 12, 10, 10, 10, 26, 10, 10, 11, 12, + 12, 12, 12, 125, +}; + +static RE_UINT8 re_bidi_class_stage_5[] = { + 11, 11, 11, 11, 11, 8, 7, 8, 9, 7, 11, 11, 7, 7, 7, 8, + 9, 10, 10, 4, 4, 4, 10, 10, 10, 10, 10, 3, 6, 3, 6, 6, + 2, 2, 2, 2, 2, 2, 6, 10, 10, 10, 10, 10, 10, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 10, 10, 10, 10, 11, 11, 7, 11, 11, + 6, 10, 4, 4, 10, 10, 0, 10, 10, 11, 10, 10, 4, 4, 2, 2, + 10, 0, 10, 10, 10, 2, 0, 10, 0, 10, 10, 0, 0, 0, 10, 10, + 0, 10, 10, 10, 12, 12, 12, 12, 10, 10, 0, 0, 0, 0, 10, 0, + 0, 0, 0, 12, 12, 12, 0, 0, 0, 10, 10, 4, 1, 12, 12, 12, + 12, 12, 1, 12, 1, 12, 12, 1, 1, 1, 1, 1, 5, 5, 5, 5, + 5, 5, 10, 10, 13, 4, 4, 13, 6, 13, 10, 10, 12, 12, 12, 13, + 13, 13, 13, 13, 13, 13, 13, 12, 5, 5, 4, 5, 5, 13, 13, 13, + 12, 13, 13, 13, 13, 13, 12, 12, 12, 5, 10, 12, 12, 13, 13, 12, + 12, 10, 12, 12, 12, 12, 13, 13, 2, 2, 13, 13, 13, 12, 13, 13, + 1, 1, 1, 12, 1, 1, 10, 10, 10, 10, 1, 1, 1, 1, 12, 12, + 12, 12, 1, 1, 12, 12, 12, 0, 0, 0, 12, 0, 12, 0, 0, 0, + 0, 12, 12, 12, 0, 12, 0, 0, 0, 0, 12, 12, 0, 0, 4, 4, + 0, 0, 0, 4, 0, 12, 12, 0, 12, 0, 0, 12, 12, 12, 0, 12, + 0, 4, 0, 0, 10, 4, 10, 0, 12, 0, 12, 12, 10, 10, 10, 0, + 12, 0, 12, 0, 0, 12, 0, 12, 0, 12, 10, 10, 9, 0, 0, 0, + 10, 10, 10, 12, 12, 12, 11, 0, 0, 10, 0, 10, 9, 9, 9, 9, + 9, 9, 9, 
11, 11, 11, 0, 1, 9, 7, 16, 17, 18, 14, 15, 6, + 4, 4, 4, 4, 4, 10, 10, 10, 6, 10, 10, 10, 10, 10, 10, 9, + 11, 11, 19, 20, 21, 22, 11, 11, 2, 0, 0, 0, 2, 2, 3, 3, + 0, 10, 0, 0, 0, 0, 4, 0, 10, 10, 3, 4, 9, 10, 10, 10, + 0, 12, 12, 10, 12, 12, 12, 10, 12, 12, 10, 10, 4, 4, 0, 0, + 0, 1, 12, 1, 1, 3, 1, 1, 13, 13, 10, 10, 13, 10, 13, 13, + 6, 10, 6, 0, 10, 6, 10, 10, 10, 10, 10, 4, 10, 10, 3, 3, + 10, 4, 4, 10, 13, 13, 13, 11, 10, 4, 4, 0, 11, 10, 10, 10, + 10, 10, 11, 11, 12, 2, 2, 2, 1, 1, 1, 10, 12, 12, 12, 1, + 1, 10, 10, 10, 5, 5, 5, 1, 0, 0, 0, 11, 11, 11, 11, 12, + 10, 10, 12, 12, 12, 10, 0, 0, 0, 0, 2, 2, 10, 10, 13, 13, + 2, 2, 2, 10, 0, 0, 11, 11, +}; + +/* Bidi_Class: 3484 bytes. */ + +RE_UINT32 re_get_bidi_class(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 12; + code = ch ^ (f << 12); + pos = (RE_UINT32)re_bidi_class_stage_1[f] << 5; + f = code >> 7; + code ^= f << 7; + pos = (RE_UINT32)re_bidi_class_stage_2[pos + f] << 3; + f = code >> 4; + code ^= f << 4; + pos = (RE_UINT32)re_bidi_class_stage_3[pos + f] << 2; + f = code >> 2; + code ^= f << 2; + pos = (RE_UINT32)re_bidi_class_stage_4[pos + f] << 2; + value = re_bidi_class_stage_5[pos + code]; + + return value; +} + +/* Canonical_Combining_Class. 
*/ + +static RE_UINT8 re_canonical_combining_class_stage_1[] = { + 0, 1, 2, 2, 2, 3, 2, 4, 5, 2, 2, 6, 2, 7, 8, 9, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, +}; + +static RE_UINT8 re_canonical_combining_class_stage_2[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 10, 11, 12, 13, 0, + 14, 0, 0, 0, 0, 0, 15, 0, 16, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 17, 18, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 20, 0, 21, + 22, 23, 0, 0, 0, 24, 0, 0, 25, 26, 27, 28, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 29, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 30, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 31, 32, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 33, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +static RE_UINT8 re_canonical_combining_class_stage_3[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 0, 0, 0, 0, + 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 6, 7, 8, 0, + 9, 0, 10, 11, 0, 0, 12, 13, 14, 15, 16, 0, 0, 0, 0, 17, + 18, 19, 20, 0, 0, 0, 0, 21, 0, 22, 23, 0, 0, 22, 24, 0, + 0, 22, 24, 0, 0, 22, 24, 0, 0, 22, 24, 0, 0, 0, 24, 0, + 0, 0, 25, 0, 0, 22, 24, 0, 0, 0, 24, 0, 0, 0, 26, 0, + 0, 27, 28, 0, 0, 29, 30, 0, 31, 32, 0, 33, 34, 0, 35, 0, + 0, 36, 0, 0, 37, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 38, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 39, 39, 0, 0, 0, 0, 40, 0, + 0, 0, 0, 0, 0, 41, 0, 0, 0, 42, 0, 0, 0, 0, 0, 0, + 43, 0, 0, 44, 0, 45, 0, 0, 0, 46, 47, 48, 0, 49, 0, 50, + 0, 51, 0, 0, 0, 0, 52, 53, 0, 0, 0, 0, 0, 0, 54, 55, + 0, 0, 0, 0, 0, 0, 56, 57, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 58, 0, 0, 0, 59, 0, 
0, 0, 60, + 0, 61, 0, 0, 62, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 63, 64, 0, 0, 65, 0, 0, 0, 0, 0, 0, 0, 0, + 66, 0, 0, 0, 0, 0, 47, 67, 0, 68, 69, 0, 0, 70, 71, 0, + 0, 0, 0, 0, 0, 72, 73, 74, 0, 0, 0, 0, 0, 0, 0, 24, + 0, 0, 0, 0, 0, 0, 0, 0, 75, 0, 0, 0, 0, 0, 0, 0, + 0, 76, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 77, + 0, 0, 0, 0, 0, 0, 0, 78, 0, 0, 0, 79, 0, 0, 0, 0, + 80, 81, 0, 0, 0, 0, 0, 82, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 66, 59, 0, 83, 0, 0, 84, 85, 0, 70, 0, 0, 86, 0, + 0, 87, 0, 0, 0, 0, 0, 88, 0, 22, 24, 89, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 90, 0, 0, 0, 0, 0, 0, 59, 91, 0, + 0, 59, 0, 0, 0, 92, 0, 0, 0, 93, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 94, 0, 95, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 96, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 97, 98, 99, 0, 0, + 0, 0, 100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 101, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +static RE_UINT8 re_canonical_combining_class_stage_4[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 3, 4, + 5, 6, 7, 4, 4, 8, 9, 10, 1, 11, 12, 13, 14, 15, 16, 17, + 18, 1, 1, 1, 0, 0, 0, 0, 19, 1, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 20, 21, 22, 1, 23, 4, 21, 24, 25, 26, 27, 28, + 29, 30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 31, 0, + 0, 0, 32, 33, 34, 35, 1, 36, 0, 0, 0, 0, 37, 0, 0, 0, + 0, 0, 0, 0, 0, 38, 1, 39, 14, 39, 40, 41, 0, 0, 0, 0, + 0, 0, 0, 0, 42, 0, 0, 0, 0, 0, 0, 0, 43, 36, 44, 45, + 21, 45, 46, 0, 0, 0, 0, 0, 0, 0, 19, 1, 21, 0, 0, 0, + 0, 0, 0, 0, 0, 38, 47, 1, 1, 48, 48, 49, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 50, 0, 51, 21, 43, 52, 53, 21, 35, 1, + 0, 0, 0, 0, 0, 0, 0, 54, 0, 0, 0, 55, 56, 57, 0, 0, + 0, 0, 0, 55, 0, 0, 0, 0, 0, 0, 0, 55, 0, 58, 0, 0, + 0, 0, 59, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60, 0, + 0, 0, 61, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 62, 0, + 0, 0, 63, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 64, 0, + 0, 0, 0, 0, 0, 65, 66, 0, 0, 0, 0, 0, 67, 68, 69, 70, + 71, 72, 0, 0, 0, 0, 0, 0, 0, 73, 0, 0, 0, 
0, 0, 0, + 0, 0, 0, 0, 0, 74, 75, 0, 0, 0, 0, 76, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 48, 0, 0, 0, 0, 0, 77, 0, 0, + 0, 0, 0, 0, 59, 0, 0, 78, 0, 0, 79, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 80, 0, 0, 0, 0, 0, 0, 19, 81, 0, + 77, 0, 0, 0, 0, 48, 1, 82, 0, 0, 0, 0, 1, 52, 15, 41, + 0, 0, 0, 0, 0, 54, 0, 0, 0, 77, 0, 0, 0, 0, 0, 0, + 0, 0, 19, 10, 1, 0, 0, 0, 0, 0, 83, 0, 0, 0, 0, 0, + 0, 84, 0, 0, 83, 0, 0, 0, 0, 0, 0, 0, 0, 74, 0, 0, + 0, 0, 0, 0, 85, 9, 12, 4, 86, 8, 87, 76, 0, 57, 49, 0, + 21, 1, 21, 88, 89, 1, 1, 1, 1, 1, 1, 1, 1, 49, 0, 90, + 0, 0, 0, 0, 91, 1, 92, 57, 78, 93, 94, 4, 57, 0, 0, 0, + 0, 0, 0, 19, 49, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 95, + 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 96, 97, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 98, 0, 0, 0, 0, 19, 0, 1, 1, 49, + 0, 0, 0, 0, 0, 0, 0, 38, 0, 0, 0, 0, 49, 0, 0, 0, + 0, 59, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 49, 0, 0, 0, + 0, 0, 51, 64, 0, 0, 0, 0, 0, 0, 0, 0, 95, 0, 0, 0, + 0, 0, 0, 0, 74, 0, 0, 0, 77, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 99, 100, 57, 38, 78, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 59, 0, 0, 0, 0, 0, 0, 0, 0, 0, 101, + 1, 14, 4, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 76, + 81, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 38, 85, 0, + 0, 0, 0, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 103, 95, + 0, 104, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 105, 0, + 85, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 95, 77, 0, 0, + 77, 0, 84, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 105, 0, 0, + 0, 0, 106, 0, 0, 0, 0, 0, 0, 38, 1, 57, 1, 57, 0, 0, + 107, 0, 0, 0, 0, 0, 0, 0, 54, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 107, 0, 0, 0, 0, 95, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 8, 87, 0, 0, 0, 0, 0, 0, 1, 85, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 108, 0, 109, 110, 111, 112, 0, 51, 4, + 113, 48, 23, 0, 0, 0, 0, 0, 0, 0, 38, 49, 0, 0, 0, 0, + 38, 57, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 113, 0, 0, +}; + +static RE_UINT8 re_canonical_combining_class_stage_5[] = { + 0, 0, 0, 0, 50, 50, 50, 50, 50, 51, 45, 45, 45, 45, 51, 43, + 45, 45, 45, 45, 45, 41, 41, 45, 45, 45, 45, 41, 41, 45, 45, 45, + 1, 1, 1, 1, 1, 
45, 45, 45, 45, 50, 50, 50, 50, 54, 50, 45, + 45, 45, 50, 50, 50, 45, 45, 0, 50, 50, 50, 45, 45, 45, 45, 50, + 51, 45, 45, 50, 52, 53, 53, 52, 53, 53, 52, 50, 0, 0, 0, 50, + 0, 45, 50, 50, 50, 50, 45, 50, 50, 50, 46, 45, 50, 50, 45, 45, + 50, 46, 49, 50, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14, 15, + 16, 17, 0, 18, 0, 19, 20, 0, 50, 45, 0, 13, 25, 26, 27, 0, + 0, 0, 0, 22, 23, 24, 25, 26, 27, 28, 29, 50, 50, 45, 45, 50, + 45, 50, 50, 45, 30, 0, 0, 0, 0, 0, 50, 50, 50, 0, 0, 50, + 50, 0, 45, 50, 50, 45, 0, 0, 0, 31, 0, 0, 50, 45, 50, 50, + 45, 45, 50, 45, 45, 50, 45, 50, 45, 50, 50, 0, 50, 50, 0, 50, + 0, 50, 50, 50, 50, 50, 0, 0, 0, 45, 45, 45, 0, 0, 0, 45, + 50, 45, 45, 45, 22, 23, 24, 50, 2, 0, 0, 0, 0, 4, 0, 0, + 0, 50, 45, 50, 50, 0, 0, 0, 0, 32, 33, 0, 0, 0, 4, 0, + 34, 34, 4, 0, 35, 35, 35, 35, 36, 36, 0, 0, 37, 37, 37, 37, + 45, 45, 0, 0, 0, 45, 0, 45, 0, 43, 0, 0, 0, 38, 39, 0, + 40, 0, 0, 0, 0, 0, 39, 39, 39, 39, 0, 0, 39, 0, 50, 50, + 4, 0, 50, 50, 0, 0, 45, 0, 0, 0, 0, 2, 0, 4, 4, 0, + 0, 45, 0, 0, 4, 0, 0, 0, 0, 50, 0, 0, 0, 49, 0, 0, + 0, 46, 50, 45, 45, 0, 0, 0, 50, 0, 0, 45, 0, 0, 4, 4, + 0, 0, 2, 0, 50, 50, 50, 0, 50, 0, 1, 1, 1, 0, 0, 0, + 50, 53, 42, 45, 41, 50, 50, 50, 52, 45, 50, 45, 50, 50, 1, 1, + 1, 1, 1, 50, 0, 1, 1, 50, 45, 50, 1, 1, 0, 0, 0, 4, + 0, 0, 44, 49, 51, 46, 47, 47, 0, 3, 3, 0, 50, 0, 50, 50, + 45, 0, 0, 50, 0, 0, 21, 0, 0, 45, 0, 50, 50, 1, 45, 0, + 0, 50, 45, 0, 0, 4, 2, 0, 0, 2, 4, 0, 0, 0, 4, 2, + 0, 0, 1, 0, 0, 43, 43, 1, 1, 1, 0, 0, 0, 48, 43, 43, + 43, 43, 43, 0, 45, 45, 45, 0, +}; + +/* Canonical_Combining_Class: 2112 bytes. 
*/ + +RE_UINT32 re_get_canonical_combining_class(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 13; + code = ch ^ (f << 13); + pos = (RE_UINT32)re_canonical_combining_class_stage_1[f] << 4; + f = code >> 9; + code ^= f << 9; + pos = (RE_UINT32)re_canonical_combining_class_stage_2[pos + f] << 4; + f = code >> 5; + code ^= f << 5; + pos = (RE_UINT32)re_canonical_combining_class_stage_3[pos + f] << 3; + f = code >> 2; + code ^= f << 2; + pos = (RE_UINT32)re_canonical_combining_class_stage_4[pos + f] << 2; + value = re_canonical_combining_class_stage_5[pos + code]; + + return value; +} + +/* Decomposition_Type. */ + +static RE_UINT8 re_decomposition_type_stage_1[] = { + 0, 1, 2, 2, 2, 3, 4, 5, 6, 2, 2, 2, 2, 2, 7, 8, + 2, 2, 2, 2, 2, 2, 2, 9, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, +}; + +static RE_UINT8 re_decomposition_type_stage_2[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 16, 7, 17, 18, 19, + 20, 21, 22, 23, 24, 7, 7, 7, 7, 7, 25, 7, 26, 27, 28, 29, + 30, 31, 32, 33, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 34, 35, 7, 7, 7, 36, 37, 37, 37, 37, + 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, + 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, + 37, 37, 37, 37, 37, 37, 37, 38, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 37, 39, 40, 41, 42, 43, 44, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 45, 46, 7, 47, 48, 49, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 50, 7, 7, 51, 52, 53, 54, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 55, 7, + 7, 56, 57, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 37, 37, 58, 7, 7, 7, 7, 7, +}; + +static RE_UINT8 re_decomposition_type_stage_3[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 3, 5, + 6, 7, 8, 9, 10, 11, 8, 12, 0, 0, 13, 14, 15, 16, 17, 18, + 6, 19, 20, 21, 0, 0, 0, 0, 0, 0, 0, 22, 0, 23, 24, 0, + 0, 0, 0, 0, 25, 0, 0, 26, 27, 14, 28, 14, 29, 30, 0, 31, + 32, 33, 0, 33, 0, 32, 0, 34, 0, 0, 0, 0, 35, 36, 37, 38, + 0, 0, 0, 0, 0, 0, 0, 0, 39, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 40, 0, 0, 0, 0, 41, 0, 0, 0, 0, 42, 43, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 33, 44, 0, 45, 0, 0, 0, 0, 0, 0, 46, 47, 0, 0, + 0, 0, 0, 48, 0, 49, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 50, 51, 0, 0, 0, 52, 0, 0, 53, 0, 0, 0, + 0, 0, 0, 0, 54, 0, 0, 0, 0, 0, 0, 0, 55, 0, 0, 0, + 0, 0, 0, 0, 53, 0, 0, 0, 0, 0, 0, 0, 0, 56, 0, 0, + 0, 0, 0, 57, 0, 0, 0, 0, 0, 0, 0, 57, 0, 58, 0, 0, + 59, 0, 0, 0, 60, 61, 33, 62, 63, 60, 61, 33, 0, 0, 0, 0, + 0, 0, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 65, + 66, 67, 0, 68, 69, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 70, 71, 72, 73, 74, 75, 0, 76, 73, 73, 0, 0, 0, 0, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 77, 6, 6, 6, 6, 6, 78, + 6, 79, 6, 6, 79, 80, 6, 81, 6, 6, 6, 82, 83, 84, 6, 85, + 86, 87, 88, 89, 90, 91, 0, 92, 93, 94, 95, 0, 0, 0, 0, 0, + 96, 97, 98, 99, 100, 101, 102, 102, 103, 104, 105, 0, 106, 0, 0, 0, + 107, 0, 108, 109, 110, 0, 111, 112, 112, 0, 113, 0, 0, 0, 114, 0, + 0, 0, 115, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 116, 117, 102, 102, 102, 118, 116, 116, 119, 0, + 120, 0, 0, 0, 0, 0, 0, 121, 0, 0, 0, 0, 0, 122, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 123, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 124, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 125, 0, 0, 0, 0, 0, 57, + 102, 102, 
102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 126, 0, 0, + 127, 0, 0, 128, 129, 130, 131, 132, 0, 133, 129, 130, 131, 132, 0, 134, + 0, 0, 0, 135, 102, 102, 102, 102, 136, 137, 0, 0, 0, 0, 0, 0, + 102, 136, 102, 102, 138, 139, 116, 140, 116, 116, 116, 116, 141, 116, 116, 140, + 142, 142, 142, 142, 142, 143, 102, 144, 142, 142, 142, 142, 142, 142, 102, 145, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 146, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 147, 0, 0, 0, 0, 0, 0, 0, 148, + 0, 0, 0, 0, 0, 149, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 21, 0, 0, 0, 0, 0, + 81, 150, 151, 6, 6, 6, 81, 6, 6, 6, 6, 6, 6, 78, 0, 0, + 152, 153, 154, 155, 156, 157, 158, 158, 159, 158, 160, 161, 0, 162, 163, 164, + 165, 165, 165, 165, 165, 165, 166, 167, 167, 168, 169, 169, 169, 170, 171, 172, + 165, 173, 174, 175, 0, 176, 177, 178, 179, 180, 167, 181, 182, 0, 0, 183, + 0, 184, 0, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 194, 195, 196, + 197, 198, 198, 198, 198, 198, 199, 200, 200, 200, 200, 201, 202, 203, 204, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 205, 206, 0, 0, 0, 0, 0, + 0, 0, 207, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 46, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 208, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 104, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 207, 209, 0, 0, 0, 0, 210, 14, 0, 0, 0, + 211, 211, 211, 211, 211, 212, 211, 211, 211, 213, 214, 215, 216, 211, 211, 211, + 217, 218, 211, 219, 220, 221, 211, 211, 211, 211, 211, 211, 211, 211, 211, 211, + 211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 222, 211, 211, 211, 211, 211, + 211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 223, 211, 211, 211, + 216, 211, 224, 225, 226, 227, 228, 229, 230, 231, 232, 231, 0, 0, 0, 0, + 233, 102, 234, 142, 142, 0, 235, 0, 0, 236, 0, 0, 0, 0, 0, 0, + 237, 142, 142, 238, 239, 240, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 6, 81, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +static RE_UINT8 
re_decomposition_type_stage_4[] = { + 0, 0, 0, 0, 1, 0, 2, 3, 4, 5, 6, 7, 8, 9, 8, 8, + 10, 11, 10, 12, 10, 11, 10, 9, 8, 8, 8, 8, 13, 8, 8, 8, + 8, 12, 8, 8, 14, 8, 10, 15, 16, 8, 17, 8, 12, 8, 8, 8, + 8, 8, 8, 15, 12, 0, 0, 18, 19, 0, 0, 0, 0, 20, 20, 21, + 8, 8, 8, 22, 8, 13, 8, 8, 23, 12, 8, 8, 8, 8, 8, 13, + 0, 13, 8, 8, 8, 0, 0, 0, 24, 24, 25, 0, 0, 0, 20, 5, + 24, 25, 0, 0, 9, 19, 0, 0, 0, 19, 26, 27, 0, 21, 11, 22, + 0, 0, 13, 8, 0, 0, 13, 11, 28, 29, 0, 0, 30, 5, 31, 0, + 9, 18, 0, 11, 0, 0, 32, 0, 0, 13, 0, 0, 33, 0, 0, 0, + 8, 13, 13, 8, 13, 8, 13, 8, 8, 12, 12, 0, 0, 3, 0, 0, + 13, 11, 0, 0, 0, 34, 35, 0, 36, 0, 0, 0, 18, 0, 0, 0, + 32, 19, 0, 0, 0, 0, 8, 8, 0, 0, 18, 19, 0, 0, 0, 9, + 18, 27, 0, 0, 0, 0, 10, 27, 0, 0, 37, 19, 0, 0, 0, 12, + 0, 19, 0, 0, 0, 0, 13, 19, 0, 0, 19, 0, 19, 18, 22, 0, + 0, 0, 27, 11, 3, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 1, + 18, 0, 0, 32, 27, 18, 0, 19, 18, 38, 17, 0, 32, 0, 0, 0, + 0, 27, 0, 0, 0, 0, 0, 25, 0, 27, 36, 36, 27, 0, 0, 0, + 0, 0, 18, 32, 9, 0, 0, 0, 0, 0, 0, 39, 24, 24, 39, 24, + 24, 24, 24, 40, 24, 24, 24, 24, 41, 42, 43, 0, 0, 0, 25, 0, + 0, 0, 44, 24, 8, 8, 45, 0, 8, 8, 12, 0, 8, 12, 8, 12, + 8, 8, 46, 46, 8, 8, 8, 12, 8, 22, 8, 47, 21, 22, 8, 8, + 8, 13, 8, 10, 13, 22, 8, 48, 49, 50, 30, 0, 51, 3, 0, 0, + 0, 30, 0, 52, 3, 53, 0, 54, 0, 3, 5, 0, 0, 3, 0, 3, + 55, 24, 24, 24, 42, 42, 42, 43, 42, 42, 42, 56, 0, 0, 35, 0, + 57, 34, 58, 59, 59, 60, 61, 62, 63, 64, 65, 66, 66, 67, 68, 59, + 69, 61, 62, 0, 70, 70, 70, 70, 20, 20, 20, 20, 0, 0, 71, 0, + 0, 0, 13, 0, 0, 0, 0, 27, 0, 0, 0, 10, 0, 19, 32, 19, + 0, 36, 0, 72, 35, 0, 0, 0, 32, 37, 32, 0, 36, 0, 0, 10, + 12, 12, 12, 0, 0, 0, 0, 8, 8, 0, 13, 12, 0, 0, 33, 0, + 73, 73, 73, 73, 73, 20, 20, 20, 20, 74, 73, 73, 73, 73, 75, 0, + 0, 0, 0, 35, 0, 30, 0, 0, 0, 0, 0, 19, 0, 0, 0, 76, + 0, 0, 0, 44, 0, 0, 0, 3, 20, 5, 0, 0, 77, 0, 0, 0, + 0, 26, 30, 0, 0, 0, 0, 36, 36, 36, 36, 36, 36, 46, 32, 0, + 9, 22, 33, 12, 0, 19, 3, 78, 0, 37, 11, 79, 34, 20, 20, 20, + 20, 20, 20, 
30, 4, 24, 24, 24, 20, 73, 0, 0, 80, 73, 73, 73, + 73, 73, 73, 75, 20, 20, 20, 81, 81, 81, 81, 81, 81, 81, 20, 20, + 82, 81, 81, 81, 20, 20, 20, 83, 0, 0, 0, 55, 25, 0, 0, 0, + 0, 0, 55, 0, 0, 0, 0, 24, 36, 10, 8, 11, 36, 33, 13, 8, + 20, 30, 0, 0, 3, 20, 0, 46, 59, 59, 84, 8, 8, 11, 8, 36, + 9, 22, 8, 15, 85, 86, 86, 86, 86, 86, 86, 86, 86, 85, 85, 85, + 87, 85, 86, 86, 88, 0, 0, 0, 89, 90, 91, 92, 85, 87, 86, 85, + 85, 85, 93, 87, 94, 94, 94, 94, 94, 95, 95, 95, 95, 95, 95, 95, + 95, 96, 97, 97, 97, 97, 97, 97, 97, 97, 97, 98, 99, 99, 99, 99, + 99, 100, 94, 94, 101, 95, 95, 95, 95, 95, 95, 102, 97, 99, 99, 103, + 104, 97, 105, 106, 107, 105, 108, 105, 104, 96, 95, 105, 96, 109, 110, 97, + 111, 106, 112, 105, 95, 106, 113, 95, 96, 106, 0, 0, 94, 94, 94, 114, + 115, 115, 116, 0, 115, 115, 115, 115, 115, 117, 118, 20, 119, 120, 120, 120, + 120, 119, 120, 0, 121, 122, 123, 123, 124, 91, 125, 126, 90, 125, 127, 127, + 127, 127, 126, 91, 125, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 126, + 125, 126, 91, 128, 129, 130, 130, 130, 130, 130, 130, 130, 131, 132, 132, 132, + 132, 132, 132, 132, 132, 132, 132, 133, 134, 132, 134, 132, 134, 132, 134, 135, + 130, 136, 132, 133, 0, 0, 27, 19, 0, 0, 18, 0, 0, 0, 0, 13, + 0, 0, 18, 36, 8, 19, 0, 0, 0, 0, 18, 8, 59, 59, 59, 59, + 59, 137, 59, 59, 59, 59, 59, 137, 138, 139, 61, 137, 59, 59, 66, 61, + 59, 61, 59, 59, 59, 66, 140, 61, 59, 137, 59, 137, 59, 59, 66, 140, + 59, 141, 142, 59, 137, 59, 59, 59, 59, 62, 59, 59, 59, 59, 59, 142, + 139, 143, 61, 59, 140, 59, 144, 0, 138, 145, 144, 61, 139, 143, 144, 144, + 139, 143, 140, 59, 140, 59, 61, 141, 59, 59, 66, 59, 59, 59, 59, 0, + 61, 61, 66, 59, 20, 20, 30, 0, 20, 20, 146, 75, 0, 0, 4, 0, + 147, 0, 0, 0, 148, 0, 0, 0, 81, 81, 148, 0, 20, 20, 35, 0, + 149, 0, 0, 0, +}; + +static RE_UINT8 re_decomposition_type_stage_5[] = { + 0, 0, 0, 0, 4, 0, 0, 0, 2, 0, 10, 0, 0, 0, 0, 2, + 0, 0, 10, 10, 2, 2, 0, 0, 2, 10, 10, 0, 17, 17, 17, 0, + 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 
1, 0, + 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 2, 2, 1, 1, 1, 2, + 2, 0, 0, 1, 1, 2, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, + 2, 2, 2, 2, 2, 1, 1, 1, 1, 0, 1, 1, 1, 2, 2, 2, + 10, 10, 10, 10, 10, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 0, + 2, 2, 2, 1, 1, 2, 2, 0, 2, 2, 2, 0, 0, 2, 0, 0, + 0, 1, 0, 0, 0, 1, 1, 0, 0, 2, 2, 2, 2, 0, 0, 0, + 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 2, 10, 10, 10, 0, + 10, 10, 0, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 0, + 0, 0, 0, 10, 1, 1, 2, 1, 0, 1, 0, 1, 1, 2, 1, 2, + 1, 1, 2, 0, 1, 1, 2, 2, 2, 2, 2, 4, 0, 4, 0, 0, + 0, 0, 0, 4, 2, 0, 2, 2, 2, 0, 2, 0, 10, 10, 0, 0, + 11, 0, 0, 0, 2, 2, 3, 2, 0, 2, 3, 3, 3, 3, 3, 3, + 0, 3, 2, 0, 0, 3, 3, 3, 3, 3, 0, 0, 10, 2, 10, 0, + 3, 0, 1, 0, 3, 0, 1, 1, 3, 3, 0, 3, 3, 2, 2, 2, + 2, 3, 0, 2, 3, 0, 0, 0, 17, 17, 17, 17, 0, 17, 0, 0, + 2, 2, 0, 2, 9, 9, 9, 9, 2, 2, 9, 9, 9, 9, 9, 0, + 11, 10, 0, 0, 13, 0, 0, 0, 2, 0, 1, 12, 0, 0, 1, 12, + 16, 9, 9, 9, 16, 16, 16, 16, 2, 16, 16, 16, 2, 2, 2, 16, + 3, 3, 1, 1, 8, 7, 8, 7, 5, 6, 8, 7, 8, 7, 5, 6, + 8, 7, 0, 0, 0, 0, 0, 8, 7, 5, 6, 8, 7, 8, 7, 8, + 7, 8, 8, 7, 5, 8, 7, 5, 8, 8, 8, 8, 7, 7, 7, 7, + 7, 7, 7, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, + 6, 8, 8, 8, 8, 7, 7, 7, 7, 5, 5, 5, 7, 8, 0, 0, + 5, 7, 5, 5, 7, 5, 7, 7, 5, 5, 7, 7, 5, 5, 7, 5, + 5, 7, 7, 5, 7, 7, 5, 7, 5, 5, 5, 7, 0, 0, 5, 5, + 5, 7, 7, 7, 5, 7, 5, 7, 8, 0, 0, 0, 12, 12, 12, 12, + 12, 12, 0, 0, 12, 0, 0, 12, 12, 2, 2, 2, 15, 15, 15, 0, + 15, 15, 15, 15, 8, 6, 8, 0, 8, 0, 8, 6, 8, 6, 8, 6, + 8, 8, 7, 8, 7, 8, 7, 5, 6, 8, 7, 8, 6, 8, 7, 5, + 7, 0, 0, 0, 0, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 0, 0, 0, 14, 14, 14, 0, 0, 0, + 13, 13, 13, 0, 3, 0, 3, 3, 0, 0, 3, 0, 0, 3, 3, 0, + 3, 3, 3, 0, 3, 0, 3, 0, 0, 0, 3, 3, 3, 0, 0, 3, + 0, 3, 0, 3, 0, 0, 0, 3, 2, 2, 2, 9, 16, 0, 0, 0, + 16, 16, 16, 0, 9, 9, 0, 0, +}; + +/* Decomposition_Type: 2964 bytes. 
*/ + +RE_UINT32 re_get_decomposition_type(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 13; + code = ch ^ (f << 13); + pos = (RE_UINT32)re_decomposition_type_stage_1[f] << 5; + f = code >> 8; + code ^= f << 8; + pos = (RE_UINT32)re_decomposition_type_stage_2[pos + f] << 4; + f = code >> 4; + code ^= f << 4; + pos = (RE_UINT32)re_decomposition_type_stage_3[pos + f] << 2; + f = code >> 2; + code ^= f << 2; + pos = (RE_UINT32)re_decomposition_type_stage_4[pos + f] << 2; + value = re_decomposition_type_stage_5[pos + code]; + + return value; +} + +/* East_Asian_Width. */ + +static RE_UINT8 re_east_asian_width_stage_1[] = { + 0, 1, 2, 3, 4, 5, 5, 5, 5, 5, 6, 5, 5, 7, 8, 9, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 10, 10, 10, 12, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 13, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 13, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 14, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 15, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 15, +}; + +static RE_UINT8 re_east_asian_width_stage_2[] = { + 0, 1, 2, 3, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 7, 8, 9, 10, 11, 12, 13, 14, 5, 15, 5, 16, 5, 5, 17, 18, + 19, 20, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 
22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 24, 5, 5, 5, 5, 25, 5, 5, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 26, 5, 5, 5, 5, 5, 5, 5, 5, + 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, 27, 22, 22, 5, 5, 5, 28, 29, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 30, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 31, 32, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 33, + 5, 34, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 35, +}; + +static RE_UINT8 re_east_asian_width_stage_3[] = { + 0, 0, 1, 1, 1, 1, 1, 2, 0, 0, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 11, 0, 0, 0, 0, 0, 15, 16, 0, 0, + 0, 0, 0, 0, 0, 9, 9, 0, 0, 0, 0, 0, 17, 18, 0, 0, + 19, 19, 19, 19, 19, 19, 19, 0, 0, 20, 21, 20, 21, 0, 0, 0, + 9, 19, 19, 19, 19, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 22, 22, 22, 22, 22, 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 23, 24, 25, 0, 0, 0, 26, 27, 0, 28, 0, 0, 0, 0, 0, + 29, 30, 31, 0, 0, 32, 33, 34, 35, 34, 0, 36, 0, 37, 38, 0, + 39, 40, 41, 42, 43, 44, 45, 0, 46, 47, 48, 49, 0, 0, 0, 0, + 0, 44, 50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 19, 19, 19, 19, 19, 19, 19, 19, 51, 19, + 19, 19, 19, 19, 33, 19, 19, 52, 19, 53, 21, 54, 55, 56, 57, 0, + 58, 59, 0, 0, 60, 0, 61, 0, 0, 62, 0, 62, 63, 19, 64, 19, + 0, 0, 0, 65, 0, 38, 0, 66, 0, 0, 0, 0, 0, 0, 67, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 68, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 69, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 22, 70, 22, 22, 22, 22, 22, 71, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 72, 0, 73, + 74, 22, 22, 75, 76, 22, 22, 22, 22, 77, 22, 22, 22, 22, 22, 22, + 78, 22, 79, 76, 22, 22, 22, 22, 75, 22, 22, 80, 22, 22, 71, 22, + 22, 75, 
22, 22, 81, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 75, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 0, 0, 0, 0, + 22, 22, 22, 22, 22, 22, 22, 22, 82, 22, 22, 22, 83, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 22, 82, 0, 0, 0, 0, 0, 0, 0, 0, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 71, 0, 0, 0, 0, 0, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 84, 0, 22, 22, 85, 86, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 87, 88, 88, 88, 88, 88, 89, 90, 90, 90, 90, 91, 92, 93, 94, 65, + 95, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 96, 19, 97, 19, 19, 19, 34, 19, 19, 96, 0, 0, 0, 0, 0, 0, + 98, 22, 22, 80, 99, 95, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 79, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 0, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 97, +}; + +static RE_UINT8 re_east_asian_width_stage_4[] = { + 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4, 5, 6, + 7, 8, 9, 7, 0, 10, 0, 0, 11, 12, 11, 13, 14, 10, 9, 14, + 8, 12, 9, 5, 15, 0, 0, 0, 16, 0, 12, 0, 0, 13, 12, 0, + 17, 0, 11, 12, 9, 11, 7, 15, 13, 0, 0, 0, 0, 0, 0, 10, + 5, 5, 5, 11, 0, 18, 17, 15, 11, 0, 7, 16, 7, 7, 7, 7, + 17, 7, 7, 7, 19, 7, 14, 0, 20, 20, 20, 20, 18, 9, 14, 14, + 9, 7, 0, 0, 8, 15, 12, 10, 0, 11, 0, 12, 17, 11, 0, 0, + 0, 0, 21, 11, 12, 15, 15, 0, 12, 10, 0, 0, 22, 10, 12, 0, + 12, 11, 12, 9, 7, 7, 7, 0, 7, 7, 14, 0, 0, 0, 15, 0, + 0, 0, 14, 0, 10, 11, 0, 0, 0, 12, 0, 0, 8, 12, 18, 12, + 15, 15, 10, 17, 18, 16, 7, 5, 0, 7, 0, 14, 0, 0, 11, 11, + 10, 0, 0, 0, 14, 7, 13, 13, 13, 13, 0, 0, 0, 15, 15, 0, + 0, 15, 0, 0, 0, 0, 0, 12, 0, 0, 23, 0, 7, 7, 19, 7, + 7, 0, 0, 0, 13, 14, 0, 0, 13, 13, 0, 14, 14, 13, 18, 13, + 14, 0, 0, 0, 13, 14, 0, 12, 0, 22, 15, 13, 0, 14, 0, 5, + 5, 0, 0, 0, 19, 19, 9, 19, 0, 0, 0, 13, 0, 7, 7, 19, + 19, 0, 7, 7, 0, 0, 0, 15, 0, 13, 7, 7, 0, 24, 1, 25, + 0, 26, 0, 0, 0, 17, 14, 0, 20, 20, 27, 20, 20, 0, 0, 0, + 20, 
28, 0, 0, 20, 20, 20, 0, 29, 20, 20, 20, 20, 20, 20, 30, + 31, 20, 20, 20, 20, 30, 31, 20, 0, 31, 20, 20, 20, 20, 20, 28, + 20, 20, 30, 0, 20, 20, 7, 7, 20, 20, 20, 32, 20, 30, 0, 0, + 20, 20, 28, 0, 30, 20, 20, 20, 20, 30, 20, 0, 33, 34, 34, 34, + 34, 34, 34, 34, 35, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 37, + 38, 36, 38, 36, 38, 36, 38, 39, 34, 40, 36, 37, 28, 0, 0, 0, + 7, 7, 9, 0, 7, 7, 7, 14, 30, 0, 0, 0, 20, 20, 32, 0, +}; + +static RE_UINT8 re_east_asian_width_stage_5[] = { + 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 0, 0, 1, 5, 5, + 1, 5, 5, 1, 1, 0, 1, 0, 5, 1, 1, 5, 1, 1, 1, 1, + 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, + 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, + 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, + 3, 3, 3, 3, 0, 2, 0, 0, 0, 1, 1, 0, 0, 3, 3, 0, + 0, 0, 5, 5, 5, 5, 0, 0, 0, 5, 5, 0, 3, 3, 0, 3, + 3, 3, 0, 0, 4, 3, 3, 3, 3, 3, 3, 0, 0, 3, 3, 3, + 3, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 2, 2, 2, 0, 0, 0, + 4, 4, 4, 0, +}; + +/* East_Asian_Width: 1668 bytes. */ + +RE_UINT32 re_get_east_asian_width(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 12; + code = ch ^ (f << 12); + pos = (RE_UINT32)re_east_asian_width_stage_1[f] << 4; + f = code >> 8; + code ^= f << 8; + pos = (RE_UINT32)re_east_asian_width_stage_2[pos + f] << 4; + f = code >> 4; + code ^= f << 4; + pos = (RE_UINT32)re_east_asian_width_stage_3[pos + f] << 2; + f = code >> 2; + code ^= f << 2; + pos = (RE_UINT32)re_east_asian_width_stage_4[pos + f] << 2; + value = re_east_asian_width_stage_5[pos + code]; + + return value; +} + +/* Joining_Group. 
*/ + +static RE_UINT8 re_joining_group_stage_1[] = { + 0, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, +}; + +static RE_UINT8 re_joining_group_stage_2[] = { + 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +}; + +static RE_UINT8 re_joining_group_stage_3[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 0, + 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +static RE_UINT8 re_joining_group_stage_4[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 0, 0, 0, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 0, 0, 21, 0, 22, + 0, 0, 23, 24, 25, 26, 0, 0, 0, 27, 28, 29, 30, 31, 32, 33, + 0, 0, 0, 0, 34, 35, 36, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 37, 38, 39, 40, 41, 42, 0, 0, +}; + +static RE_UINT8 re_joining_group_stage_5[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 45, 0, 3, 3, 43, 3, 45, 3, + 4, 41, 4, 4, 13, 13, 13, 6, 6, 31, 31, 35, 35, 33, 33, 39, + 39, 1, 1, 11, 11, 55, 55, 55, 0, 9, 29, 19, 22, 24, 26, 16, + 43, 45, 45, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 29, + 0, 3, 3, 3, 0, 3, 43, 43, 45, 4, 4, 4, 4, 4, 4, 4, + 4, 13, 13, 13, 13, 13, 13, 13, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 31, 31, 31, 31, 31, 31, 31, 31, 31, 35, 35, 35, 33, 33, 39, + 1, 9, 9, 9, 9, 9, 9, 29, 29, 11, 38, 11, 19, 19, 19, 11, + 11, 11, 11, 11, 11, 22, 22, 22, 22, 26, 26, 26, 26, 56, 21, 13, + 41, 17, 17, 14, 43, 43, 43, 43, 43, 43, 43, 43, 55, 47, 55, 43, + 45, 45, 46, 46, 0, 41, 0, 0, 0, 0, 0, 0, 0, 0, 6, 31, + 0, 0, 35, 33, 1, 0, 0, 21, 2, 0, 5, 12, 12, 7, 7, 15, + 44, 50, 18, 42, 42, 48, 49, 20, 23, 25, 27, 36, 10, 8, 28, 32, + 34, 30, 7, 37, 40, 5, 12, 7, 0, 0, 0, 0, 0, 51, 52, 53, + 4, 4, 4, 4, 4, 4, 4, 13, 13, 6, 6, 31, 35, 1, 1, 1, + 9, 9, 11, 11, 11, 24, 24, 26, 26, 26, 22, 31, 31, 
35, 13, 13, + 35, 31, 13, 3, 3, 55, 55, 45, 43, 43, 54, 54, 13, 35, 35, 19, + 4, 4, 13, 39, 9, 29, 22, 24, 45, 45, 31, 43, 57, 0, 6, 33, + 11, 58, 31, 1, 19, 0, 0, 0, 59, 61, 61, 65, 65, 62, 0, 83, + 0, 85, 85, 0, 0, 66, 80, 84, 68, 68, 68, 69, 63, 81, 70, 71, + 77, 60, 60, 73, 73, 76, 74, 74, 74, 75, 0, 0, 78, 0, 0, 0, + 0, 0, 0, 72, 64, 79, 82, 67, +}; + +/* Joining_Group: 586 bytes. */ + +RE_UINT32 re_get_joining_group(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 15; + code = ch ^ (f << 15); + pos = (RE_UINT32)re_joining_group_stage_1[f] << 4; + f = code >> 11; + code ^= f << 11; + pos = (RE_UINT32)re_joining_group_stage_2[pos + f] << 4; + f = code >> 7; + code ^= f << 7; + pos = (RE_UINT32)re_joining_group_stage_3[pos + f] << 4; + f = code >> 3; + code ^= f << 3; + pos = (RE_UINT32)re_joining_group_stage_4[pos + f] << 3; + value = re_joining_group_stage_5[pos + code]; + + return value; +} + +/* Joining_Type. */ + +static RE_UINT8 re_joining_type_stage_1[] = { + 0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 6, + 7, 8, 4, 4, 4, 4, 9, 4, 4, 4, 4, 10, 4, 11, 12, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 13, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, +}; + +static RE_UINT8 re_joining_type_stage_2[] = { + 0, 1, 0, 0, 0, 0, 2, 0, 0, 3, 0, 4, 5, 6, 7, 8, + 9, 10, 
11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 0, 0, 0, 0, 27, 0, 0, 0, 0, 0, 0, 0, 28, 29, + 30, 31, 32, 0, 33, 34, 35, 36, 37, 38, 0, 39, 0, 0, 0, 0, + 40, 41, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 42, 43, 44, 0, 0, 0, 0, + 45, 46, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 47, 48, 0, 0, + 49, 50, 51, 52, 53, 54, 0, 55, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 56, 0, 0, 0, 0, 0, 57, 43, 0, 58, + 0, 0, 0, 59, 0, 60, 61, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 62, 63, 0, 64, 0, 0, 0, 0, 0, 0, 0, 0, + 65, 66, 67, 68, 69, 70, 71, 0, 0, 72, 0, 73, 74, 75, 76, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 77, 78, 0, 0, 0, 0, 0, 0, 0, 0, 79, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 80, 0, 0, 0, 0, 0, 0, + 0, 0, 81, 82, 83, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 84, 85, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 86, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 87, 0, 88, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +static RE_UINT8 re_joining_type_stage_3[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, + 2, 2, 2, 2, 2, 2, 2, 0, 3, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 2, 5, 6, 0, 0, 0, 0, 7, 8, 9, 10, 2, 11, 12, + 13, 14, 15, 15, 16, 17, 18, 19, 20, 21, 22, 2, 23, 24, 25, 26, + 0, 0, 27, 28, 29, 15, 30, 31, 0, 32, 33, 0, 34, 35, 0, 0, + 0, 0, 36, 37, 0, 0, 38, 2, 39, 0, 0, 40, 41, 42, 43, 0, + 44, 0, 0, 45, 46, 0, 43, 0, 47, 0, 0, 45, 48, 44, 0, 49, + 47, 0, 0, 45, 50, 0, 43, 0, 44, 0, 0, 51, 46, 52, 43, 0, + 53, 0, 0, 0, 54, 0, 0, 0, 28, 0, 0, 55, 56, 57, 43, 0, + 44, 0, 0, 
51, 58, 0, 43, 0, 44, 0, 0, 0, 46, 0, 43, 0, + 0, 0, 0, 0, 59, 60, 0, 0, 0, 0, 0, 61, 62, 0, 0, 0, + 0, 0, 0, 63, 64, 0, 0, 0, 0, 65, 0, 66, 0, 0, 0, 67, + 68, 69, 2, 70, 52, 0, 0, 0, 0, 0, 71, 72, 0, 73, 28, 74, + 75, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 71, 0, 0, + 0, 76, 0, 76, 0, 43, 0, 43, 0, 0, 0, 77, 78, 79, 0, 0, + 80, 0, 15, 15, 15, 15, 15, 81, 82, 15, 83, 0, 0, 0, 0, 0, + 0, 0, 84, 85, 0, 0, 0, 0, 0, 86, 0, 0, 0, 87, 88, 89, + 0, 0, 0, 90, 0, 0, 0, 0, 91, 0, 0, 92, 53, 0, 93, 91, + 94, 0, 95, 0, 0, 0, 96, 94, 0, 0, 97, 98, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 99, 100, 101, 0, 0, 0, 0, 2, 2, 2, 102, + 103, 0, 104, 0, 0, 0, 105, 0, 0, 0, 0, 0, 0, 2, 2, 28, + 0, 0, 0, 0, 0, 0, 20, 94, 0, 0, 0, 0, 0, 0, 0, 20, + 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 106, 0, 0, 0, 0, 0, + 0, 107, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 20, 108, + 0, 55, 0, 0, 0, 0, 0, 94, 109, 0, 57, 0, 15, 15, 15, 110, + 0, 0, 0, 0, 111, 0, 2, 94, 0, 0, 112, 0, 113, 94, 0, 0, + 39, 0, 0, 114, 0, 0, 115, 0, 0, 0, 116, 117, 118, 0, 0, 45, + 0, 0, 0, 119, 44, 0, 120, 52, 0, 0, 0, 0, 0, 0, 121, 0, + 0, 122, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 123, 0, 0, 0, 0, 0, 0, 0, 1, + 0, 0, 0, 0, 0, 0, 28, 0, 0, 0, 0, 0, 0, 0, 0, 124, + 125, 0, 0, 126, 0, 0, 0, 0, 0, 0, 0, 0, 127, 128, 129, 0, + 130, 131, 132, 0, 0, 0, 0, 0, 44, 0, 0, 133, 134, 0, 0, 20, + 94, 0, 0, 135, 0, 0, 0, 0, 39, 0, 136, 137, 0, 0, 0, 138, + 94, 0, 0, 139, 140, 0, 0, 0, 0, 0, 20, 141, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 20, 142, 0, 94, 0, 0, 45, 28, 0, 143, 137, + 0, 0, 0, 144, 145, 0, 0, 0, 0, 0, 0, 146, 28, 120, 0, 0, + 0, 0, 0, 147, 28, 0, 0, 0, 0, 0, 148, 149, 0, 0, 0, 0, + 0, 71, 150, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 137, + 0, 0, 0, 134, 0, 0, 0, 0, 20, 39, 0, 0, 0, 0, 0, 0, + 0, 151, 91, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 152, 38, + 153, 0, 106, 0, 0, 0, 0, 0, 0, 0, 0, 0, 76, 0, 0, 0, + 2, 2, 2, 154, 2, 2, 70, 115, 111, 93, 4, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 134, 0, 0, 44, 0, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, +}; + 
+static RE_UINT8 re_joining_type_stage_4[] = { + 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 3, 2, 4, 0, + 5, 2, 2, 2, 2, 2, 2, 6, 7, 6, 0, 0, 2, 2, 8, 9, + 10, 11, 12, 13, 14, 15, 15, 15, 16, 15, 17, 2, 0, 0, 0, 18, + 19, 20, 15, 15, 15, 15, 21, 21, 21, 21, 22, 15, 15, 15, 15, 15, + 23, 21, 21, 24, 25, 26, 2, 27, 2, 27, 28, 29, 0, 0, 18, 30, + 0, 0, 0, 3, 31, 32, 22, 33, 15, 15, 34, 23, 2, 2, 8, 35, + 15, 15, 32, 15, 15, 15, 13, 36, 24, 36, 22, 15, 0, 37, 2, 2, + 9, 0, 0, 0, 0, 0, 18, 15, 15, 15, 38, 2, 2, 0, 39, 0, + 0, 37, 6, 2, 2, 5, 5, 4, 36, 25, 12, 15, 15, 40, 5, 0, + 15, 15, 25, 41, 42, 43, 0, 0, 3, 2, 2, 2, 8, 0, 0, 0, + 0, 0, 44, 9, 5, 2, 9, 1, 5, 2, 0, 0, 37, 0, 0, 0, + 1, 0, 0, 0, 0, 0, 0, 9, 5, 9, 0, 1, 7, 0, 0, 0, + 7, 3, 27, 4, 4, 1, 0, 0, 5, 6, 9, 1, 0, 0, 0, 27, + 0, 44, 0, 0, 44, 0, 0, 0, 9, 0, 0, 1, 0, 0, 0, 37, + 9, 37, 28, 4, 0, 7, 0, 0, 0, 44, 0, 4, 0, 0, 44, 0, + 37, 45, 0, 0, 1, 2, 8, 0, 0, 3, 2, 8, 1, 2, 6, 9, + 0, 0, 2, 4, 0, 0, 4, 0, 0, 46, 1, 0, 5, 2, 2, 8, + 2, 28, 0, 5, 2, 2, 5, 2, 2, 2, 2, 9, 0, 0, 0, 5, + 28, 2, 7, 7, 0, 0, 4, 37, 5, 9, 0, 0, 44, 7, 0, 1, + 37, 9, 0, 0, 0, 6, 2, 4, 0, 44, 5, 2, 2, 0, 0, 1, + 0, 47, 48, 4, 15, 15, 0, 0, 0, 47, 15, 15, 15, 15, 49, 0, + 8, 3, 9, 0, 44, 0, 5, 0, 0, 3, 27, 0, 0, 44, 2, 8, + 45, 5, 2, 9, 3, 2, 2, 27, 2, 2, 2, 8, 2, 0, 0, 0, + 0, 28, 8, 9, 0, 0, 3, 2, 4, 0, 0, 0, 37, 4, 6, 4, + 0, 44, 4, 46, 0, 0, 0, 2, 2, 37, 0, 0, 8, 2, 2, 2, + 28, 2, 9, 1, 0, 9, 4, 0, 2, 4, 0, 2, 0, 0, 3, 50, + 0, 0, 37, 8, 2, 9, 37, 2, 0, 0, 37, 4, 0, 0, 7, 0, + 8, 2, 2, 4, 44, 44, 3, 0, 51, 0, 0, 0, 0, 9, 0, 0, + 0, 37, 2, 4, 0, 3, 2, 2, 3, 37, 4, 9, 0, 1, 0, 0, + 0, 0, 5, 8, 7, 7, 0, 0, 3, 0, 0, 9, 28, 27, 9, 37, + 0, 0, 0, 4, 0, 1, 9, 1, 0, 0, 0, 44, 0, 0, 5, 0, + 0, 37, 8, 0, 5, 7, 0, 2, 0, 0, 8, 3, 15, 52, 53, 54, + 14, 55, 15, 12, 56, 57, 47, 13, 24, 22, 12, 58, 56, 0, 0, 0, + 0, 0, 20, 59, 0, 0, 2, 2, 2, 8, 0, 0, 3, 8, 7, 1, + 0, 3, 2, 5, 2, 9, 0, 0, 3, 0, 0, 0, 0, 37, 2, 8, + 0, 0, 37, 9, 4, 28, 0, 0, 3, 2, 8, 0, 0, 37, 
2, 9, + 3, 2, 45, 3, 28, 0, 0, 0, 37, 4, 0, 6, 3, 2, 8, 46, + 0, 0, 3, 1, 2, 6, 0, 0, 37, 6, 2, 0, 0, 0, 0, 7, + 0, 3, 4, 0, 8, 5, 2, 0, 2, 8, 3, 2, +}; + +static RE_UINT8 re_joining_type_stage_5[] = { + 0, 0, 0, 0, 0, 5, 0, 0, 5, 5, 5, 5, 0, 0, 0, 5, + 5, 5, 0, 0, 0, 5, 5, 5, 5, 5, 0, 5, 0, 5, 5, 0, + 5, 5, 5, 0, 5, 0, 0, 0, 2, 0, 3, 3, 3, 3, 2, 3, + 2, 3, 2, 2, 2, 2, 2, 3, 3, 3, 3, 2, 2, 2, 2, 2, + 1, 2, 2, 2, 3, 2, 2, 5, 0, 0, 2, 2, 5, 3, 3, 3, + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 2, 2, 3, + 2, 3, 2, 3, 2, 2, 3, 3, 0, 3, 5, 5, 5, 0, 0, 5, + 5, 0, 5, 5, 5, 5, 3, 3, 2, 0, 0, 2, 3, 5, 2, 2, + 2, 3, 3, 3, 2, 2, 3, 2, 3, 2, 3, 2, 0, 3, 2, 2, + 3, 2, 2, 2, 0, 0, 5, 5, 2, 2, 2, 5, 0, 0, 1, 0, + 3, 2, 0, 0, 3, 0, 3, 2, 2, 3, 3, 2, 2, 0, 0, 0, + 0, 0, 5, 0, 5, 0, 5, 0, 0, 5, 0, 5, 0, 0, 0, 2, + 0, 0, 1, 5, 2, 5, 2, 0, 0, 1, 5, 5, 2, 2, 4, 0, + 2, 3, 0, 3, 0, 3, 3, 0, 0, 4, 3, 3, 2, 2, 2, 4, + 2, 3, 0, 0, 3, 5, 5, 0, 3, 2, 3, 3, 3, 2, 2, 0, +}; + +/* Joining_Type: 2292 bytes. */ + +RE_UINT32 re_get_joining_type(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 12; + code = ch ^ (f << 12); + pos = (RE_UINT32)re_joining_type_stage_1[f] << 5; + f = code >> 7; + code ^= f << 7; + pos = (RE_UINT32)re_joining_type_stage_2[pos + f] << 3; + f = code >> 4; + code ^= f << 4; + pos = (RE_UINT32)re_joining_type_stage_3[pos + f] << 2; + f = code >> 2; + code ^= f << 2; + pos = (RE_UINT32)re_joining_type_stage_4[pos + f] << 2; + value = re_joining_type_stage_5[pos + code]; + + return value; +} + +/* Line_Break. 
*/ + +static RE_UINT8 re_line_break_stage_1[] = { + 0, 1, 2, 3, 4, 5, 5, 5, 5, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 10, 17, 10, 10, 10, 10, 18, 10, 19, 20, 21, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 22, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 22, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 23, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, +}; + +static RE_UINT8 re_line_break_stage_2[] = { + 0, 1, 2, 2, 2, 3, 4, 5, 2, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31, 32, 33, 34, 35, 36, 37, 2, 2, 2, 2, 38, 39, 40, + 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 2, 51, 2, 2, 52, 53, + 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, + 2, 2, 2, 70, 2, 2, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, + 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, + 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, + 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 87, 79, 79, 79, 79, + 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, + 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, + 88, 79, 79, 79, 79, 79, 79, 79, 79, 89, 2, 2, 90, 
91, 2, 92, + 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 101, + 102, 103, 104, 105, 106, 107, 101, 102, 103, 104, 105, 106, 107, 101, 102, 103, + 104, 105, 106, 107, 101, 102, 103, 104, 105, 106, 107, 101, 102, 103, 104, 105, + 106, 107, 101, 102, 103, 104, 105, 106, 107, 101, 102, 103, 104, 105, 106, 107, + 101, 102, 103, 104, 105, 106, 107, 101, 102, 103, 104, 105, 106, 107, 101, 102, + 103, 104, 105, 106, 107, 101, 102, 103, 104, 105, 106, 107, 101, 102, 103, 108, + 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, + 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, + 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, + 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, + 110, 110, 79, 79, 79, 79, 111, 112, 2, 2, 113, 114, 115, 116, 117, 118, + 119, 120, 121, 122, 110, 123, 124, 125, 2, 126, 127, 110, 2, 2, 128, 110, + 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 110, 110, 139, 110, 110, 110, + 140, 141, 142, 143, 144, 145, 146, 110, 110, 147, 110, 148, 149, 150, 151, 110, + 110, 152, 110, 110, 110, 153, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, + 2, 2, 2, 2, 2, 2, 2, 154, 155, 2, 156, 110, 110, 110, 110, 110, + 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, + 2, 2, 2, 2, 157, 158, 159, 2, 160, 110, 110, 110, 110, 110, 110, 110, + 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, + 110, 110, 110, 110, 110, 110, 110, 110, 2, 2, 2, 161, 162, 110, 110, 110, + 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, + 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, + 2, 2, 2, 2, 163, 164, 165, 166, 110, 110, 110, 110, 110, 110, 167, 168, + 169, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, + 110, 110, 110, 110, 110, 110, 110, 110, 170, 171, 110, 110, 110, 110, 110, 110, 
+ 2, 172, 173, 174, 175, 110, 176, 110, 177, 178, 179, 2, 2, 180, 2, 181, + 2, 2, 2, 2, 182, 183, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, + 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, + 2, 184, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 185, 186, 110, 110, + 187, 188, 189, 190, 191, 110, 79, 192, 79, 193, 194, 195, 196, 197, 198, 199, + 200, 201, 202, 203, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, + 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, + 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 204, + 205, 110, 206, 207, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, + 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, +}; + +static RE_UINT16 re_line_break_stage_3[] = { + 0, 1, 2, 3, 4, 5, 4, 6, 7, 1, 8, 9, 4, 10, 4, 10, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 11, 12, 4, 4, + 1, 1, 1, 1, 13, 14, 15, 16, 17, 4, 18, 4, 4, 4, 4, 4, + 19, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 20, 4, 21, 20, 4, + 22, 23, 1, 24, 25, 26, 27, 28, 29, 30, 4, 4, 31, 1, 32, 33, + 4, 4, 4, 4, 4, 34, 35, 36, 37, 38, 4, 1, 39, 4, 4, 4, + 4, 4, 40, 41, 36, 4, 31, 42, 4, 43, 44, 45, 4, 46, 47, 47, + 47, 47, 4, 48, 47, 47, 49, 1, 50, 4, 4, 51, 1, 52, 53, 4, + 54, 55, 56, 57, 58, 59, 60, 61, 62, 55, 56, 63, 64, 65, 66, 67, + 68, 18, 56, 69, 70, 71, 60, 72, 73, 55, 56, 69, 74, 75, 60, 76, + 77, 78, 79, 80, 81, 82, 66, 83, 84, 85, 56, 86, 87, 88, 60, 89, + 90, 85, 56, 91, 87, 92, 60, 93, 90, 85, 4, 94, 95, 96, 60, 97, + 98, 99, 4, 100, 101, 102, 66, 103, 104, 105, 105, 106, 107, 108, 47, 47, + 109, 110, 111, 112, 113, 114, 47, 47, 115, 116, 36, 117, 118, 4, 119, 120, + 121, 122, 1, 123, 124, 125, 47, 47, 105, 105, 105, 105, 126, 105, 105, 105, + 105, 127, 4, 4, 128, 4, 4, 4, 129, 129, 129, 129, 129, 129, 130, 130, + 130, 130, 131, 132, 132, 132, 132, 132, 4, 4, 4, 4, 133, 134, 4, 4, + 133, 4, 4, 135, 136, 137, 4, 4, 4, 136, 4, 4, 4, 138, 139, 119, + 4, 140, 4, 4, 4, 4, 4, 
141, 142, 4, 4, 4, 4, 4, 4, 4, + 142, 143, 4, 4, 4, 4, 144, 145, 146, 147, 4, 148, 4, 149, 146, 150, + 105, 105, 105, 105, 105, 151, 152, 140, 153, 152, 4, 4, 4, 4, 4, 76, + 4, 4, 154, 4, 4, 4, 4, 155, 4, 45, 156, 156, 157, 105, 158, 159, + 105, 105, 160, 105, 161, 162, 4, 4, 4, 163, 105, 105, 105, 164, 105, 165, + 152, 152, 158, 166, 47, 47, 47, 47, 167, 4, 4, 168, 169, 170, 171, 172, + 173, 4, 174, 36, 4, 4, 40, 175, 4, 4, 168, 176, 177, 36, 4, 178, + 47, 47, 47, 47, 76, 179, 180, 181, 4, 4, 4, 4, 1, 1, 1, 182, + 4, 141, 4, 4, 141, 183, 4, 184, 4, 4, 4, 185, 185, 186, 4, 187, + 188, 189, 190, 191, 192, 193, 194, 195, 196, 119, 197, 198, 199, 1, 1, 200, + 201, 202, 203, 4, 4, 204, 205, 206, 207, 206, 4, 4, 4, 208, 4, 4, + 209, 210, 211, 212, 213, 214, 215, 4, 216, 217, 218, 219, 4, 4, 220, 4, + 221, 222, 223, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 224, + 4, 4, 225, 47, 226, 47, 227, 227, 227, 227, 227, 227, 227, 227, 227, 228, + 227, 227, 227, 227, 205, 227, 227, 229, 227, 230, 231, 232, 233, 234, 235, 4, + 236, 237, 4, 238, 239, 4, 240, 241, 4, 242, 4, 243, 244, 245, 246, 247, + 248, 4, 4, 4, 4, 249, 250, 251, 227, 252, 4, 4, 253, 4, 254, 4, + 255, 256, 4, 4, 4, 221, 4, 257, 4, 4, 4, 4, 4, 258, 4, 259, + 4, 260, 4, 261, 56, 262, 263, 47, 4, 4, 45, 4, 4, 45, 4, 4, + 4, 4, 4, 4, 4, 4, 264, 265, 4, 4, 128, 4, 4, 4, 266, 267, + 4, 225, 268, 268, 268, 268, 1, 1, 269, 270, 271, 272, 273, 47, 47, 47, + 274, 275, 274, 274, 274, 274, 274, 276, 274, 274, 274, 274, 274, 274, 274, 274, + 274, 274, 274, 274, 274, 277, 47, 278, 279, 280, 281, 282, 283, 274, 284, 274, + 285, 286, 287, 274, 284, 274, 285, 288, 289, 274, 290, 291, 274, 274, 274, 274, + 292, 274, 274, 293, 274, 274, 276, 294, 274, 292, 274, 274, 295, 274, 274, 274, + 274, 274, 274, 274, 274, 274, 274, 292, 274, 274, 274, 274, 4, 4, 4, 4, + 274, 296, 274, 274, 274, 274, 274, 274, 297, 274, 274, 274, 298, 4, 4, 178, + 299, 4, 300, 47, 4, 4, 264, 301, 4, 302, 4, 4, 4, 4, 4, 303, + 4, 4, 184, 76, 47, 47, 47, 304, 305, 4, 
306, 307, 4, 4, 4, 308, + 309, 4, 4, 168, 310, 152, 1, 311, 36, 4, 312, 4, 313, 314, 129, 315, + 50, 4, 4, 316, 317, 318, 105, 319, 4, 4, 320, 321, 322, 323, 105, 105, + 105, 105, 105, 105, 324, 325, 31, 326, 327, 328, 268, 4, 4, 4, 155, 4, + 4, 4, 4, 4, 4, 4, 329, 152, 330, 331, 332, 333, 332, 334, 332, 330, + 331, 332, 333, 332, 334, 332, 330, 331, 332, 333, 332, 334, 332, 330, 331, 332, + 333, 332, 334, 332, 330, 331, 332, 333, 332, 334, 332, 330, 331, 332, 333, 332, + 334, 332, 330, 331, 332, 333, 332, 334, 332, 330, 331, 332, 333, 332, 334, 332, + 333, 332, 335, 130, 336, 132, 132, 337, 338, 338, 338, 338, 338, 338, 338, 338, + 47, 47, 47, 47, 47, 47, 47, 47, 225, 339, 340, 341, 342, 4, 4, 4, + 4, 4, 4, 4, 262, 343, 4, 4, 4, 4, 4, 344, 47, 4, 4, 4, + 4, 345, 4, 4, 76, 47, 47, 346, 1, 347, 1, 348, 349, 350, 351, 185, + 4, 4, 4, 4, 4, 4, 4, 352, 353, 354, 274, 355, 274, 356, 357, 358, + 4, 359, 4, 45, 360, 361, 362, 363, 364, 4, 137, 365, 184, 184, 47, 47, + 4, 4, 4, 4, 4, 4, 4, 226, 366, 4, 4, 367, 4, 4, 4, 4, + 119, 368, 71, 47, 47, 4, 4, 369, 4, 119, 4, 4, 4, 71, 33, 368, + 4, 4, 370, 4, 226, 4, 4, 371, 4, 372, 4, 4, 373, 374, 47, 47, + 4, 184, 152, 47, 47, 47, 47, 47, 4, 4, 76, 4, 4, 4, 375, 47, + 4, 4, 4, 225, 4, 155, 76, 47, 376, 4, 4, 377, 4, 378, 4, 4, + 4, 45, 304, 47, 47, 47, 4, 379, 4, 380, 4, 381, 47, 47, 47, 47, + 4, 4, 4, 382, 4, 345, 4, 4, 383, 384, 4, 385, 76, 386, 4, 4, + 4, 4, 47, 47, 4, 4, 387, 388, 4, 4, 4, 389, 4, 260, 4, 390, + 4, 391, 392, 47, 47, 47, 47, 47, 4, 4, 4, 4, 145, 47, 47, 47, + 4, 4, 4, 393, 4, 4, 4, 394, 47, 47, 47, 47, 47, 47, 4, 45, + 173, 4, 4, 395, 396, 345, 397, 398, 173, 4, 4, 399, 400, 4, 145, 152, + 173, 4, 313, 401, 402, 4, 4, 403, 173, 4, 4, 316, 404, 405, 20, 48, + 4, 18, 406, 407, 47, 47, 47, 47, 408, 37, 409, 4, 4, 264, 410, 152, + 411, 55, 56, 69, 74, 412, 413, 414, 4, 4, 4, 1, 415, 152, 47, 47, + 4, 4, 264, 416, 417, 418, 47, 47, 4, 4, 4, 1, 419, 152, 47, 47, + 4, 4, 31, 420, 152, 47, 47, 47, 105, 421, 160, 422, 
47, 47, 47, 47, + 47, 47, 4, 4, 4, 4, 36, 423, 47, 47, 47, 47, 4, 4, 4, 145, + 4, 140, 47, 47, 47, 47, 47, 47, 4, 4, 4, 4, 4, 4, 45, 424, + 4, 4, 4, 4, 370, 47, 47, 47, 4, 4, 4, 4, 4, 425, 4, 4, + 426, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 427, + 4, 4, 45, 47, 47, 47, 47, 47, 4, 4, 4, 4, 428, 4, 4, 4, + 4, 4, 4, 4, 225, 47, 47, 47, 4, 4, 4, 145, 4, 45, 429, 47, + 47, 47, 47, 47, 47, 4, 184, 430, 4, 4, 4, 431, 432, 433, 18, 434, + 4, 47, 47, 47, 47, 47, 47, 47, 4, 4, 4, 4, 48, 435, 1, 166, + 398, 173, 47, 47, 47, 47, 47, 47, 436, 47, 47, 47, 47, 47, 47, 47, + 4, 4, 4, 4, 4, 4, 226, 119, 145, 437, 438, 47, 47, 47, 47, 47, + 4, 4, 4, 4, 4, 4, 4, 155, 4, 4, 21, 4, 4, 4, 439, 1, + 440, 4, 441, 4, 4, 4, 145, 47, 4, 4, 4, 4, 442, 47, 47, 47, + 4, 4, 4, 4, 4, 225, 4, 262, 4, 4, 4, 4, 4, 185, 4, 4, + 4, 146, 443, 444, 445, 4, 4, 4, 446, 447, 4, 448, 449, 85, 4, 4, + 4, 4, 260, 4, 4, 4, 4, 4, 4, 4, 4, 4, 450, 451, 451, 451, + 1, 1, 1, 452, 1, 1, 453, 454, 455, 456, 23, 47, 47, 47, 47, 47, + 4, 4, 4, 4, 457, 321, 47, 47, 445, 4, 458, 459, 460, 461, 462, 463, + 464, 368, 465, 368, 47, 47, 47, 262, 274, 274, 278, 274, 274, 274, 274, 274, + 274, 276, 292, 291, 291, 291, 274, 277, 466, 227, 467, 227, 227, 227, 468, 227, + 227, 469, 47, 47, 47, 47, 470, 471, 472, 274, 274, 293, 473, 436, 47, 47, + 274, 474, 274, 475, 274, 274, 274, 476, 274, 274, 477, 478, 274, 274, 274, 274, + 479, 480, 481, 482, 483, 274, 274, 275, 274, 274, 484, 274, 274, 485, 274, 486, + 274, 274, 274, 274, 274, 4, 4, 487, 274, 274, 274, 274, 274, 488, 297, 276, + 4, 4, 4, 4, 4, 4, 4, 370, 4, 4, 4, 4, 4, 48, 47, 47, + 368, 4, 4, 4, 76, 140, 4, 4, 76, 4, 184, 47, 47, 47, 47, 47, + 47, 473, 47, 47, 47, 47, 47, 47, 489, 47, 47, 47, 488, 47, 47, 47, + 274, 274, 274, 274, 274, 274, 274, 290, 490, 47, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 47, +}; + +static RE_UINT8 re_line_break_stage_4[] = { + 0, 0, 0, 0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 12, 12, 
12, 13, 14, 15, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 16, 17, 14, + 14, 14, 14, 14, 14, 16, 18, 19, 0, 0, 20, 0, 0, 0, 0, 0, + 21, 22, 23, 24, 25, 26, 27, 14, 22, 28, 29, 28, 28, 26, 28, 30, + 14, 14, 14, 24, 14, 14, 14, 14, 14, 14, 14, 24, 31, 28, 31, 14, + 25, 14, 14, 14, 28, 28, 24, 32, 0, 0, 0, 0, 0, 0, 0, 33, + 0, 0, 0, 0, 0, 0, 34, 34, 34, 35, 0, 0, 0, 0, 0, 0, + 14, 14, 14, 14, 36, 14, 14, 37, 36, 36, 14, 14, 14, 38, 38, 14, + 14, 39, 14, 14, 14, 14, 14, 14, 14, 19, 0, 0, 0, 14, 14, 14, + 39, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 38, 39, 14, 14, 14, + 14, 14, 14, 14, 40, 41, 39, 9, 42, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 43, 19, 44, 0, 45, 36, 36, 36, 36, + 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 36, 36, + 46, 48, 38, 36, 36, 36, 36, 36, 14, 14, 14, 14, 49, 50, 13, 14, + 0, 0, 0, 0, 0, 51, 52, 53, 14, 14, 14, 14, 14, 19, 0, 0, + 12, 12, 12, 12, 12, 54, 55, 14, 44, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 56, 0, 0, 0, 44, 19, 0, 0, 44, 19, 44, 0, 0, 14, + 12, 12, 12, 12, 12, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 39, + 19, 14, 14, 14, 14, 14, 14, 14, 0, 0, 0, 0, 0, 52, 39, 14, + 14, 14, 14, 0, 0, 0, 0, 0, 44, 36, 36, 36, 36, 36, 36, 36, + 0, 0, 14, 14, 57, 38, 36, 36, 14, 14, 14, 0, 0, 19, 0, 0, + 0, 0, 19, 0, 19, 0, 0, 36, 14, 14, 14, 14, 14, 14, 14, 38, + 14, 14, 14, 14, 19, 0, 36, 38, 36, 36, 36, 36, 36, 36, 36, 36, + 14, 14, 38, 36, 36, 36, 36, 36, 36, 42, 0, 0, 0, 0, 0, 0, + 0, 0, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 0, 44, 0, + 19, 0, 0, 0, 14, 14, 14, 14, 14, 0, 58, 12, 12, 12, 12, 12, + 19, 0, 39, 14, 14, 14, 38, 39, 38, 39, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 38, 14, 14, 14, 38, 38, 36, 14, 14, 36, 44, 0, + 0, 0, 52, 42, 52, 42, 0, 38, 36, 36, 36, 42, 36, 36, 14, 39, + 14, 0, 36, 12, 12, 12, 12, 12, 14, 50, 14, 14, 49, 9, 36, 36, + 42, 0, 39, 14, 14, 38, 36, 39, 38, 14, 39, 38, 14, 36, 52, 0, + 0, 52, 36, 42, 52, 42, 0, 36, 42, 36, 36, 36, 39, 14, 38, 38, + 36, 36, 36, 12, 12, 12, 12, 12, 0, 14, 
19, 36, 36, 36, 36, 36, + 42, 0, 39, 14, 14, 14, 14, 39, 38, 14, 39, 14, 14, 36, 44, 0, + 0, 0, 0, 42, 0, 42, 0, 36, 38, 36, 36, 36, 36, 36, 36, 36, + 9, 36, 36, 36, 39, 36, 36, 36, 42, 0, 39, 14, 14, 14, 38, 39, + 0, 0, 52, 42, 52, 42, 0, 36, 36, 36, 36, 0, 36, 36, 14, 39, + 14, 14, 14, 14, 36, 36, 36, 36, 36, 44, 39, 14, 14, 38, 36, 14, + 38, 14, 14, 36, 39, 38, 38, 14, 36, 39, 38, 36, 14, 38, 36, 14, + 14, 14, 14, 14, 14, 36, 36, 0, 0, 52, 36, 0, 52, 0, 0, 36, + 38, 36, 36, 42, 36, 36, 36, 36, 14, 14, 14, 14, 9, 38, 36, 36, + 0, 0, 39, 14, 14, 14, 38, 14, 38, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 36, 39, 0, 0, 0, 52, 0, 52, 0, 0, 36, + 36, 36, 42, 52, 14, 38, 36, 36, 36, 36, 36, 36, 14, 14, 14, 14, + 42, 0, 39, 14, 14, 14, 38, 14, 14, 14, 39, 14, 14, 36, 44, 0, + 36, 36, 42, 52, 36, 36, 36, 38, 39, 38, 36, 36, 36, 36, 36, 36, + 14, 14, 14, 14, 14, 38, 39, 0, 0, 0, 52, 0, 52, 0, 0, 38, + 36, 36, 36, 42, 36, 36, 36, 39, 14, 14, 14, 36, 59, 14, 14, 14, + 36, 0, 39, 14, 14, 14, 14, 14, 14, 14, 14, 38, 36, 14, 14, 14, + 14, 39, 14, 14, 14, 14, 39, 36, 14, 14, 14, 38, 36, 52, 36, 42, + 0, 0, 52, 52, 0, 0, 0, 0, 36, 0, 38, 36, 36, 36, 36, 36, + 60, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, + 61, 61, 61, 61, 61, 62, 36, 63, 61, 61, 61, 61, 61, 61, 61, 64, + 12, 12, 12, 12, 12, 58, 36, 36, 60, 62, 62, 60, 62, 62, 60, 36, + 36, 36, 61, 61, 60, 61, 61, 61, 60, 61, 60, 60, 36, 61, 60, 61, + 61, 61, 61, 61, 61, 60, 61, 36, 61, 61, 62, 62, 61, 61, 61, 36, + 12, 12, 12, 12, 12, 36, 61, 61, 32, 65, 29, 65, 66, 67, 68, 53, + 53, 69, 56, 14, 0, 14, 14, 14, 14, 14, 43, 19, 19, 70, 70, 0, + 14, 14, 14, 14, 39, 14, 14, 14, 14, 14, 14, 14, 14, 14, 38, 36, + 42, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 14, 14, 19, 0, + 0, 0, 0, 0, 42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 52, 58, + 14, 14, 14, 44, 14, 14, 38, 14, 65, 71, 14, 14, 72, 73, 36, 36, + 12, 12, 12, 12, 12, 58, 14, 14, 12, 12, 12, 12, 12, 61, 61, 61, + 14, 14, 14, 39, 36, 36, 39, 36, 74, 74, 74, 74, 74, 74, 74, 74, 
+ 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 76, 76, 76, 76, + 76, 76, 76, 76, 76, 76, 76, 76, 14, 14, 14, 14, 38, 14, 14, 36, + 14, 14, 14, 38, 38, 14, 14, 36, 38, 14, 14, 36, 14, 14, 14, 38, + 38, 14, 14, 36, 14, 14, 14, 14, 14, 14, 14, 38, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 38, 42, 0, 27, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 36, 36, 36, 14, 14, 14, 36, 14, 14, 14, 36, + 77, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 16, 78, 36, + 14, 14, 14, 14, 14, 27, 58, 14, 14, 14, 14, 14, 38, 36, 36, 36, + 14, 14, 14, 14, 14, 14, 38, 14, 14, 0, 52, 36, 36, 36, 36, 36, + 14, 0, 1, 41, 36, 36, 36, 36, 14, 0, 36, 36, 36, 36, 36, 36, + 38, 0, 36, 36, 36, 36, 36, 36, 61, 61, 58, 79, 77, 80, 61, 36, + 12, 12, 12, 12, 12, 36, 36, 36, 14, 53, 58, 29, 53, 19, 0, 73, + 14, 14, 14, 14, 19, 38, 36, 36, 14, 14, 14, 36, 36, 36, 36, 36, + 0, 0, 0, 0, 0, 0, 36, 36, 38, 36, 53, 12, 12, 12, 12, 12, + 61, 61, 61, 61, 61, 61, 61, 36, 61, 61, 62, 36, 36, 36, 36, 36, + 61, 61, 61, 61, 61, 61, 36, 36, 61, 61, 61, 61, 61, 36, 36, 36, + 12, 12, 12, 12, 12, 62, 36, 61, 14, 14, 14, 19, 0, 0, 36, 14, + 61, 61, 61, 61, 61, 61, 61, 62, 61, 61, 61, 61, 61, 61, 62, 42, + 0, 0, 0, 0, 0, 0, 0, 52, 0, 0, 44, 14, 14, 14, 14, 14, + 14, 14, 0, 0, 0, 0, 0, 0, 0, 0, 44, 14, 14, 14, 36, 36, + 12, 12, 12, 12, 12, 58, 27, 58, 77, 14, 14, 14, 14, 19, 0, 0, + 0, 0, 14, 14, 14, 14, 38, 36, 0, 44, 14, 14, 14, 14, 14, 14, + 19, 0, 0, 0, 0, 0, 0, 14, 0, 0, 36, 36, 36, 36, 14, 14, + 0, 0, 0, 0, 36, 81, 58, 58, 12, 12, 12, 12, 12, 36, 39, 14, + 14, 14, 14, 14, 14, 14, 14, 58, 0, 44, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 44, 14, 19, 14, 14, 0, 44, 38, 0, 36, 36, 36, + 0, 0, 0, 36, 36, 36, 0, 0, 14, 14, 14, 14, 39, 39, 39, 39, + 14, 14, 14, 14, 14, 14, 14, 36, 14, 14, 38, 14, 14, 14, 14, 14, + 14, 14, 36, 14, 14, 14, 39, 14, 36, 14, 38, 14, 14, 14, 32, 38, + 58, 58, 58, 82, 58, 83, 0, 0, 82, 58, 84, 25, 85, 86, 85, 86, + 28, 14, 87, 88, 89, 0, 0, 33, 50, 50, 50, 50, 7, 90, 91, 14, + 14, 14, 92, 93, 91, 14, 
14, 14, 14, 14, 14, 77, 58, 58, 27, 58, + 94, 14, 38, 0, 0, 0, 0, 0, 14, 36, 25, 14, 14, 14, 16, 95, + 24, 28, 25, 14, 14, 14, 16, 78, 23, 23, 23, 6, 23, 23, 23, 23, + 23, 23, 23, 22, 23, 6, 23, 22, 23, 23, 23, 23, 23, 23, 23, 23, + 52, 36, 36, 36, 36, 36, 36, 36, 14, 49, 24, 14, 49, 14, 14, 14, + 14, 24, 14, 96, 14, 14, 14, 14, 24, 25, 14, 14, 14, 24, 14, 14, + 14, 14, 28, 14, 14, 24, 14, 25, 28, 28, 28, 28, 28, 28, 14, 14, + 28, 28, 28, 28, 28, 14, 14, 14, 14, 14, 14, 14, 24, 14, 36, 36, + 14, 25, 25, 14, 14, 14, 14, 14, 25, 28, 14, 24, 25, 24, 14, 24, + 24, 23, 24, 14, 14, 25, 24, 28, 25, 24, 24, 24, 28, 28, 25, 25, + 14, 14, 28, 28, 14, 14, 28, 14, 14, 14, 14, 14, 25, 14, 25, 14, + 14, 25, 14, 14, 14, 14, 14, 14, 28, 14, 28, 28, 14, 28, 14, 28, + 14, 28, 14, 28, 14, 14, 14, 14, 14, 14, 24, 14, 24, 14, 14, 14, + 14, 14, 24, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 24, + 14, 14, 14, 14, 14, 14, 14, 97, 14, 14, 14, 14, 70, 70, 14, 14, + 14, 25, 14, 14, 14, 98, 14, 14, 14, 14, 14, 14, 16, 99, 14, 14, + 98, 98, 14, 14, 14, 38, 36, 36, 14, 14, 14, 38, 36, 36, 36, 36, + 14, 14, 14, 14, 14, 38, 36, 36, 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 25, 28, 28, 25, 14, 14, 14, 14, 14, + 14, 28, 28, 14, 14, 14, 14, 14, 28, 24, 28, 28, 28, 14, 14, 14, + 14, 28, 14, 28, 14, 14, 28, 14, 28, 14, 14, 28, 25, 24, 14, 28, + 28, 14, 14, 14, 14, 14, 14, 14, 14, 28, 28, 14, 14, 14, 14, 24, + 98, 98, 24, 25, 24, 14, 14, 28, 14, 14, 98, 28, 100, 98, 98, 98, + 14, 14, 14, 14, 101, 98, 14, 14, 25, 25, 14, 14, 14, 14, 14, 14, + 28, 24, 28, 24, 102, 25, 28, 24, 14, 14, 14, 14, 14, 14, 14, 101, + 14, 14, 14, 14, 14, 14, 14, 28, 14, 14, 14, 14, 14, 14, 101, 98, + 98, 98, 98, 98, 102, 28, 103, 101, 98, 103, 102, 28, 98, 28, 102, 103, + 98, 24, 14, 14, 28, 102, 28, 28, 103, 98, 98, 103, 98, 102, 103, 98, + 98, 98, 100, 14, 98, 98, 98, 14, 14, 14, 14, 24, 14, 7, 85, 85, + 5, 53, 14, 14, 70, 70, 70, 70, 70, 70, 70, 28, 28, 28, 28, 28, + 28, 28, 14, 14, 14, 14, 14, 14, 
14, 14, 16, 99, 14, 14, 14, 14, + 14, 14, 14, 70, 70, 70, 70, 70, 14, 16, 104, 104, 104, 104, 104, 104, + 104, 104, 104, 104, 99, 14, 14, 14, 14, 14, 14, 14, 14, 14, 70, 14, + 14, 14, 24, 28, 28, 14, 14, 14, 14, 14, 36, 14, 14, 14, 14, 14, + 14, 14, 14, 36, 14, 14, 14, 14, 14, 14, 14, 14, 14, 36, 39, 14, + 14, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 19, 0, 14, 36, 36, 105, 58, 77, 106, + 14, 14, 14, 14, 36, 36, 36, 39, 41, 36, 36, 36, 36, 36, 36, 42, + 14, 14, 14, 38, 14, 14, 14, 38, 85, 85, 85, 85, 85, 85, 85, 58, + 58, 58, 58, 27, 107, 14, 85, 14, 85, 70, 70, 70, 70, 58, 58, 56, + 58, 27, 77, 14, 14, 108, 58, 77, 58, 109, 36, 36, 36, 36, 36, 36, + 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 110, 98, 98, + 98, 98, 36, 36, 36, 36, 36, 36, 98, 98, 98, 36, 36, 36, 36, 36, + 98, 98, 98, 98, 98, 98, 36, 36, 18, 111, 112, 98, 70, 70, 70, 70, + 70, 98, 70, 70, 70, 70, 113, 114, 98, 98, 98, 98, 98, 0, 0, 0, + 98, 98, 115, 98, 98, 112, 116, 98, 117, 118, 118, 118, 118, 98, 98, 98, + 98, 118, 98, 98, 98, 98, 98, 98, 98, 118, 118, 118, 98, 98, 98, 119, + 98, 98, 118, 120, 42, 121, 91, 116, 122, 118, 118, 118, 118, 98, 98, 98, + 98, 98, 118, 119, 98, 112, 123, 116, 36, 36, 110, 98, 98, 98, 98, 98, + 98, 98, 98, 98, 98, 98, 98, 36, 110, 98, 98, 98, 98, 98, 98, 98, + 98, 98, 98, 98, 98, 98, 98, 124, 98, 98, 98, 98, 98, 124, 36, 36, + 125, 125, 125, 125, 125, 125, 125, 125, 98, 98, 98, 98, 28, 28, 28, 28, + 98, 98, 112, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 124, 36, + 98, 98, 98, 124, 36, 36, 36, 36, 14, 14, 14, 14, 14, 14, 27, 106, + 12, 12, 12, 12, 12, 14, 36, 36, 0, 44, 0, 0, 0, 0, 0, 14, + 14, 14, 14, 14, 14, 14, 14, 0, 0, 27, 58, 58, 36, 36, 36, 36, + 36, 36, 36, 39, 14, 14, 14, 14, 14, 44, 14, 44, 14, 19, 14, 14, + 14, 19, 0, 0, 14, 14, 36, 36, 14, 14, 14, 14, 126, 36, 36, 36, + 14, 14, 65, 53, 36, 36, 36, 36, 0, 14, 14, 14, 14, 14, 14, 14, + 0, 0, 52, 36, 36, 36, 36, 58, 0, 14, 14, 14, 14, 14, 29, 36, + 14, 
14, 14, 0, 0, 0, 0, 58, 14, 14, 14, 19, 0, 0, 0, 0, + 0, 0, 36, 36, 36, 36, 36, 39, 74, 74, 74, 74, 74, 74, 127, 36, + 14, 19, 0, 0, 0, 0, 0, 0, 44, 14, 14, 27, 58, 14, 14, 39, + 12, 12, 12, 12, 12, 36, 36, 14, 12, 12, 12, 12, 12, 61, 61, 62, + 14, 14, 14, 14, 19, 0, 0, 0, 0, 0, 0, 52, 36, 36, 36, 36, + 14, 19, 14, 14, 14, 14, 0, 36, 12, 12, 12, 12, 12, 36, 27, 58, + 61, 62, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 60, 61, 61, + 58, 14, 19, 52, 36, 36, 36, 36, 39, 14, 14, 38, 39, 14, 14, 38, + 39, 14, 14, 38, 36, 36, 36, 36, 14, 19, 0, 0, 0, 1, 0, 36, + 128, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 128, 129, + 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 128, 129, 129, 129, + 129, 129, 128, 129, 129, 129, 129, 129, 129, 129, 36, 36, 36, 36, 36, 36, + 75, 75, 75, 130, 36, 131, 76, 76, 76, 76, 76, 76, 76, 76, 36, 36, + 132, 132, 132, 132, 132, 132, 132, 132, 36, 39, 14, 14, 36, 36, 133, 134, + 46, 46, 46, 46, 48, 46, 46, 46, 46, 46, 46, 47, 46, 46, 47, 47, + 46, 133, 47, 46, 46, 46, 46, 46, 36, 39, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 104, 36, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 126, 36, 135, 136, 57, 137, 138, 36, 36, 36, + 98, 98, 139, 104, 104, 104, 104, 104, 104, 104, 111, 139, 111, 98, 98, 98, + 111, 78, 91, 53, 139, 104, 104, 111, 98, 98, 98, 124, 140, 141, 36, 36, + 14, 14, 14, 14, 14, 14, 38, 142, 105, 98, 6, 98, 70, 98, 111, 111, + 98, 98, 98, 98, 98, 91, 98, 143, 98, 98, 98, 98, 98, 139, 144, 98, + 98, 98, 98, 98, 98, 139, 144, 139, 114, 70, 93, 145, 125, 125, 125, 125, + 146, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 91, + 36, 14, 14, 14, 36, 14, 14, 14, 36, 14, 14, 14, 36, 14, 38, 36, + 22, 98, 140, 147, 14, 14, 14, 38, 36, 36, 36, 36, 42, 0, 148, 36, + 14, 14, 14, 14, 14, 14, 39, 14, 14, 14, 14, 14, 14, 38, 14, 39, + 58, 41, 36, 39, 14, 14, 14, 14, 14, 14, 36, 39, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 36, 36, 14, 14, 14, 14, 14, 14, 19, 36, + 14, 14, 36, 36, 
36, 36, 36, 36, 14, 14, 14, 0, 0, 52, 36, 36, + 14, 14, 14, 14, 14, 14, 14, 81, 14, 14, 36, 36, 14, 14, 14, 14, + 77, 14, 14, 36, 36, 36, 36, 36, 14, 14, 36, 36, 36, 36, 36, 39, + 14, 14, 14, 36, 38, 14, 14, 14, 14, 14, 14, 39, 38, 36, 38, 39, + 14, 14, 14, 81, 14, 14, 14, 14, 14, 38, 14, 36, 36, 39, 14, 14, + 14, 14, 14, 14, 14, 14, 36, 81, 14, 14, 14, 14, 14, 36, 36, 39, + 14, 14, 14, 14, 36, 36, 14, 14, 19, 0, 42, 52, 36, 36, 0, 0, + 14, 14, 39, 14, 39, 14, 14, 14, 14, 14, 36, 36, 0, 52, 36, 42, + 58, 58, 58, 58, 38, 36, 36, 36, 14, 14, 19, 52, 36, 39, 14, 14, + 58, 58, 58, 149, 36, 36, 36, 36, 14, 14, 14, 36, 81, 58, 58, 58, + 14, 38, 36, 36, 14, 14, 14, 14, 14, 36, 36, 36, 39, 14, 38, 36, + 36, 36, 36, 36, 39, 14, 14, 14, 14, 38, 36, 36, 36, 36, 36, 36, + 14, 38, 36, 36, 36, 14, 14, 14, 14, 14, 14, 14, 0, 0, 0, 0, + 0, 0, 0, 1, 77, 14, 14, 36, 14, 14, 14, 12, 12, 12, 12, 12, + 36, 36, 36, 36, 36, 36, 36, 42, 0, 0, 0, 0, 0, 44, 14, 58, + 58, 36, 36, 36, 36, 36, 36, 36, 0, 0, 52, 12, 12, 12, 12, 12, + 58, 58, 36, 36, 36, 36, 36, 36, 14, 19, 32, 38, 36, 36, 36, 36, + 44, 14, 27, 77, 77, 0, 44, 36, 12, 12, 12, 12, 12, 32, 27, 58, + 14, 14, 14, 14, 14, 14, 0, 0, 0, 0, 0, 0, 58, 27, 77, 36, + 14, 14, 14, 38, 38, 14, 14, 39, 14, 14, 14, 14, 27, 36, 36, 36, + 0, 0, 0, 0, 0, 52, 36, 36, 0, 0, 39, 14, 14, 14, 38, 39, + 38, 36, 36, 42, 36, 36, 39, 14, 14, 0, 36, 0, 0, 0, 52, 36, + 0, 0, 52, 36, 36, 36, 36, 36, 0, 0, 14, 14, 36, 36, 36, 36, + 0, 0, 0, 36, 0, 0, 0, 0, 150, 58, 53, 14, 27, 58, 58, 58, + 58, 58, 58, 58, 14, 14, 0, 36, 1, 77, 38, 36, 36, 36, 36, 36, + 0, 0, 0, 0, 36, 36, 36, 36, 61, 61, 61, 61, 61, 36, 60, 61, + 12, 12, 12, 12, 12, 61, 58, 151, 14, 38, 36, 36, 36, 36, 36, 39, + 58, 58, 41, 36, 36, 36, 36, 36, 14, 14, 14, 14, 152, 70, 114, 14, + 14, 99, 14, 70, 70, 14, 14, 14, 14, 14, 14, 14, 16, 114, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 70, 12, 12, 12, 12, 12, 36, 36, 58, + 0, 0, 1, 36, 36, 36, 36, 36, 0, 0, 0, 1, 58, 14, 14, 14, + 14, 14, 77, 36, 36, 36, 
36, 36, 12, 12, 12, 12, 12, 39, 14, 14, + 14, 14, 14, 14, 36, 36, 39, 14, 19, 0, 0, 0, 0, 0, 0, 0, + 98, 36, 36, 36, 36, 36, 36, 36, 14, 14, 14, 14, 14, 36, 19, 1, + 0, 0, 36, 36, 36, 36, 36, 36, 14, 14, 19, 0, 0, 14, 19, 0, + 0, 44, 19, 0, 0, 0, 14, 14, 14, 14, 14, 14, 14, 0, 0, 14, + 14, 0, 44, 36, 36, 36, 36, 36, 36, 38, 39, 38, 39, 14, 38, 14, + 14, 14, 14, 14, 14, 39, 39, 14, 14, 14, 39, 14, 14, 14, 14, 14, + 14, 14, 14, 39, 14, 38, 39, 14, 14, 14, 38, 14, 14, 14, 38, 14, + 14, 14, 14, 14, 14, 39, 14, 38, 14, 14, 38, 38, 36, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 36, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 0, 0, 0, 44, 14, 19, 0, 0, 0, 0, 0, 0, 0, 0, 44, 14, + 14, 14, 19, 14, 14, 14, 14, 14, 14, 14, 44, 27, 58, 77, 36, 36, + 36, 36, 36, 36, 36, 42, 0, 0, 14, 14, 38, 39, 14, 14, 14, 14, + 39, 38, 38, 39, 39, 14, 14, 14, 14, 38, 14, 14, 39, 39, 36, 36, + 36, 38, 36, 39, 39, 39, 39, 14, 39, 38, 38, 39, 39, 39, 39, 39, + 39, 38, 38, 39, 14, 38, 14, 14, 14, 38, 14, 14, 39, 14, 38, 38, + 14, 14, 14, 14, 14, 39, 14, 14, 39, 14, 39, 14, 14, 39, 14, 14, + 28, 28, 28, 28, 28, 28, 153, 36, 28, 28, 28, 28, 28, 28, 28, 38, + 28, 28, 28, 28, 28, 14, 36, 36, 28, 28, 28, 28, 28, 153, 36, 36, + 36, 36, 36, 154, 154, 154, 154, 154, 154, 154, 154, 154, 154, 154, 154, 154, + 98, 124, 36, 36, 36, 36, 36, 36, 98, 98, 98, 98, 124, 36, 36, 36, + 98, 98, 98, 98, 98, 98, 14, 98, 98, 98, 100, 101, 98, 98, 101, 98, + 98, 98, 98, 98, 98, 100, 14, 14, 101, 101, 101, 98, 98, 98, 98, 100, + 100, 101, 98, 98, 98, 98, 98, 98, 14, 14, 14, 101, 98, 98, 98, 98, + 98, 98, 98, 100, 14, 14, 14, 14, 14, 14, 101, 98, 98, 98, 98, 98, + 98, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 98, 98, 98, + 98, 98, 110, 98, 98, 98, 98, 98, 98, 98, 14, 14, 14, 14, 98, 98, + 98, 98, 14, 14, 14, 98, 98, 98, 14, 14, 14, 85, 155, 91, 14, 14, + 124, 36, 36, 36, 36, 36, 36, 36, 98, 98, 124, 36, 36, 36, 36, 36, + 42, 36, 36, 36, 36, 36, 36, 36, +}; + +static RE_UINT8 re_line_break_stage_5[] = { + 16, 16, 16, 18, 22, 20, 
20, 21, 19, 6, 3, 12, 9, 10, 12, 3, + 1, 36, 12, 9, 8, 15, 8, 7, 11, 11, 8, 8, 12, 12, 12, 6, + 12, 1, 9, 36, 18, 2, 12, 16, 16, 29, 4, 1, 10, 9, 9, 9, + 12, 25, 25, 12, 25, 3, 12, 18, 25, 25, 17, 12, 25, 1, 17, 25, + 12, 17, 16, 4, 4, 4, 4, 16, 0, 0, 8, 12, 12, 0, 0, 12, + 0, 8, 18, 0, 0, 16, 18, 16, 16, 12, 6, 16, 37, 37, 37, 0, + 37, 12, 12, 10, 10, 10, 16, 6, 16, 0, 6, 6, 10, 11, 11, 12, + 6, 12, 8, 6, 18, 18, 0, 10, 0, 24, 24, 24, 24, 0, 0, 9, + 24, 12, 17, 17, 4, 17, 17, 18, 4, 6, 4, 12, 1, 2, 18, 17, + 12, 4, 4, 0, 31, 31, 32, 32, 33, 33, 18, 12, 2, 0, 5, 24, + 18, 9, 0, 18, 18, 4, 18, 28, 26, 25, 3, 3, 1, 3, 14, 14, + 14, 18, 20, 20, 3, 25, 5, 5, 8, 1, 2, 5, 30, 12, 2, 25, + 9, 12, 12, 14, 13, 13, 2, 12, 13, 12, 12, 13, 13, 25, 25, 13, + 2, 1, 0, 6, 6, 18, 1, 18, 26, 26, 1, 0, 0, 13, 2, 13, + 13, 5, 5, 1, 2, 2, 13, 16, 5, 13, 0, 38, 13, 38, 38, 13, + 38, 0, 16, 5, 5, 38, 38, 5, 13, 0, 38, 38, 10, 12, 31, 0, + 34, 35, 35, 35, 32, 0, 0, 33, 27, 27, 0, 37, 16, 37, 8, 2, + 2, 8, 6, 1, 2, 14, 13, 1, 13, 9, 10, 13, 0, 30, 13, 6, + 13, 2, 12, 38, 38, 12, 9, 0, 23, 25, 14, 0, 16, 17, 18, 24, + 1, 1, 25, 0, 39, 39, 3, 5, +}; + +/* Line_Break: 8608 bytes. */ + +RE_UINT32 re_get_line_break(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 12; + code = ch ^ (f << 12); + pos = (RE_UINT32)re_line_break_stage_1[f] << 5; + f = code >> 7; + code ^= f << 7; + pos = (RE_UINT32)re_line_break_stage_2[pos + f] << 3; + f = code >> 4; + code ^= f << 4; + pos = (RE_UINT32)re_line_break_stage_3[pos + f] << 3; + f = code >> 1; + code ^= f << 1; + pos = (RE_UINT32)re_line_break_stage_4[pos + f] << 1; + value = re_line_break_stage_5[pos + code]; + + return value; +} + +/* Numeric_Type. 
*/ + +static RE_UINT8 re_numeric_type_stage_1[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 11, 11, 11, 12, + 13, 14, 15, 11, 11, 11, 16, 11, 11, 11, 11, 11, 11, 17, 18, 19, + 20, 11, 21, 22, 11, 11, 23, 11, 11, 11, 11, 11, 11, 11, 11, 24, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, +}; + +static RE_UINT8 re_numeric_type_stage_2[] = { + 0, 1, 1, 1, 1, 1, 2, 3, 1, 4, 5, 6, 7, 8, 9, 10, + 11, 1, 1, 12, 1, 1, 13, 14, 15, 16, 17, 18, 19, 1, 1, 1, + 20, 21, 1, 1, 22, 1, 1, 23, 1, 1, 1, 1, 24, 1, 1, 1, + 25, 26, 27, 1, 28, 1, 1, 1, 29, 1, 1, 30, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 31, 32, + 1, 33, 1, 34, 1, 1, 35, 1, 36, 1, 1, 1, 1, 1, 37, 38, + 1, 1, 39, 40, 1, 1, 1, 41, 1, 1, 1, 1, 1, 1, 1, 42, + 1, 1, 1, 43, 1, 1, 44, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 45, 1, 1, 1, 46, 1, 1, 1, 1, 1, 1, 1, 47, 48, 1, 1, + 1, 1, 1, 1, 1, 1, 49, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 50, 1, 51, 52, 53, 54, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 55, 1, 1, 1, 1, 1, 15, + 1, 56, 57, 58, 59, 1, 1, 1, 60, 61, 62, 63, 64, 1, 65, 1, + 66, 67, 54, 1, 68, 1, 
69, 70, 71, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 72, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 73, 74, 1, 1, 1, 1, + 1, 1, 1, 75, 1, 1, 1, 76, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 77, 1, 1, 1, 1, 1, 1, 1, + 1, 78, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 79, 80, 1, 1, 1, 1, 1, 1, 1, 81, 82, 83, 1, 1, 1, 1, + 1, 1, 1, 84, 1, 1, 1, 1, 1, 85, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 86, 1, 1, 1, 1, + 1, 1, 87, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 84, 1, 1, 1, 1, 1, 1, 1, +}; + +static RE_UINT8 re_numeric_type_stage_3[] = { + 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 3, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 3, 0, + 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0, 4, 0, 0, 0, 4, + 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, 8, 0, 0, 0, 4, + 0, 0, 0, 9, 0, 0, 0, 4, 0, 0, 1, 0, 0, 0, 1, 0, + 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 1, 0, 0, 0, + 0, 0, 0, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, + 0, 0, 0, 0, 0, 0, 0, 13, 1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 4, 0, 0, 0, 14, 0, 0, 0, 0, 0, 15, 0, 0, 0, + 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 15, 0, 0, 0, 0, 0, + 0, 0, 0, 16, 17, 0, 0, 0, 0, 0, 18, 19, 20, 0, 0, 0, + 0, 0, 0, 21, 22, 0, 0, 23, 0, 0, 0, 24, 25, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 26, 27, 28, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 29, 0, 0, 0, 0, 30, 31, 0, 30, 32, 0, 0, + 33, 0, 0, 0, 34, 0, 0, 0, 0, 35, 0, 0, 0, 0, 0, 0, + 0, 0, 36, 0, 0, 0, 0, 0, 37, 0, 26, 0, 38, 39, 40, 41, + 36, 0, 0, 42, 0, 0, 0, 0, 43, 0, 44, 45, 0, 0, 0, 0, + 0, 0, 46, 0, 0, 0, 47, 0, 0, 0, 0, 0, 0, 0, 48, 0, + 0, 0, 0, 0, 0, 0, 0, 49, 0, 0, 0, 50, 0, 0, 0, 51, + 52, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 53, + 0, 0, 54, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 0, + 44, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 56, 0, 0, 0, + 0, 0, 0, 53, 0, 0, 0, 0, 0, 0, 0, 0, 44, 0, 0, 0, + 0, 54, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 57, 0, 0, + 0, 42, 0, 0, 0, 0, 0, 0, 0, 58, 59, 60, 0, 0, 0, 56, + 0, 3, 0, 0, 0, 0, 0, 61, 0, 62, 0, 0, 0, 0, 1, 0, + 3, 0, 0, 0, 0, 0, 1, 1, 0, 0, 
1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 63, 0, 55, 64, 26, + 65, 66, 19, 67, 68, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 69, + 0, 70, 71, 0, 0, 0, 72, 0, 0, 0, 0, 0, 0, 3, 0, 0, + 0, 0, 73, 74, 0, 75, 0, 76, 77, 0, 0, 0, 0, 78, 79, 19, + 0, 0, 80, 81, 82, 0, 0, 83, 0, 0, 73, 73, 0, 84, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 85, 0, 0, 0, 86, 0, 0, 0, 0, + 0, 0, 87, 88, 0, 0, 0, 1, 0, 89, 0, 0, 0, 0, 1, 90, + 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 3, 0, + 0, 91, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 92, + 19, 19, 19, 93, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, + 0, 0, 94, 95, 0, 0, 0, 0, 0, 0, 0, 96, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 97, 98, 0, 0, 0, 0, 0, 0, 75, 0, + 99, 0, 0, 0, 0, 0, 0, 0, 58, 0, 0, 43, 0, 0, 0, 100, + 0, 58, 0, 0, 0, 0, 0, 0, 0, 35, 0, 0, 101, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 102, 103, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 42, 0, 0, 0, 0, 0, 0, 0, 60, 0, 0, 0, + 48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 36, 0, 0, 0, 0, +}; + +static RE_UINT8 re_numeric_type_stage_4[] = { + 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 3, 4, 1, 2, 0, 0, + 5, 1, 0, 0, 5, 1, 6, 7, 5, 1, 8, 0, 5, 1, 9, 0, + 5, 1, 0, 10, 5, 1, 11, 0, 1, 12, 13, 0, 0, 14, 15, 16, + 0, 17, 18, 0, 1, 2, 19, 7, 0, 0, 1, 20, 1, 2, 1, 2, + 0, 0, 21, 22, 23, 22, 0, 0, 0, 0, 19, 19, 19, 19, 19, 19, + 24, 7, 0, 0, 23, 25, 26, 27, 19, 23, 25, 13, 0, 28, 29, 30, + 0, 0, 31, 32, 23, 33, 34, 0, 0, 0, 0, 35, 36, 0, 0, 0, + 37, 7, 0, 9, 0, 0, 38, 0, 19, 7, 0, 0, 0, 19, 37, 19, + 0, 0, 37, 19, 35, 0, 0, 0, 39, 0, 0, 0, 0, 40, 0, 0, + 0, 35, 0, 0, 41, 42, 0, 0, 0, 43, 44, 0, 0, 0, 0, 36, + 18, 0, 0, 36, 0, 18, 0, 0, 0, 0, 18, 0, 43, 0, 0, 0, + 45, 0, 0, 0, 0, 46, 0, 0, 47, 43, 0, 0, 48, 0, 0, 0, + 0, 0, 0, 39, 0, 0, 42, 42, 0, 0, 0, 40, 0, 0, 0, 17, + 0, 49, 18, 0, 0, 0, 0, 45, 0, 43, 0, 0, 0, 0, 40, 0, + 0, 0, 45, 0, 0, 45, 39, 0, 42, 0, 0, 0, 45, 43, 0, 0, + 0, 0, 0, 18, 17, 19, 0, 0, 0, 0, 11, 0, 0, 39, 39, 18, + 0, 0, 50, 0, 36, 19, 19, 19, 19, 19, 13, 0, 19, 19, 19, 18, + 0, 51, 0, 0, 37, 19, 19, 13, 13, 0, 0, 0, 42, 40, 0, 0, + 0, 
0, 52, 0, 0, 0, 0, 19, 0, 0, 0, 37, 36, 19, 0, 0, + 0, 0, 0, 53, 0, 0, 17, 13, 0, 0, 0, 54, 19, 19, 8, 19, + 55, 0, 0, 0, 0, 0, 0, 56, 0, 0, 0, 57, 0, 53, 0, 0, + 0, 37, 0, 0, 0, 0, 0, 8, 23, 25, 19, 10, 0, 0, 58, 59, + 60, 1, 0, 0, 0, 0, 5, 1, 37, 19, 16, 0, 0, 0, 1, 61, + 1, 12, 9, 0, 19, 10, 0, 0, 0, 0, 1, 62, 7, 0, 0, 0, + 19, 19, 7, 0, 0, 5, 1, 1, 1, 1, 1, 1, 23, 63, 0, 0, + 40, 0, 0, 0, 39, 43, 0, 43, 0, 40, 0, 35, 0, 0, 0, 42, +}; + +static RE_UINT8 re_numeric_type_stage_5[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, + 0, 2, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 3, 3, + 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, + 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, + 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, + 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, + 3, 3, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 2, 2, 2, + 2, 2, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 1, 1, 0, 0, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, + 0, 0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 1, 2, 0, 0, 0, 0, 0, 0, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, + 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, + 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, + 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, + 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, + 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, + 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 1, 1, 0, 0, 2, 2, 2, 2, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, + 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, + 1, 1, 
1, 1, 1, 1, 3, 3, 3, 3, 1, 1, 0, 0, 0, 0, + 3, 3, 0, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 0, 0, 0, +}; + +/* Numeric_Type: 2304 bytes. */ + +RE_UINT32 re_get_numeric_type(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 12; + code = ch ^ (f << 12); + pos = (RE_UINT32)re_numeric_type_stage_1[f] << 4; + f = code >> 8; + code ^= f << 8; + pos = (RE_UINT32)re_numeric_type_stage_2[pos + f] << 3; + f = code >> 5; + code ^= f << 5; + pos = (RE_UINT32)re_numeric_type_stage_3[pos + f] << 2; + f = code >> 3; + code ^= f << 3; + pos = (RE_UINT32)re_numeric_type_stage_4[pos + f] << 3; + value = re_numeric_type_stage_5[pos + code]; + + return value; +} + +/* Numeric_Value. */ + +static RE_UINT8 re_numeric_value_stage_1[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 11, 11, 11, 12, + 13, 14, 15, 11, 11, 11, 16, 11, 11, 11, 11, 11, 11, 17, 18, 19, + 20, 11, 21, 22, 11, 11, 23, 11, 11, 11, 11, 11, 11, 11, 11, 24, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, +}; + +static RE_UINT8 re_numeric_value_stage_2[] = { + 0, 1, 1, 1, 1, 1, 2, 3, 1, 4, 5, 6, 7, 8, 9, 10, + 11, 1, 1, 12, 
1, 1, 13, 14, 15, 16, 17, 18, 19, 1, 1, 1, + 20, 21, 1, 1, 22, 1, 1, 23, 1, 1, 1, 1, 24, 1, 1, 1, + 25, 26, 27, 1, 28, 1, 1, 1, 29, 1, 1, 30, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 31, 32, + 1, 33, 1, 34, 1, 1, 35, 1, 36, 1, 1, 1, 1, 1, 37, 38, + 1, 1, 39, 40, 1, 1, 1, 41, 1, 1, 1, 1, 1, 1, 1, 42, + 1, 1, 1, 43, 1, 1, 44, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 45, 1, 1, 1, 46, 1, 1, 1, 1, 1, 1, 1, 47, 48, 1, 1, + 1, 1, 1, 1, 1, 1, 49, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 50, 1, 51, 52, 53, 54, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 55, 1, 1, 1, 1, 1, 15, + 1, 56, 57, 58, 59, 1, 1, 1, 60, 61, 62, 63, 64, 1, 65, 1, + 66, 67, 54, 1, 68, 1, 69, 70, 71, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 72, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 73, 74, 1, 1, 1, 1, + 1, 1, 1, 75, 1, 1, 1, 76, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 77, 1, 1, 1, 1, 1, 1, 1, + 1, 78, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 79, 80, 1, 1, 1, 1, 1, 1, 1, 81, 82, 83, 1, 1, 1, 1, + 1, 1, 1, 84, 1, 1, 1, 1, 1, 85, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 86, 1, 1, 1, 1, + 1, 1, 87, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 88, 1, 1, 1, 1, 1, 1, 1, +}; + +static RE_UINT8 re_numeric_value_stage_3[] = { + 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 3, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 3, 0, + 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0, 4, 0, 0, 0, 4, + 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, 8, 0, 0, 0, 4, + 0, 0, 0, 9, 0, 0, 0, 4, 0, 0, 1, 0, 0, 0, 1, 0, + 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 1, 0, 0, 0, + 0, 0, 0, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, + 0, 0, 0, 0, 0, 0, 0, 13, 1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 4, 0, 0, 0, 14, 0, 0, 0, 0, 0, 13, 0, 0, 0, + 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, + 0, 0, 0, 15, 3, 0, 0, 0, 0, 0, 16, 17, 18, 0, 0, 0, + 0, 0, 0, 19, 20, 0, 0, 21, 0, 0, 0, 22, 23, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 24, 25, 26, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 
0, 27, 0, 0, 0, 0, 28, 29, 0, 28, 30, 0, 0, + 31, 0, 0, 0, 32, 0, 0, 0, 0, 33, 0, 0, 0, 0, 0, 0, + 0, 0, 34, 0, 0, 0, 0, 0, 35, 0, 36, 0, 37, 38, 39, 40, + 41, 0, 0, 42, 0, 0, 0, 0, 43, 0, 44, 45, 0, 0, 0, 0, + 0, 0, 46, 0, 0, 0, 47, 0, 0, 0, 0, 0, 0, 0, 48, 0, + 0, 0, 0, 0, 0, 0, 0, 49, 0, 0, 0, 50, 0, 0, 0, 51, + 52, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 53, + 0, 0, 54, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 0, + 56, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 57, 0, 0, 0, + 0, 0, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0, 0, 0, + 0, 60, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 61, 0, 0, + 0, 62, 0, 0, 0, 0, 0, 0, 0, 63, 64, 65, 0, 0, 0, 66, + 0, 3, 0, 0, 0, 0, 0, 67, 0, 68, 0, 0, 0, 0, 1, 0, + 3, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 69, 0, 70, 71, 72, + 73, 74, 75, 76, 77, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 78, + 0, 79, 80, 0, 0, 0, 81, 0, 0, 0, 0, 0, 0, 3, 0, 0, + 0, 0, 82, 83, 0, 84, 0, 85, 86, 0, 0, 0, 0, 87, 88, 89, + 0, 0, 90, 91, 92, 0, 0, 93, 0, 0, 94, 94, 0, 95, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 96, 0, 0, 0, 97, 0, 0, 0, 0, + 0, 0, 98, 99, 0, 0, 0, 1, 0, 100, 0, 0, 0, 0, 1, 101, + 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 3, 0, + 0, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 103, + 104, 105, 106, 107, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, + 0, 0, 108, 109, 0, 0, 0, 0, 0, 0, 0, 110, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 111, 112, 0, 0, 0, 0, 0, 0, 113, 0, + 114, 0, 0, 0, 0, 0, 0, 0, 115, 0, 0, 116, 0, 0, 0, 117, + 0, 118, 0, 0, 0, 0, 0, 0, 0, 119, 0, 0, 120, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 121, 122, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 62, 0, 0, 0, 0, 0, 0, 0, 123, 0, 0, 0, + 124, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 125, 0, 0, 0, 0, + 0, 0, 0, 0, 126, 0, 0, 0, +}; + +static RE_UINT8 re_numeric_value_stage_4[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 0, + 0, 0, 0, 0, 4, 0, 5, 6, 1, 2, 3, 0, 0, 0, 0, 0, + 0, 7, 8, 9, 0, 0, 0, 0, 0, 7, 8, 9, 0, 10, 11, 0, + 0, 7, 8, 9, 12, 13, 0, 0, 0, 7, 8, 9, 14, 0, 0, 0, + 0, 7, 8, 9, 0, 0, 1, 15, 0, 7, 8, 9, 16, 
17, 0, 0, + 1, 2, 18, 19, 20, 0, 0, 0, 0, 0, 21, 2, 22, 23, 24, 25, + 0, 0, 0, 26, 27, 0, 0, 0, 1, 2, 3, 0, 1, 2, 3, 0, + 0, 0, 0, 0, 1, 2, 28, 0, 0, 0, 0, 0, 29, 2, 3, 0, + 0, 0, 0, 0, 30, 31, 32, 33, 34, 35, 36, 37, 34, 35, 36, 37, + 38, 39, 40, 0, 0, 0, 0, 0, 34, 35, 36, 41, 42, 34, 35, 36, + 41, 42, 34, 35, 36, 41, 42, 0, 0, 0, 43, 44, 45, 46, 2, 47, + 0, 0, 0, 0, 0, 48, 49, 50, 34, 35, 51, 49, 50, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 52, 0, 53, 0, 0, 0, 0, 0, 0, + 21, 2, 3, 0, 0, 0, 54, 0, 0, 0, 0, 0, 48, 55, 0, 0, + 34, 35, 56, 0, 0, 0, 0, 0, 0, 0, 57, 58, 59, 60, 61, 62, + 0, 0, 0, 0, 63, 64, 65, 66, 0, 67, 0, 0, 0, 0, 0, 0, + 68, 0, 0, 0, 0, 0, 0, 0, 0, 0, 69, 0, 0, 0, 0, 0, + 0, 0, 0, 70, 0, 0, 0, 0, 71, 72, 73, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 74, 0, 0, 0, 75, 0, 76, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 77, 78, 0, 0, 0, 0, 0, 0, 79, + 0, 0, 80, 0, 0, 0, 0, 0, 0, 0, 0, 67, 0, 0, 0, 0, + 0, 0, 0, 0, 81, 0, 0, 0, 0, 82, 0, 0, 0, 0, 0, 0, + 0, 83, 0, 0, 0, 0, 0, 0, 0, 0, 84, 85, 0, 0, 0, 0, + 86, 87, 0, 88, 0, 0, 0, 0, 89, 80, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 90, 0, 0, 0, 0, 0, 5, 0, 5, 0, + 0, 0, 0, 0, 0, 0, 91, 0, 0, 0, 0, 0, 0, 0, 0, 92, + 0, 0, 0, 15, 75, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 93, + 0, 0, 0, 94, 0, 0, 0, 0, 0, 0, 0, 0, 95, 0, 0, 0, + 0, 95, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 96, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 97, 0, 98, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 25, 0, 0, 0, 0, 0, 0, 0, 99, 68, 0, 0, 0, + 0, 0, 0, 0, 75, 0, 0, 0, 100, 0, 0, 0, 0, 0, 0, 0, + 0, 101, 0, 81, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 102, 0, + 0, 0, 0, 0, 0, 103, 0, 0, 0, 48, 49, 104, 0, 0, 0, 0, + 0, 0, 0, 0, 105, 106, 0, 0, 0, 0, 107, 0, 108, 0, 75, 0, + 0, 0, 0, 0, 103, 0, 0, 0, 0, 0, 0, 0, 109, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 110, 0, 111, 8, 9, 57, 58, 112, 113, + 114, 115, 116, 117, 118, 0, 0, 0, 119, 120, 121, 122, 123, 124, 125, 126, + 127, 128, 129, 130, 122, 131, 132, 0, 0, 0, 133, 0, 0, 0, 0, 0, + 21, 2, 22, 23, 24, 134, 135, 0, 136, 0, 0, 0, 0, 0, 0, 0, + 137, 0, 138, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 139, 140, 0, 0, + 0, 0, 0, 0, 0, 0, 141, 142, 0, 0, 0, 0, 0, 0, 21, 143, + 0, 111, 144, 145, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 111, 145, + 0, 0, 0, 0, 0, 146, 147, 0, 0, 0, 0, 0, 0, 0, 0, 148, + 34, 35, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, + 34, 163, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 164, + 0, 0, 0, 0, 0, 0, 0, 165, 0, 0, 111, 145, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 34, 163, 0, 0, 21, 166, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 167, 168, 34, 35, 149, 150, 169, 152, 170, 171, + 0, 0, 0, 0, 48, 49, 50, 172, 173, 174, 8, 9, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 7, 8, 9, 21, 2, 22, 23, 24, 175, 0, 0, + 0, 0, 0, 0, 1, 2, 22, 0, 1, 2, 22, 23, 176, 0, 0, 0, + 8, 9, 49, 177, 35, 178, 2, 179, 180, 181, 9, 182, 183, 182, 184, 185, + 186, 187, 188, 189, 144, 190, 191, 192, 193, 194, 195, 196, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 2, 197, 198, 199, 0, 0, 0, 0, 0, 0, 0, + 34, 35, 149, 150, 200, 0, 0, 0, 0, 0, 0, 7, 8, 9, 1, 2, + 201, 8, 9, 1, 2, 201, 8, 9, 0, 111, 8, 9, 0, 0, 0, 0, + 202, 49, 104, 29, 0, 0, 0, 0, 70, 0, 0, 0, 0, 0, 0, 0, + 0, 203, 0, 0, 0, 0, 0, 0, 98, 0, 0, 0, 0, 0, 0, 0, + 67, 0, 0, 0, 0, 0, 0, 0, 0, 0, 91, 0, 0, 0, 0, 0, + 204, 0, 0, 88, 0, 0, 0, 88, 0, 0, 101, 0, 0, 0, 0, 73, + 0, 0, 0, 0, 0, 0, 73, 0, 0, 0, 0, 0, 0, 0, 80, 0, + 0, 0, 0, 0, 0, 0, 107, 0, 0, 0, 0, 205, 0, 0, 0, 0, + 0, 0, 0, 0, 206, 0, 0, 0, +}; + +static RE_UINT8 re_numeric_value_stage_5[] = { + 0, 0, 0, 0, 2, 27, 29, 31, 33, 35, 37, 39, 41, 43, 0, 0, + 0, 0, 29, 31, 0, 27, 0, 0, 12, 17, 22, 0, 0, 0, 2, 27, + 29, 31, 33, 35, 37, 39, 41, 43, 3, 7, 10, 12, 22, 50, 0, 0, + 0, 0, 12, 17, 22, 3, 7, 10, 44, 89, 98, 0, 27, 29, 31, 0, + 44, 89, 98, 12, 17, 22, 0, 0, 41, 43, 17, 28, 30, 32, 34, 36, + 38, 40, 42, 1, 0, 27, 29, 31, 41, 43, 44, 54, 64, 74, 84, 85, + 86, 87, 88, 89, 107, 0, 0, 0, 0, 0, 51, 52, 53, 0, 0, 0, + 41, 43, 27, 0, 2, 0, 0, 0, 8, 6, 5, 13, 21, 11, 15, 19, + 23, 9, 24, 7, 14, 20, 25, 27, 27, 29, 31, 33, 35, 37, 39, 41, + 43, 44, 45, 46, 84, 89, 93, 98, 98, 102, 107, 
0, 0, 37, 84, 111, + 116, 2, 0, 0, 47, 48, 49, 50, 51, 52, 53, 54, 0, 0, 2, 45, + 46, 47, 48, 49, 50, 51, 52, 53, 54, 27, 29, 31, 41, 43, 44, 2, + 0, 0, 27, 29, 31, 33, 35, 37, 39, 41, 43, 44, 43, 44, 27, 29, + 0, 17, 0, 0, 0, 0, 0, 2, 44, 54, 64, 0, 31, 33, 0, 0, + 43, 44, 0, 0, 44, 54, 64, 74, 84, 85, 86, 87, 0, 55, 56, 57, + 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 0, 70, 71, 72, + 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 0, 35, 0, 0, + 0, 0, 0, 29, 0, 0, 35, 0, 0, 39, 0, 0, 27, 0, 0, 39, + 0, 0, 0, 107, 0, 31, 0, 0, 0, 43, 0, 0, 29, 0, 0, 0, + 35, 0, 33, 0, 0, 0, 0, 128, 44, 0, 0, 0, 0, 0, 0, 98, + 31, 0, 0, 0, 89, 0, 0, 0, 128, 0, 0, 0, 0, 0, 130, 0, + 0, 29, 0, 41, 0, 37, 0, 0, 0, 44, 0, 98, 54, 64, 0, 0, + 74, 0, 0, 0, 0, 31, 31, 31, 0, 0, 0, 33, 0, 0, 27, 0, + 0, 0, 43, 54, 0, 0, 44, 0, 41, 0, 0, 0, 0, 0, 39, 0, + 0, 0, 43, 0, 0, 0, 89, 0, 0, 0, 33, 0, 0, 0, 29, 0, + 0, 98, 0, 0, 0, 0, 37, 0, 37, 0, 0, 0, 0, 0, 2, 0, + 39, 41, 43, 2, 12, 17, 22, 3, 7, 10, 0, 0, 0, 0, 0, 31, + 0, 0, 0, 44, 0, 37, 0, 37, 0, 44, 0, 0, 0, 0, 0, 27, + 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, + 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 12, 17, 27, 35, + 84, 93, 102, 111, 35, 44, 84, 89, 93, 98, 102, 35, 44, 84, 89, 93, + 98, 107, 111, 44, 27, 27, 27, 29, 29, 29, 29, 35, 44, 44, 44, 44, + 44, 64, 84, 84, 84, 84, 89, 91, 93, 93, 93, 93, 84, 17, 17, 21, + 22, 0, 0, 0, 0, 0, 2, 12, 90, 91, 92, 93, 94, 95, 96, 97, + 27, 35, 44, 84, 0, 88, 0, 0, 0, 0, 97, 0, 0, 27, 29, 44, + 54, 89, 0, 0, 27, 29, 31, 44, 54, 89, 98, 107, 33, 35, 44, 54, + 29, 31, 33, 33, 35, 44, 54, 89, 0, 0, 27, 44, 54, 89, 29, 31, + 26, 17, 0, 0, 43, 44, 54, 64, 74, 84, 85, 86, 0, 0, 89, 90, + 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, + 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 119, 120, 122, 123, 124, + 125, 126, 4, 9, 12, 13, 16, 17, 18, 21, 22, 24, 44, 54, 89, 98, + 0, 27, 84, 0, 0, 27, 44, 54, 33, 44, 54, 89, 0, 
0, 27, 35, + 44, 84, 89, 98, 87, 88, 89, 90, 95, 96, 97, 17, 12, 13, 21, 0, + 54, 64, 74, 84, 85, 86, 87, 88, 89, 98, 2, 27, 98, 0, 0, 0, + 86, 87, 88, 0, 39, 41, 43, 33, 43, 27, 29, 31, 41, 43, 27, 29, + 31, 33, 35, 29, 31, 31, 33, 35, 27, 29, 31, 31, 33, 35, 118, 121, + 33, 35, 31, 31, 33, 33, 33, 33, 37, 39, 39, 39, 41, 41, 43, 43, + 43, 43, 29, 31, 33, 35, 37, 27, 35, 35, 29, 31, 27, 29, 13, 21, + 24, 13, 21, 7, 12, 9, 12, 12, 17, 13, 21, 74, 84, 33, 35, 37, + 39, 41, 43, 0, 41, 43, 0, 44, 89, 107, 127, 128, 129, 130, 0, 0, + 87, 88, 0, 0, 41, 43, 2, 27, 2, 2, 27, 29, 33, 0, 0, 0, + 0, 0, 0, 64, 0, 33, 0, 0, 43, 0, 0, 0, +}; + +/* Numeric_Value: 3228 bytes. */ + +RE_UINT32 re_get_numeric_value(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 12; + code = ch ^ (f << 12); + pos = (RE_UINT32)re_numeric_value_stage_1[f] << 4; + f = code >> 8; + code ^= f << 8; + pos = (RE_UINT32)re_numeric_value_stage_2[pos + f] << 3; + f = code >> 5; + code ^= f << 5; + pos = (RE_UINT32)re_numeric_value_stage_3[pos + f] << 3; + f = code >> 2; + code ^= f << 2; + pos = (RE_UINT32)re_numeric_value_stage_4[pos + f] << 2; + value = re_numeric_value_stage_5[pos + code]; + + return value; +} + +/* Bidi_Mirrored. 
*/ + +static RE_UINT8 re_bidi_mirrored_stage_1[] = { + 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, +}; + +static RE_UINT8 re_bidi_mirrored_stage_2[] = { + 0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 6, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, +}; + +static RE_UINT8 re_bidi_mirrored_stage_3[] = { + 0, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 3, 1, 1, 1, 1, + 4, 5, 1, 6, 7, 8, 1, 9, 10, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 11, + 1, 1, 1, 12, 1, 1, 1, 1, +}; + +static RE_UINT8 re_bidi_mirrored_stage_4[] = { + 0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 5, 3, 3, 3, 3, 3, + 6, 7, 8, 3, 3, 9, 3, 3, 10, 11, 12, 13, 14, 3, 3, 3, + 3, 3, 3, 3, 3, 15, 3, 16, 3, 3, 3, 3, 3, 3, 17, 18, + 19, 20, 21, 22, 3, 3, 3, 3, 23, 3, 3, 3, 3, 3, 3, 3, + 24, 3, 3, 3, 3, 3, 3, 3, 3, 25, 3, 3, 26, 27, 3, 3, + 3, 3, 3, 28, 29, 30, 31, 32, +}; + +static RE_UINT8 re_bidi_mirrored_stage_5[] = { + 0, 0, 0, 0, 0, 3, 0, 80, 0, 0, 0, 40, 0, 0, 0, 40, + 0, 0, 0, 0, 0, 8, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 60, 0, 0, 0, 24, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 6, 96, 0, 0, 0, 0, 0, 0, 96, + 0, 96, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, + 30, 63, 98, 188, 87, 248, 15, 250, 255, 31, 60, 128, 245, 207, 255, 255, + 255, 159, 7, 1, 204, 255, 255, 193, 0, 62, 195, 255, 255, 63, 255, 255, + 0, 15, 0, 0, 3, 6, 0, 0, 0, 0, 0, 0, 0, 255, 63, 0, + 121, 59, 120, 112, 252, 255, 0, 0, 248, 255, 255, 249, 255, 255, 0, 1, + 63, 194, 55, 31, 58, 3, 240, 51, 0, 252, 255, 223, 83, 122, 48, 112, + 0, 0, 128, 1, 48, 188, 25, 254, 255, 255, 255, 255, 207, 191, 255, 255, + 255, 255, 127, 80, 124, 112, 136, 47, 60, 54, 0, 48, 255, 3, 0, 0, + 0, 255, 243, 15, 0, 0, 0, 0, 0, 0, 0, 126, 48, 0, 0, 0, + 0, 3, 0, 80, 0, 0, 0, 40, 0, 0, 0, 168, 13, 0, 0, 0, + 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, + 0, 128, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, + 8, 0, 0, 0, 0, 0, 0, 0, +}; + 
+/* Bidi_Mirrored: 489 bytes. */ + +RE_UINT32 re_get_bidi_mirrored(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 16; + code = ch ^ (f << 16); + pos = (RE_UINT32)re_bidi_mirrored_stage_1[f] << 4; + f = code >> 12; + code ^= f << 12; + pos = (RE_UINT32)re_bidi_mirrored_stage_2[pos + f] << 3; + f = code >> 9; + code ^= f << 9; + pos = (RE_UINT32)re_bidi_mirrored_stage_3[pos + f] << 3; + f = code >> 6; + code ^= f << 6; + pos = (RE_UINT32)re_bidi_mirrored_stage_4[pos + f] << 6; + pos += code; + value = (re_bidi_mirrored_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Indic_Positional_Category. */ + +static RE_UINT8 re_indic_positional_category_stage_1[] = { + 0, 1, 1, 1, 1, 2, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, +}; + +static RE_UINT8 re_indic_positional_category_stage_2[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, + 8, 0, 0, 0, 0, 0, 0, 9, 0, 10, 11, 12, 13, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 14, 15, 16, 17, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 18, 0, 0, 0, 0, 0, + 19, 20, 21, 22, 23, 24, 25, 26, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +static RE_UINT8 re_indic_positional_category_stage_3[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 0, 0, 2, 3, 4, 5, 0, 6, 0, 0, 7, 8, 9, 5, 0, + 10, 0, 0, 7, 11, 0, 0, 12, 10, 0, 0, 7, 13, 0, 5, 0, + 6, 0, 0, 14, 15, 16, 5, 0, 17, 0, 0, 18, 19, 9, 0, 0, + 20, 0, 0, 21, 22, 23, 5, 0, 6, 0, 0, 14, 24, 25, 5, 0, + 6, 0, 0, 18, 26, 9, 
5, 0, 27, 0, 0, 0, 28, 29, 0, 27, + 0, 0, 0, 30, 31, 0, 0, 0, 0, 0, 0, 32, 33, 0, 0, 0, + 0, 34, 0, 35, 0, 0, 0, 36, 37, 38, 39, 40, 41, 0, 0, 0, + 0, 0, 42, 43, 0, 44, 45, 46, 47, 48, 0, 0, 0, 0, 0, 0, + 0, 49, 0, 49, 0, 50, 0, 50, 0, 0, 0, 51, 52, 53, 0, 0, + 0, 0, 54, 55, 0, 0, 0, 0, 0, 0, 0, 56, 57, 0, 0, 0, + 0, 58, 0, 0, 0, 59, 60, 61, 0, 0, 0, 0, 0, 0, 0, 0, + 62, 0, 0, 63, 64, 0, 65, 66, 67, 0, 68, 0, 0, 0, 69, 70, + 0, 0, 71, 72, 0, 0, 0, 0, 0, 0, 0, 0, 0, 73, 74, 75, + 76, 0, 77, 0, 0, 0, 0, 0, 78, 0, 0, 79, 80, 0, 81, 82, + 0, 0, 83, 0, 84, 70, 0, 0, 1, 0, 0, 85, 86, 0, 87, 0, + 0, 0, 88, 89, 90, 0, 0, 91, 0, 0, 0, 92, 93, 0, 94, 95, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 96, 0, + 97, 0, 0, 98, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 99, 0, 0, 100, 101, 0, 0, 0, 67, 0, 0, 102, 0, 0, 0, 0, + 103, 0, 104, 105, 0, 0, 0, 106, 67, 0, 0, 107, 108, 0, 0, 0, + 0, 0, 109, 110, 0, 0, 0, 0, 0, 0, 0, 0, 0, 111, 112, 0, + 6, 0, 0, 18, 113, 9, 114, 115, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 116, 117, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 118, 119, 120, 121, 0, 0, + 0, 0, 0, 122, 123, 0, 0, 0, 0, 0, 124, 125, 0, 0, 0, 0, + 0, 126, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +static RE_UINT8 re_indic_positional_category_stage_4[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 2, 3, 4, 5, 6, 7, 1, 2, 8, 5, 9, + 10, 7, 1, 6, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, + 10, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, + 5, 6, 3, 11, 12, 13, 14, 0, 0, 0, 0, 15, 0, 0, 0, 0, + 10, 2, 0, 0, 0, 0, 0, 0, 5, 3, 0, 10, 16, 10, 17, 0, + 1, 0, 18, 0, 0, 0, 0, 0, 5, 6, 7, 10, 19, 15, 5, 0, + 0, 0, 0, 0, 0, 0, 3, 20, 5, 6, 3, 11, 21, 13, 22, 0, + 0, 0, 0, 19, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 8, 2, 23, 0, 24, 12, 25, 26, 0, + 2, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, + 2, 8, 23, 1, 27, 1, 1, 0, 0, 0, 10, 3, 0, 0, 0, 0, + 28, 8, 23, 19, 29, 30, 1, 0, 0, 0, 15, 23, 0, 0, 0, 0, + 8, 5, 3, 24, 12, 25, 26, 0, 
0, 8, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 16, 0, 15, 8, 1, 3, 3, 4, 31, 32, 33, + 20, 8, 1, 1, 6, 3, 0, 0, 34, 34, 35, 10, 1, 1, 1, 16, + 20, 8, 1, 1, 6, 10, 3, 0, 34, 34, 36, 0, 1, 1, 1, 0, + 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 18, 18, 10, 0, 0, 4, + 18, 37, 6, 38, 38, 1, 1, 2, 37, 1, 3, 1, 0, 0, 18, 6, + 6, 6, 6, 6, 18, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 3, 0, 0, 0, 0, 3, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 15, 20, 17, 39, 1, 1, 17, 23, 2, 18, 3, + 0, 0, 0, 8, 6, 0, 0, 6, 3, 8, 23, 15, 8, 8, 8, 0, + 10, 1, 16, 0, 0, 0, 0, 0, 0, 40, 41, 2, 8, 8, 5, 15, + 0, 0, 0, 0, 0, 8, 20, 0, 0, 17, 3, 0, 0, 0, 0, 0, + 0, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 20, 1, 17, 6, 42, + 43, 24, 25, 2, 20, 1, 1, 1, 1, 10, 0, 0, 0, 0, 10, 0, + 1, 40, 44, 45, 2, 8, 0, 0, 8, 40, 8, 8, 5, 17, 0, 0, + 8, 8, 46, 34, 8, 35, 8, 8, 23, 0, 0, 0, 8, 0, 0, 0, + 0, 0, 0, 10, 39, 20, 0, 0, 0, 0, 11, 40, 1, 17, 6, 3, + 15, 2, 20, 1, 17, 7, 40, 24, 24, 41, 1, 1, 1, 1, 16, 18, + 1, 1, 23, 0, 0, 0, 0, 0, 0, 0, 2, 1, 6, 47, 48, 24, + 25, 19, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 7, 1, + 1, 1, 0, 0, 0, 0, 0, 0, 1, 23, 0, 0, 0, 0, 0, 0, + 15, 6, 17, 9, 1, 23, 6, 0, 0, 0, 0, 2, 1, 8, 20, 20, + 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 4, 49, 8, 7, 1, + 1, 1, 24, 17, 0, 0, 0, 0, 1, 16, 50, 6, 6, 1, 6, 6, + 2, 51, 51, 51, 52, 0, 18, 0, 0, 0, 16, 0, 0, 0, 0, 0, + 0, 0, 0, 16, 0, 10, 0, 0, 0, 15, 5, 2, 0, 0, 0, 0, + 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, + 8, 8, 3, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 18, 6, 0, + 0, 0, 0, 18, 6, 17, 6, 7, 0, 10, 8, 1, 6, 24, 2, 8, + 53, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 10, 1, 17, 54, 41, 40, 55, 3, 0, 0, 0, 0, + 0, 10, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 15, 2, 0, + 2, 1, 56, 57, 58, 46, 35, 1, 10, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 11, 7, 9, 0, 0, 15, 0, 0, 0, 0, 0, + 0, 15, 20, 8, 40, 23, 5, 0, 59, 6, 10, 52, 0, 0, 6, 7, + 0, 0, 0, 0, 17, 3, 0, 0, 20, 23, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 1, 6, 6, 6, 1, 1, 16, 
0, 0, 0, 0, + 4, 5, 7, 2, 5, 3, 0, 0, 1, 16, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 10, 1, 6, 41, 38, 17, 3, 16, 0, 0, 0, 0, 0, + 0, 18, 0, 0, 0, 0, 0, 0, 0, 15, 9, 6, 6, 6, 1, 19, + 23, 0, 0, 0, 0, 10, 3, 0, 0, 0, 0, 0, 0, 0, 8, 5, + 1, 30, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, + 4, 5, 7, 1, 17, 3, 0, 0, 2, 8, 23, 11, 12, 13, 33, 0, + 0, 8, 0, 1, 1, 1, 16, 0, 1, 1, 16, 0, 0, 0, 0, 0, + 4, 5, 6, 6, 39, 60, 33, 26, 2, 6, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 15, 9, 6, 6, 0, 49, 32, 1, 5, + 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, + 8, 5, 6, 6, 7, 2, 20, 5, 16, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 10, 20, 9, 6, 1, 1, 5, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 18, 10, 8, 1, 6, 41, 7, 1, 0, 0, +}; + +static RE_UINT8 re_indic_positional_category_stage_5[] = { + 0, 0, 5, 5, 5, 1, 6, 0, 1, 2, 1, 6, 6, 6, 6, 5, + 1, 1, 2, 1, 0, 5, 0, 2, 2, 0, 0, 4, 4, 6, 0, 1, + 5, 0, 5, 6, 0, 6, 5, 8, 1, 5, 9, 0, 10, 6, 1, 0, + 2, 2, 4, 4, 4, 5, 7, 0, 8, 1, 8, 0, 8, 8, 9, 2, + 4, 10, 4, 1, 3, 3, 3, 1, 3, 0, 5, 7, 7, 7, 6, 2, + 6, 1, 2, 5, 9, 10, 4, 2, 1, 8, 8, 5, 1, 3, 6, 11, + 7, 12, 2, 9, 13, 6, 13, 13, 13, 0, 11, 0, 5, 2, 2, 6, + 6, 3, 3, 5, 5, 3, 0, 13, 5, 9, +}; + +/* Indic_Positional_Category: 1842 bytes. */ + +RE_UINT32 re_get_indic_positional_category(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 13; + code = ch ^ (f << 13); + pos = (RE_UINT32)re_indic_positional_category_stage_1[f] << 5; + f = code >> 8; + code ^= f << 8; + pos = (RE_UINT32)re_indic_positional_category_stage_2[pos + f] << 4; + f = code >> 4; + code ^= f << 4; + pos = (RE_UINT32)re_indic_positional_category_stage_3[pos + f] << 3; + f = code >> 1; + code ^= f << 1; + pos = (RE_UINT32)re_indic_positional_category_stage_4[pos + f] << 1; + value = re_indic_positional_category_stage_5[pos + code]; + + return value; +} + +/* Indic_Syllabic_Category. 
*/ + +static RE_UINT8 re_indic_syllabic_category_stage_1[] = { + 0, 1, 2, 2, 2, 3, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, +}; + +static RE_UINT8 re_indic_syllabic_category_stage_2[] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4, 5, 6, 7, 8, + 9, 1, 1, 1, 1, 1, 1, 10, 1, 11, 12, 13, 14, 1, 1, 1, + 15, 1, 1, 1, 1, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 17, 18, 19, 20, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 21, 1, 1, 1, 1, 1, + 22, 23, 24, 25, 26, 27, 28, 29, 1, 1, 1, 1, 1, 1, 1, 1, +}; + +static RE_UINT8 re_indic_syllabic_category_stage_3[] = { + 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 3, 4, 0, 5, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 12, 20, + 21, 15, 16, 22, 23, 24, 25, 26, 27, 28, 16, 29, 30, 0, 12, 31, + 14, 15, 16, 29, 32, 33, 12, 34, 35, 36, 37, 38, 39, 40, 25, 0, + 41, 42, 16, 43, 44, 45, 12, 0, 46, 42, 16, 47, 44, 48, 12, 49, + 46, 42, 8, 50, 51, 52, 12, 53, 54, 55, 8, 56, 57, 58, 25, 59, + 60, 8, 61, 62, 63, 2, 0, 0, 64, 65, 66, 67, 68, 69, 0, 0, + 0, 0, 70, 71, 72, 8, 73, 74, 75, 76, 77, 78, 79, 0, 0, 0, + 8, 8, 80, 81, 82, 83, 84, 85, 86, 87, 0, 0, 0, 0, 0, 0, + 88, 89, 90, 89, 90, 91, 88, 92, 8, 8, 93, 94, 95, 96, 2, 0, + 97, 61, 98, 99, 25, 8, 100, 101, 8, 8, 102, 103, 104, 2, 0, 0, + 8, 105, 8, 8, 106, 107, 108, 109, 2, 2, 0, 0, 0, 0, 0, 0, + 110, 90, 8, 111, 112, 2, 0, 0, 113, 8, 114, 115, 8, 8, 116, 117, + 8, 8, 
118, 119, 120, 0, 0, 0, 0, 0, 0, 0, 0, 121, 122, 123, + 124, 125, 0, 0, 0, 0, 0, 126, 127, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128, 0, 0, 0, + 129, 8, 130, 0, 8, 131, 132, 133, 134, 135, 8, 136, 137, 2, 138, 122, + 139, 8, 140, 8, 141, 142, 0, 0, 143, 8, 8, 144, 145, 2, 146, 147, + 148, 8, 149, 150, 151, 2, 8, 152, 8, 8, 8, 153, 154, 0, 155, 156, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 157, 158, 159, 2, + 160, 161, 8, 162, 163, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 164, 90, 8, 165, 166, 167, 168, 169, 170, 8, 8, 171, 0, 0, 0, 0, + 172, 8, 173, 174, 0, 175, 8, 176, 177, 178, 8, 179, 180, 2, 181, 182, + 183, 184, 185, 186, 0, 0, 0, 0, 187, 188, 189, 190, 8, 191, 192, 2, + 193, 15, 16, 29, 32, 40, 194, 195, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 196, 8, 8, 197, 198, 2, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 199, 8, 200, 201, 202, 203, 0, 0, + 199, 8, 8, 204, 205, 2, 0, 0, 190, 8, 206, 207, 2, 0, 0, 0, + 8, 208, 209, 210, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +static RE_UINT8 re_indic_syllabic_category_stage_4[] = { + 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 3, 0, 4, 0, 0, 0, + 5, 0, 0, 0, 0, 6, 0, 0, 7, 8, 8, 8, 8, 9, 10, 10, + 10, 10, 10, 10, 10, 10, 11, 12, 13, 13, 13, 14, 15, 16, 10, 10, + 17, 18, 2, 2, 19, 8, 10, 10, 20, 21, 8, 22, 22, 9, 10, 10, + 10, 10, 23, 10, 24, 25, 26, 12, 13, 27, 27, 28, 0, 29, 0, 30, + 26, 0, 0, 0, 20, 21, 31, 32, 23, 33, 26, 34, 35, 29, 27, 36, + 0, 0, 37, 24, 0, 18, 2, 2, 38, 39, 0, 0, 20, 21, 8, 40, + 40, 9, 10, 10, 23, 37, 26, 12, 13, 41, 41, 36, 0, 0, 42, 0, + 13, 27, 27, 36, 0, 43, 0, 30, 42, 0, 0, 0, 44, 21, 31, 19, + 45, 46, 33, 23, 47, 48, 49, 25, 10, 10, 26, 43, 35, 43, 50, 36, + 0, 29, 0, 0, 7, 21, 8, 45, 45, 9, 10, 10, 10, 10, 26, 51, + 13, 50, 50, 36, 0, 52, 49, 0, 20, 21, 8, 45, 10, 37, 26, 12, + 0, 52, 0, 53, 54, 0, 0, 0, 10, 10, 49, 51, 13, 50, 50, 55, + 0, 29, 0, 32, 0, 0, 56, 57, 58, 21, 8, 8, 8, 31, 25, 10, + 30, 10, 10, 42, 10, 49, 59, 29, 13, 60, 13, 13, 43, 0, 0, 0, + 37, 10, 10, 10, 10, 10, 10, 49, 
13, 13, 61, 0, 13, 41, 62, 63, + 33, 64, 24, 42, 0, 10, 37, 10, 37, 65, 25, 33, 13, 13, 41, 66, + 13, 67, 62, 68, 2, 2, 3, 10, 2, 2, 2, 2, 2, 69, 70, 0, + 10, 10, 37, 10, 10, 10, 10, 48, 16, 13, 13, 71, 72, 73, 74, 75, + 76, 76, 77, 76, 76, 76, 76, 76, 76, 76, 76, 78, 0, 79, 0, 0, + 80, 8, 81, 13, 13, 82, 83, 84, 2, 2, 3, 85, 86, 17, 87, 88, + 89, 90, 91, 92, 93, 94, 10, 10, 95, 96, 62, 97, 2, 2, 98, 99, + 100, 10, 10, 23, 11, 101, 0, 0, 100, 10, 10, 10, 11, 0, 0, 0, + 102, 0, 0, 0, 103, 8, 8, 8, 8, 43, 13, 13, 13, 71, 104, 105, + 106, 0, 0, 107, 108, 10, 10, 10, 13, 13, 109, 0, 110, 111, 112, 0, + 113, 114, 114, 115, 116, 117, 0, 0, 10, 10, 10, 0, 13, 13, 13, 13, + 118, 111, 119, 0, 10, 120, 13, 0, 10, 10, 10, 80, 100, 121, 111, 122, + 123, 13, 13, 13, 13, 91, 124, 125, 126, 127, 8, 8, 10, 128, 13, 13, + 13, 129, 10, 0, 130, 8, 131, 10, 132, 13, 133, 134, 2, 2, 135, 136, + 10, 137, 13, 13, 138, 0, 0, 0, 10, 139, 13, 118, 111, 140, 0, 0, + 2, 2, 3, 37, 141, 142, 142, 142, 143, 0, 0, 0, 144, 145, 143, 0, + 0, 0, 0, 146, 147, 4, 0, 0, 0, 148, 0, 0, 5, 148, 0, 0, + 0, 0, 0, 4, 40, 149, 150, 10, 120, 13, 0, 0, 10, 10, 10, 151, + 152, 153, 154, 10, 155, 0, 0, 0, 156, 8, 8, 8, 131, 10, 10, 10, + 10, 157, 13, 13, 13, 158, 0, 0, 142, 142, 142, 142, 2, 2, 159, 10, + 151, 114, 160, 119, 10, 120, 13, 161, 162, 0, 0, 0, 163, 8, 9, 100, + 164, 13, 13, 165, 158, 0, 0, 0, 10, 166, 10, 10, 2, 2, 159, 49, + 8, 131, 10, 10, 10, 10, 93, 13, 167, 168, 0, 0, 111, 111, 111, 169, + 37, 0, 170, 92, 13, 13, 13, 96, 171, 0, 0, 0, 131, 10, 120, 13, + 0, 172, 0, 0, 10, 10, 10, 86, 173, 10, 174, 111, 175, 13, 35, 176, + 93, 52, 0, 71, 10, 37, 37, 10, 10, 0, 177, 178, 2, 2, 0, 0, + 179, 180, 8, 8, 10, 10, 13, 13, 13, 181, 0, 0, 182, 183, 183, 183, + 183, 184, 2, 2, 0, 0, 0, 185, 186, 8, 8, 9, 13, 13, 187, 0, + 186, 100, 10, 10, 10, 120, 13, 13, 188, 189, 2, 2, 114, 190, 10, 10, + 164, 0, 0, 0, 186, 8, 8, 8, 9, 10, 10, 10, 120, 13, 13, 13, + 191, 0, 192, 67, 193, 2, 2, 2, 2, 194, 0, 0, 8, 8, 10, 
10, + 30, 10, 10, 10, 10, 10, 10, 13, 13, 195, 0, 0, 8, 49, 23, 30, + 10, 10, 10, 30, 10, 10, 48, 0, 8, 8, 131, 10, 10, 10, 10, 150, + 13, 13, 196, 0, 7, 21, 8, 22, 17, 197, 142, 145, 142, 145, 0, 0, + 21, 8, 8, 100, 13, 13, 13, 198, 199, 107, 0, 0, 8, 8, 8, 131, + 10, 10, 10, 120, 13, 99, 13, 200, 201, 0, 0, 0, 0, 0, 8, 99, + 13, 13, 13, 202, 67, 0, 0, 0, 10, 10, 150, 203, 13, 204, 0, 0, + 10, 10, 26, 205, 13, 13, 206, 0, 2, 2, 2, 0, +}; + +static RE_UINT8 re_indic_syllabic_category_stage_5[] = { + 0, 0, 0, 0, 0, 11, 0, 0, 33, 33, 33, 33, 33, 33, 0, 0, + 11, 0, 0, 0, 0, 0, 28, 28, 0, 0, 0, 11, 1, 1, 1, 2, + 8, 8, 8, 8, 8, 12, 12, 12, 12, 12, 12, 12, 12, 12, 9, 9, + 4, 3, 9, 9, 9, 9, 9, 9, 9, 5, 9, 9, 0, 26, 26, 0, + 0, 9, 9, 9, 8, 8, 9, 9, 0, 0, 33, 33, 0, 0, 8, 8, + 0, 1, 1, 2, 0, 8, 8, 8, 8, 0, 0, 8, 12, 0, 12, 12, + 12, 0, 12, 0, 0, 0, 12, 12, 12, 12, 0, 0, 9, 0, 0, 9, + 9, 5, 13, 0, 0, 0, 0, 9, 12, 12, 0, 12, 8, 8, 8, 0, + 0, 0, 0, 8, 0, 12, 12, 0, 4, 0, 9, 9, 9, 9, 9, 0, + 9, 5, 0, 0, 0, 12, 12, 12, 1, 25, 11, 11, 0, 19, 0, 0, + 8, 8, 0, 8, 9, 9, 0, 9, 0, 12, 0, 0, 0, 0, 9, 9, + 0, 0, 1, 22, 8, 0, 8, 8, 8, 12, 0, 0, 0, 0, 0, 12, + 12, 0, 0, 0, 12, 12, 12, 0, 9, 0, 9, 9, 0, 3, 9, 9, + 0, 9, 9, 0, 0, 0, 12, 0, 0, 14, 14, 0, 9, 5, 16, 0, + 0, 0, 13, 13, 13, 13, 13, 13, 0, 0, 1, 2, 0, 0, 5, 0, + 9, 0, 9, 0, 9, 9, 6, 0, 24, 24, 24, 24, 29, 1, 6, 0, + 12, 0, 0, 12, 0, 12, 0, 12, 19, 19, 0, 0, 9, 0, 0, 0, + 0, 1, 0, 0, 0, 28, 0, 28, 0, 4, 0, 0, 9, 9, 1, 2, + 9, 9, 1, 1, 6, 3, 0, 0, 21, 21, 21, 21, 21, 18, 18, 18, + 18, 18, 18, 18, 0, 18, 18, 18, 18, 0, 0, 0, 0, 0, 28, 0, + 12, 8, 8, 8, 8, 8, 8, 9, 9, 9, 1, 24, 2, 7, 6, 19, + 19, 19, 19, 12, 0, 0, 11, 0, 12, 12, 8, 8, 9, 9, 12, 12, + 12, 12, 19, 19, 19, 12, 9, 24, 24, 12, 12, 9, 9, 24, 24, 24, + 24, 24, 12, 12, 12, 9, 9, 9, 9, 12, 12, 12, 12, 12, 19, 9, + 9, 9, 9, 24, 24, 24, 12, 24, 33, 33, 24, 24, 9, 9, 0, 0, + 8, 8, 8, 12, 6, 0, 0, 0, 12, 0, 9, 9, 12, 12, 12, 8, + 9, 27, 27, 28, 17, 29, 28, 28, 28, 6, 7, 28, 3, 0, 
0, 0, + 11, 12, 12, 12, 9, 18, 18, 18, 20, 20, 1, 20, 20, 20, 20, 20, + 20, 20, 9, 28, 12, 12, 12, 10, 10, 10, 10, 10, 10, 10, 0, 0, + 23, 23, 23, 23, 23, 0, 0, 0, 9, 20, 20, 20, 24, 24, 0, 0, + 12, 12, 12, 9, 12, 19, 19, 20, 20, 20, 20, 0, 7, 9, 9, 9, + 24, 24, 28, 28, 28, 0, 0, 28, 1, 1, 1, 17, 2, 8, 8, 8, + 4, 9, 9, 9, 5, 12, 12, 12, 1, 17, 2, 8, 8, 8, 12, 12, + 12, 18, 18, 18, 9, 9, 6, 7, 18, 18, 12, 12, 33, 33, 3, 12, + 12, 12, 20, 20, 8, 8, 4, 9, 20, 20, 6, 6, 18, 18, 9, 9, + 1, 1, 28, 4, 26, 26, 26, 0, 26, 26, 26, 26, 26, 26, 0, 0, + 0, 0, 2, 2, 26, 0, 0, 0, 30, 31, 0, 0, 11, 11, 11, 11, + 28, 0, 0, 0, 8, 8, 6, 12, 12, 12, 12, 1, 12, 12, 10, 10, + 10, 10, 12, 12, 12, 12, 10, 18, 18, 12, 12, 12, 12, 18, 12, 1, + 1, 2, 8, 8, 20, 9, 9, 9, 5, 0, 0, 0, 33, 33, 12, 12, + 10, 10, 10, 24, 9, 9, 9, 20, 20, 20, 20, 6, 1, 1, 17, 2, + 12, 12, 12, 4, 9, 18, 19, 19, 12, 9, 0, 12, 9, 9, 9, 19, + 19, 19, 19, 0, 20, 20, 0, 0, 0, 0, 12, 24, 23, 24, 23, 0, + 0, 2, 7, 0, 12, 8, 12, 12, 12, 12, 12, 20, 20, 20, 20, 9, + 24, 6, 0, 0, 4, 4, 4, 0, 0, 0, 0, 7, 1, 1, 2, 14, + 14, 8, 8, 8, 9, 9, 5, 0, 0, 0, 34, 34, 34, 34, 34, 34, + 34, 34, 33, 33, 0, 0, 0, 32, 1, 1, 2, 8, 9, 5, 4, 0, + 9, 9, 9, 7, 6, 0, 33, 33, 10, 12, 12, 12, 5, 3, 15, 15, + 0, 0, 4, 9, 0, 33, 33, 33, 33, 0, 0, 0, 1, 5, 4, 25, + 9, 4, 6, 0, 0, 0, 26, 26, 9, 9, 9, 1, 1, 2, 5, 4, + 1, 1, 2, 5, 4, 0, 0, 0, 9, 1, 2, 5, 2, 9, 9, 9, + 9, 9, 5, 4, 0, 19, 19, 19, 9, 9, 9, 6, +}; + +/* Indic_Syllabic_Category: 2448 bytes. 
*/ + +RE_UINT32 re_get_indic_syllabic_category(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 13; + code = ch ^ (f << 13); + pos = (RE_UINT32)re_indic_syllabic_category_stage_1[f] << 5; + f = code >> 8; + code ^= f << 8; + pos = (RE_UINT32)re_indic_syllabic_category_stage_2[pos + f] << 4; + f = code >> 4; + code ^= f << 4; + pos = (RE_UINT32)re_indic_syllabic_category_stage_3[pos + f] << 2; + f = code >> 2; + code ^= f << 2; + pos = (RE_UINT32)re_indic_syllabic_category_stage_4[pos + f] << 2; + value = re_indic_syllabic_category_stage_5[pos + code]; + + return value; +} + +/* Alphanumeric. */ + +static RE_UINT8 re_alphanumeric_stage_1[] = { + 0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, +}; + +static RE_UINT8 re_alphanumeric_stage_2[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 9, 10, 11, 7, 7, 7, 7, 12, 13, 13, 13, 13, 14, + 15, 16, 17, 18, 19, 13, 20, 13, 21, 13, 13, 13, 13, 22, 13, 13, + 13, 13, 13, 13, 13, 13, 23, 24, 13, 13, 25, 13, 13, 26, 27, 13, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 28, 7, 29, 30, 7, 31, 13, 13, 13, 13, 13, 32, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, +}; + +static RE_UINT8 re_alphanumeric_stage_3[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 1, 17, 18, 19, 1, 20, 21, 22, 23, 24, 25, 26, 27, 1, 28, + 29, 30, 31, 31, 32, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, 31, + 36, 37, 31, 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 38, 1, 1, 1, 1, 1, 1, 1, 1, 1, 39, + 1, 1, 1, 1, 40, 1, 41, 42, 43, 44, 45, 46, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 47, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 1, 48, 49, 1, 50, 51, 52, 53, 54, 55, 56, 57, 58, 1, 59, + 60, 61, 62, 63, 64, 31, 31, 31, 65, 66, 67, 68, 69, 70, 71, 72, + 73, 31, 74, 31, 31, 31, 31, 31, 1, 1, 1, 75, 76, 77, 31, 31, + 1, 1, 1, 1, 78, 31, 31, 31, 31, 31, 31, 31, 1, 1, 79, 
31, + 1, 1, 80, 81, 31, 31, 31, 82, 83, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 84, 31, 31, 31, 31, 31, 31, 31, 85, 86, 87, 88, + 89, 31, 31, 31, 31, 31, 90, 31, 31, 91, 31, 31, 31, 31, 31, 31, + 1, 1, 1, 1, 1, 1, 92, 1, 1, 1, 1, 1, 1, 1, 1, 93, + 94, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 95, 31, + 1, 1, 96, 31, 31, 31, 31, 31, +}; + +static RE_UINT8 re_alphanumeric_stage_4[] = { + 0, 1, 2, 2, 0, 3, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 6, 7, 0, 0, 8, 9, 10, 11, 5, 12, + 5, 5, 5, 5, 13, 5, 5, 5, 5, 14, 15, 16, 17, 18, 19, 20, + 21, 5, 22, 23, 5, 5, 24, 25, 26, 5, 27, 5, 5, 28, 5, 29, + 30, 31, 32, 0, 0, 33, 0, 34, 5, 35, 36, 37, 38, 39, 40, 41, + 42, 43, 44, 45, 46, 47, 48, 49, 50, 47, 51, 52, 53, 54, 55, 56, + 57, 58, 59, 60, 61, 62, 63, 64, 61, 65, 66, 67, 68, 69, 70, 71, + 16, 72, 73, 0, 74, 75, 76, 0, 77, 78, 79, 80, 81, 82, 0, 0, + 5, 83, 84, 85, 86, 5, 87, 88, 5, 5, 89, 5, 90, 91, 92, 5, + 93, 5, 94, 0, 95, 5, 5, 96, 16, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 97, 2, 5, 5, 98, 99, 100, 100, 101, 5, 102, 103, 78, + 1, 5, 5, 104, 5, 105, 5, 106, 107, 108, 109, 110, 5, 111, 112, 0, + 113, 5, 107, 114, 112, 115, 0, 0, 5, 116, 117, 0, 5, 118, 5, 119, + 5, 106, 120, 121, 0, 0, 0, 122, 5, 5, 5, 5, 5, 5, 0, 123, + 96, 5, 124, 121, 5, 125, 126, 127, 0, 0, 0, 128, 129, 0, 0, 0, + 130, 131, 132, 5, 133, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 134, 5, 78, 5, 135, 107, 5, 5, 5, 5, 136, + 5, 87, 5, 137, 138, 139, 139, 5, 0, 140, 0, 0, 0, 0, 0, 0, + 141, 142, 16, 5, 143, 16, 5, 88, 144, 145, 5, 5, 146, 72, 0, 26, + 5, 5, 5, 5, 5, 106, 0, 0, 5, 5, 5, 5, 5, 5, 106, 0, + 5, 5, 5, 5, 31, 0, 26, 121, 147, 148, 5, 149, 5, 5, 5, 95, + 150, 151, 5, 5, 152, 153, 0, 150, 154, 17, 5, 100, 5, 5, 155, 156, + 5, 105, 157, 82, 5, 158, 159, 160, 5, 138, 161, 162, 5, 107, 163, 164, + 165, 166, 88, 167, 5, 5, 5, 168, 5, 5, 5, 5, 5, 169, 170, 113, + 5, 5, 5, 171, 5, 5, 172, 0, 173, 174, 175, 5, 5, 28, 176, 5, + 5, 121, 26, 5, 177, 5, 17, 178, 0, 0, 0, 179, 5, 5, 5, 82, + 
1, 2, 2, 109, 5, 107, 180, 0, 181, 182, 183, 0, 5, 5, 5, 72, + 0, 0, 5, 33, 0, 0, 0, 0, 0, 0, 0, 0, 82, 5, 184, 0, + 5, 26, 105, 72, 121, 5, 185, 0, 5, 5, 5, 5, 121, 78, 0, 0, + 5, 186, 5, 187, 0, 0, 0, 0, 5, 138, 106, 17, 0, 0, 0, 0, + 188, 189, 106, 138, 107, 0, 0, 190, 106, 172, 0, 0, 5, 191, 0, 0, + 192, 100, 0, 82, 82, 0, 79, 193, 5, 106, 106, 157, 28, 0, 0, 0, + 5, 5, 133, 0, 5, 157, 5, 157, 5, 5, 194, 56, 151, 32, 26, 195, + 5, 196, 26, 197, 5, 5, 198, 0, 199, 200, 0, 0, 201, 202, 5, 195, + 38, 47, 203, 187, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 204, 0, + 0, 0, 0, 0, 5, 205, 206, 0, 5, 107, 207, 0, 5, 106, 78, 0, + 208, 168, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 209, + 0, 0, 0, 0, 0, 0, 5, 32, 5, 5, 5, 5, 172, 0, 0, 0, + 5, 5, 5, 146, 5, 5, 5, 5, 5, 5, 187, 0, 0, 0, 0, 0, + 5, 146, 0, 0, 0, 0, 0, 0, 5, 5, 210, 0, 0, 0, 0, 0, + 5, 32, 107, 78, 0, 0, 26, 211, 5, 138, 155, 212, 95, 0, 0, 0, + 5, 5, 213, 107, 176, 0, 0, 0, 214, 0, 0, 0, 0, 0, 0, 0, + 5, 5, 5, 215, 216, 0, 0, 0, 5, 5, 217, 5, 218, 219, 220, 5, + 221, 222, 223, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 224, 225, 88, + 217, 217, 135, 135, 226, 226, 227, 5, 5, 5, 5, 5, 5, 5, 193, 0, + 220, 228, 229, 230, 231, 232, 0, 0, 0, 26, 84, 84, 78, 0, 0, 0, + 5, 5, 5, 5, 5, 5, 138, 0, 5, 33, 5, 5, 5, 5, 5, 5, + 121, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 214, 0, 0, + 121, 0, 0, 0, 0, 0, 0, 0, +}; + +static RE_UINT8 re_alphanumeric_stage_5[] = { + 0, 0, 0, 0, 0, 0, 255, 3, 254, 255, 255, 7, 0, 4, 32, 4, + 255, 255, 127, 255, 255, 255, 255, 255, 195, 255, 3, 0, 31, 80, 0, 0, + 32, 0, 0, 0, 0, 0, 223, 188, 64, 215, 255, 255, 251, 255, 255, 255, + 255, 255, 191, 255, 3, 252, 255, 255, 255, 255, 254, 255, 255, 255, 127, 2, + 254, 255, 255, 255, 255, 0, 0, 0, 0, 0, 255, 191, 182, 0, 255, 255, + 255, 7, 7, 0, 0, 0, 255, 7, 255, 255, 255, 254, 255, 195, 255, 255, + 255, 255, 239, 31, 254, 225, 255, 159, 0, 0, 255, 255, 0, 224, 255, 255, + 255, 255, 3, 0, 255, 7, 48, 4, 255, 255, 255, 252, 255, 31, 0, 0, + 255, 255, 255, 1, 255, 255, 31, 
0, 248, 3, 255, 255, 255, 255, 255, 239, + 255, 223, 225, 255, 207, 255, 254, 255, 239, 159, 249, 255, 255, 253, 197, 227, + 159, 89, 128, 176, 207, 255, 3, 0, 238, 135, 249, 255, 255, 253, 109, 195, + 135, 25, 2, 94, 192, 255, 63, 0, 238, 191, 251, 255, 255, 253, 237, 227, + 191, 27, 1, 0, 207, 255, 0, 2, 238, 159, 249, 255, 159, 25, 192, 176, + 207, 255, 2, 0, 236, 199, 61, 214, 24, 199, 255, 195, 199, 29, 129, 0, + 192, 255, 0, 0, 239, 223, 253, 255, 255, 253, 255, 227, 223, 29, 96, 7, + 207, 255, 0, 0, 238, 223, 253, 255, 255, 253, 239, 227, 223, 29, 96, 64, + 207, 255, 6, 0, 255, 255, 255, 231, 223, 93, 128, 128, 207, 255, 0, 252, + 236, 255, 127, 252, 255, 255, 251, 47, 127, 128, 95, 255, 192, 255, 12, 0, + 255, 255, 255, 7, 127, 32, 255, 3, 150, 37, 240, 254, 174, 236, 255, 59, + 95, 32, 255, 243, 1, 0, 0, 0, 255, 3, 0, 0, 255, 254, 255, 255, + 255, 31, 254, 255, 3, 255, 255, 254, 255, 255, 255, 31, 255, 255, 127, 249, + 255, 3, 255, 255, 231, 193, 255, 255, 127, 64, 255, 51, 191, 32, 255, 255, + 255, 255, 255, 247, 255, 61, 127, 61, 255, 61, 255, 255, 255, 255, 61, 127, + 61, 255, 127, 255, 255, 255, 61, 255, 255, 255, 255, 135, 255, 255, 0, 0, + 255, 255, 63, 63, 255, 159, 255, 255, 255, 199, 255, 1, 255, 223, 15, 0, + 255, 255, 15, 0, 255, 223, 13, 0, 255, 255, 207, 255, 255, 1, 128, 16, + 255, 255, 255, 0, 255, 7, 255, 255, 255, 255, 63, 0, 255, 255, 255, 127, + 255, 15, 255, 1, 192, 255, 255, 255, 255, 63, 31, 0, 255, 15, 255, 255, + 255, 3, 255, 3, 255, 255, 255, 15, 254, 255, 31, 0, 128, 0, 0, 0, + 255, 255, 239, 255, 239, 15, 255, 3, 255, 243, 255, 255, 191, 255, 3, 0, + 255, 227, 255, 255, 255, 255, 255, 63, 0, 222, 111, 0, 128, 255, 31, 0, + 63, 63, 255, 170, 255, 255, 223, 95, 220, 31, 207, 15, 255, 31, 220, 31, + 0, 0, 2, 128, 0, 0, 255, 31, 132, 252, 47, 62, 80, 189, 255, 243, + 224, 67, 0, 0, 255, 1, 0, 0, 0, 0, 192, 255, 255, 127, 255, 255, + 31, 120, 12, 0, 255, 128, 0, 0, 255, 255, 127, 0, 127, 127, 127, 127, + 0, 128, 0, 0, 224, 0, 0, 0, 
254, 3, 62, 31, 255, 255, 127, 224, + 224, 255, 255, 255, 255, 63, 254, 255, 255, 127, 0, 0, 255, 31, 255, 255, + 255, 15, 0, 0, 255, 127, 240, 143, 0, 0, 128, 255, 252, 255, 255, 255, + 255, 249, 255, 255, 255, 63, 255, 0, 187, 247, 255, 255, 15, 0, 255, 3, + 0, 0, 252, 40, 255, 255, 7, 0, 255, 255, 247, 255, 0, 128, 255, 3, + 223, 255, 255, 127, 255, 63, 255, 3, 255, 255, 127, 196, 5, 0, 0, 56, + 255, 255, 60, 0, 126, 126, 126, 0, 127, 127, 255, 255, 63, 0, 255, 255, + 255, 7, 255, 3, 15, 0, 255, 255, 127, 248, 255, 255, 255, 63, 255, 255, + 255, 255, 255, 3, 127, 0, 248, 224, 255, 253, 127, 95, 219, 255, 255, 255, + 0, 0, 248, 255, 255, 255, 252, 255, 0, 0, 255, 15, 0, 0, 223, 255, + 252, 252, 252, 28, 255, 239, 255, 255, 127, 255, 255, 183, 255, 63, 255, 63, + 255, 255, 1, 0, 15, 255, 62, 0, 255, 0, 255, 255, 15, 0, 0, 0, + 63, 253, 255, 255, 255, 255, 191, 145, 255, 255, 55, 0, 255, 255, 255, 192, + 111, 240, 239, 254, 31, 0, 0, 0, 63, 0, 0, 0, 255, 1, 255, 3, + 255, 255, 199, 255, 255, 255, 71, 0, 30, 0, 255, 23, 255, 255, 251, 255, + 255, 255, 159, 0, 127, 189, 255, 191, 255, 1, 255, 255, 159, 25, 129, 224, + 179, 0, 255, 3, 255, 255, 63, 127, 0, 0, 0, 63, 17, 0, 255, 3, + 255, 255, 255, 227, 255, 3, 0, 128, 127, 0, 0, 0, 255, 63, 0, 0, + 248, 255, 255, 224, 31, 0, 255, 255, 3, 0, 0, 0, 255, 7, 255, 31, + 255, 1, 255, 67, 255, 255, 223, 255, 255, 255, 255, 223, 100, 222, 255, 235, + 239, 255, 255, 255, 191, 231, 223, 223, 255, 255, 255, 123, 95, 252, 253, 255, + 63, 255, 255, 255, 253, 255, 255, 247, 255, 253, 255, 255, 247, 207, 255, 255, + 150, 254, 247, 10, 132, 234, 150, 170, 150, 247, 247, 94, 255, 251, 255, 15, + 238, 251, 255, 15, +}; + +/* Alphanumeric: 2117 bytes. 
*/ + +RE_UINT32 re_get_alphanumeric(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 16; + code = ch ^ (f << 16); + pos = (RE_UINT32)re_alphanumeric_stage_1[f] << 5; + f = code >> 11; + code ^= f << 11; + pos = (RE_UINT32)re_alphanumeric_stage_2[pos + f] << 3; + f = code >> 8; + code ^= f << 8; + pos = (RE_UINT32)re_alphanumeric_stage_3[pos + f] << 3; + f = code >> 5; + code ^= f << 5; + pos = (RE_UINT32)re_alphanumeric_stage_4[pos + f] << 5; + pos += code; + value = (re_alphanumeric_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Any. */ + +RE_UINT32 re_get_any(RE_UINT32 ch) { + return 1; +} + +/* Blank. */ + +static RE_UINT8 re_blank_stage_1[] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, +}; + +static RE_UINT8 re_blank_stage_2[] = { + 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +}; + +static RE_UINT8 re_blank_stage_3[] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, + 3, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +}; + +static RE_UINT8 re_blank_stage_4[] = { + 0, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 3, 1, 1, 1, 1, 1, 4, 5, 1, 1, 1, 1, 1, 1, + 3, 1, 1, 1, 1, 1, 1, 1, +}; + +static RE_UINT8 re_blank_stage_5[] = { + 0, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, + 255, 7, 0, 0, 0, 128, 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, +}; + +/* Blank: 169 bytes. 
*/ + +RE_UINT32 re_get_blank(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 16; + code = ch ^ (f << 16); + pos = (RE_UINT32)re_blank_stage_1[f] << 3; + f = code >> 13; + code ^= f << 13; + pos = (RE_UINT32)re_blank_stage_2[pos + f] << 4; + f = code >> 9; + code ^= f << 9; + pos = (RE_UINT32)re_blank_stage_3[pos + f] << 3; + f = code >> 6; + code ^= f << 6; + pos = (RE_UINT32)re_blank_stage_4[pos + f] << 6; + pos += code; + value = (re_blank_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Graph. */ + +static RE_UINT8 re_graph_stage_1[] = { + 0, 1, 2, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 6, 4, 8, + 4, 8, +}; + +static RE_UINT8 re_graph_stage_2[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 9, 10, 11, 7, 7, 7, 7, 12, 13, 7, 7, 7, 14, + 15, 16, 17, 18, 19, 13, 20, 13, 21, 13, 13, 13, 13, 22, 13, 13, + 13, 13, 13, 13, 13, 13, 23, 24, 13, 13, 25, 26, 13, 27, 28, 29, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 30, 7, 31, 32, 7, 33, 13, 13, 13, 13, 13, 34, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 35, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 36, +}; + +static RE_UINT8 re_graph_stage_3[] = { + 0, 1, 1, 2, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 1, 15, 16, 1, 1, 17, 18, 19, 20, 21, 22, 23, 24, 1, 25, + 26, 27, 1, 28, 29, 1, 1, 1, 1, 1, 1, 30, 31, 32, 33, 34, + 35, 36, 37, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 38, 1, 1, 1, 1, 1, 1, 1, 1, 1, 39, + 1, 1, 1, 1, 40, 1, 41, 42, 43, 44, 45, 46, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 47, 48, 48, 48, 48, 48, 48, 48, 48, + 1, 1, 49, 50, 1, 51, 52, 53, 54, 55, 56, 57, 58, 59, 1, 60, + 61, 62, 63, 64, 65, 48, 66, 48, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 48, 76, 48, 48, 48, 48, 48, 1, 1, 1, 77, 78, 79, 48, 48, + 1, 1, 1, 1, 80, 48, 48, 48, 48, 48, 48, 48, 1, 1, 81, 48, + 1, 
1, 82, 83, 48, 48, 48, 84, 85, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 86, 48, 48, 48, 87, 88, 89, 90, 91, 92, 93, 94, + 1, 1, 95, 48, 48, 48, 48, 48, 96, 48, 48, 48, 48, 48, 97, 48, + 98, 99, 100, 1, 1, 101, 102, 103, 104, 105, 48, 48, 48, 48, 48, 48, + 1, 1, 1, 1, 1, 1, 106, 1, 1, 1, 1, 1, 1, 1, 1, 107, + 108, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 109, 48, + 1, 1, 110, 48, 48, 48, 48, 48, 111, 112, 48, 48, 48, 48, 48, 48, + 1, 1, 1, 1, 1, 1, 1, 113, +}; + +static RE_UINT8 re_graph_stage_4[] = { + 0, 1, 2, 3, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 4, 5, 6, 2, 2, 2, 7, 8, 1, 9, 2, 10, 11, + 12, 2, 2, 2, 2, 2, 2, 2, 13, 2, 14, 2, 2, 15, 2, 16, + 2, 17, 18, 0, 0, 19, 0, 20, 2, 2, 2, 2, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, 33, 30, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, 44, 48, 49, 50, 51, 52, 53, 54, + 1, 55, 56, 0, 57, 58, 59, 0, 2, 2, 60, 61, 62, 12, 63, 0, + 2, 2, 2, 2, 2, 2, 64, 2, 2, 2, 65, 2, 66, 67, 68, 2, + 69, 2, 48, 70, 71, 2, 2, 72, 2, 2, 2, 2, 73, 2, 2, 74, + 75, 76, 77, 78, 2, 2, 79, 80, 81, 2, 2, 82, 2, 83, 2, 84, + 3, 85, 86, 87, 2, 88, 89, 2, 90, 2, 3, 91, 80, 17, 0, 0, + 2, 2, 88, 70, 2, 2, 2, 92, 2, 93, 94, 2, 0, 0, 10, 95, + 2, 2, 2, 2, 2, 2, 2, 96, 72, 2, 97, 79, 2, 98, 99, 100, + 101, 102, 3, 103, 104, 3, 105, 106, 2, 2, 2, 2, 88, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 16, 2, 107, 108, 2, 2, 2, 2, 2, + 2, 2, 2, 109, 110, 111, 112, 113, 2, 114, 3, 2, 2, 2, 2, 115, + 2, 64, 2, 116, 76, 117, 117, 2, 2, 2, 118, 0, 119, 2, 2, 77, + 2, 2, 2, 2, 2, 2, 84, 120, 1, 2, 1, 2, 8, 2, 2, 2, + 121, 122, 2, 2, 114, 16, 2, 123, 3, 2, 2, 2, 2, 2, 2, 3, + 2, 2, 2, 2, 2, 84, 2, 2, 2, 2, 2, 2, 2, 2, 84, 0, + 2, 2, 2, 2, 124, 2, 125, 2, 2, 126, 2, 2, 2, 2, 2, 82, + 2, 2, 2, 2, 2, 127, 0, 128, 2, 129, 2, 82, 2, 2, 130, 79, + 2, 2, 131, 70, 2, 2, 132, 3, 2, 76, 133, 2, 2, 2, 134, 76, + 135, 136, 2, 137, 2, 2, 2, 138, 2, 2, 2, 2, 2, 123, 139, 56, + 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 140, 2, 2, 71, 0, + 141, 142, 143, 2, 2, 2, 144, 2, 
2, 2, 105, 2, 145, 2, 146, 147, + 71, 2, 148, 149, 2, 2, 2, 91, 1, 2, 2, 2, 2, 3, 150, 151, + 152, 153, 154, 0, 2, 2, 2, 16, 155, 156, 2, 2, 157, 158, 105, 79, + 0, 0, 0, 0, 70, 2, 106, 56, 2, 123, 83, 16, 159, 2, 160, 0, + 2, 2, 2, 2, 79, 161, 0, 0, 2, 10, 2, 162, 0, 0, 0, 0, + 2, 76, 84, 146, 0, 0, 0, 0, 163, 164, 165, 2, 3, 166, 0, 167, + 168, 169, 0, 0, 2, 170, 145, 2, 171, 172, 173, 2, 2, 0, 2, 174, + 2, 175, 110, 176, 177, 178, 0, 0, 2, 2, 179, 0, 2, 180, 2, 181, + 0, 0, 0, 3, 0, 0, 0, 0, 2, 2, 182, 183, 2, 2, 184, 185, + 2, 98, 123, 76, 2, 2, 140, 186, 187, 79, 0, 0, 188, 189, 2, 190, + 21, 30, 191, 192, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 193, 0, + 0, 0, 0, 0, 2, 110, 79, 0, 2, 2, 194, 0, 2, 82, 161, 0, + 111, 88, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 195, + 0, 0, 0, 0, 0, 0, 2, 74, 2, 2, 2, 2, 71, 0, 0, 0, + 2, 2, 2, 196, 2, 2, 2, 2, 2, 2, 197, 0, 0, 0, 0, 0, + 2, 198, 0, 0, 0, 0, 0, 0, 2, 2, 107, 0, 0, 0, 0, 0, + 2, 74, 3, 199, 0, 0, 105, 200, 2, 2, 201, 202, 203, 0, 0, 0, + 2, 2, 204, 3, 205, 0, 0, 0, 206, 0, 0, 0, 0, 0, 0, 0, + 2, 2, 2, 207, 208, 197, 0, 0, 2, 2, 2, 2, 2, 2, 2, 84, + 2, 209, 2, 2, 2, 2, 2, 179, 2, 2, 210, 0, 0, 0, 0, 0, + 2, 2, 76, 15, 0, 0, 0, 0, 2, 2, 98, 2, 12, 211, 212, 2, + 213, 214, 215, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 216, 2, 2, + 2, 2, 2, 2, 2, 2, 217, 2, 2, 2, 2, 2, 218, 219, 0, 0, + 2, 2, 2, 2, 2, 2, 220, 0, 212, 221, 222, 223, 224, 225, 0, 226, + 2, 88, 2, 2, 77, 227, 228, 84, 124, 114, 2, 88, 16, 0, 0, 229, + 230, 16, 231, 0, 0, 0, 0, 0, 2, 2, 2, 119, 2, 212, 2, 2, + 2, 2, 2, 2, 2, 2, 106, 232, 2, 2, 2, 77, 2, 2, 19, 0, + 88, 2, 193, 2, 10, 233, 0, 0, 234, 0, 0, 0, 235, 0, 158, 0, + 2, 2, 2, 2, 2, 2, 76, 0, 2, 19, 2, 2, 2, 2, 2, 2, + 79, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 206, 0, 0, + 79, 0, 0, 0, 0, 0, 0, 0, 236, 2, 2, 2, 0, 0, 0, 0, + 2, 2, 2, 2, 2, 2, 2, 203, 2, 2, 2, 2, 2, 2, 2, 79, +}; + +static RE_UINT8 re_graph_stage_5[] = { + 0, 0, 0, 0, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 127, + 255, 255, 255, 252, 240, 
215, 255, 255, 251, 255, 255, 255, 255, 255, 254, 255, + 255, 255, 127, 254, 255, 230, 254, 255, 255, 0, 255, 255, 255, 7, 31, 0, + 255, 255, 255, 223, 255, 191, 255, 255, 255, 231, 255, 255, 255, 255, 3, 0, + 255, 255, 255, 7, 255, 63, 255, 127, 255, 255, 255, 79, 255, 255, 31, 0, + 248, 255, 255, 255, 239, 159, 249, 255, 255, 253, 197, 243, 159, 121, 128, 176, + 207, 255, 255, 15, 238, 135, 249, 255, 255, 253, 109, 211, 135, 57, 2, 94, + 192, 255, 63, 0, 238, 191, 251, 255, 255, 253, 237, 243, 191, 59, 1, 0, + 207, 255, 3, 2, 238, 159, 249, 255, 159, 57, 192, 176, 207, 255, 255, 0, + 236, 199, 61, 214, 24, 199, 255, 195, 199, 61, 129, 0, 192, 255, 255, 7, + 239, 223, 253, 255, 255, 253, 255, 227, 223, 61, 96, 7, 207, 255, 0, 255, + 238, 223, 253, 255, 255, 253, 239, 243, 223, 61, 96, 64, 207, 255, 6, 0, + 255, 255, 255, 231, 223, 125, 128, 128, 207, 255, 63, 254, 236, 255, 127, 252, + 255, 255, 251, 47, 127, 132, 95, 255, 192, 255, 28, 0, 255, 255, 255, 135, + 255, 255, 255, 15, 150, 37, 240, 254, 174, 236, 255, 59, 95, 63, 255, 243, + 255, 254, 255, 255, 255, 31, 254, 255, 255, 255, 255, 254, 255, 223, 255, 7, + 191, 32, 255, 255, 255, 61, 127, 61, 255, 61, 255, 255, 255, 255, 61, 127, + 61, 255, 127, 255, 255, 255, 61, 255, 255, 255, 255, 31, 255, 255, 255, 3, + 255, 255, 63, 63, 254, 255, 255, 31, 255, 255, 255, 1, 255, 223, 31, 0, + 255, 255, 127, 0, 255, 255, 15, 0, 255, 223, 13, 0, 255, 255, 255, 63, + 255, 3, 255, 3, 255, 127, 255, 3, 255, 255, 255, 0, 255, 7, 255, 255, + 255, 255, 63, 0, 255, 15, 255, 15, 241, 255, 255, 255, 255, 63, 31, 0, + 255, 15, 255, 255, 255, 3, 255, 199, 255, 255, 255, 207, 255, 255, 255, 159, + 255, 255, 15, 240, 255, 255, 255, 248, 255, 227, 255, 255, 255, 255, 127, 3, + 255, 255, 63, 240, 63, 63, 255, 170, 255, 255, 223, 255, 223, 255, 207, 239, + 255, 255, 220, 127, 0, 248, 255, 255, 255, 124, 255, 255, 223, 255, 243, 255, + 255, 127, 255, 31, 0, 0, 255, 255, 255, 255, 1, 0, 127, 0, 0, 0, + 255, 7, 0, 0, 255, 255, 207, 255, 
255, 255, 63, 255, 255, 255, 255, 227, + 255, 253, 3, 0, 0, 240, 0, 0, 255, 127, 255, 255, 255, 255, 15, 254, + 255, 128, 1, 128, 127, 127, 127, 127, 7, 0, 0, 0, 255, 255, 255, 251, + 0, 0, 255, 15, 224, 255, 255, 255, 255, 63, 254, 255, 15, 0, 255, 255, + 255, 31, 255, 255, 127, 0, 255, 255, 255, 15, 0, 0, 255, 63, 255, 0, + 0, 0, 128, 255, 255, 15, 255, 3, 31, 192, 255, 3, 255, 255, 15, 128, + 255, 191, 255, 195, 255, 63, 255, 243, 7, 0, 0, 248, 126, 126, 126, 0, + 127, 127, 255, 255, 63, 0, 255, 255, 255, 63, 255, 3, 127, 248, 255, 255, + 255, 63, 255, 255, 127, 0, 248, 224, 255, 255, 127, 95, 219, 255, 255, 255, + 3, 0, 248, 255, 255, 255, 252, 255, 255, 0, 0, 0, 0, 0, 255, 63, + 255, 255, 247, 255, 127, 15, 223, 255, 252, 252, 252, 28, 127, 127, 0, 62, + 255, 239, 255, 255, 127, 255, 255, 183, 255, 63, 255, 63, 135, 255, 255, 255, + 255, 255, 143, 255, 255, 31, 255, 15, 1, 0, 0, 0, 255, 255, 255, 191, + 15, 255, 63, 0, 255, 3, 0, 0, 15, 128, 0, 0, 63, 253, 255, 255, + 255, 255, 191, 145, 255, 255, 191, 255, 128, 255, 0, 0, 255, 255, 55, 248, + 255, 255, 255, 143, 255, 255, 255, 131, 255, 255, 255, 240, 111, 240, 239, 254, + 255, 255, 15, 135, 255, 0, 255, 1, 127, 248, 127, 0, 255, 255, 63, 254, + 255, 255, 7, 255, 255, 255, 3, 30, 0, 254, 0, 0, 255, 1, 0, 0, + 255, 255, 7, 0, 255, 255, 7, 252, 255, 63, 252, 255, 255, 255, 0, 128, + 3, 0, 255, 255, 255, 1, 255, 3, 254, 255, 31, 0, 255, 255, 251, 255, + 127, 189, 255, 191, 255, 3, 255, 255, 255, 7, 255, 3, 159, 57, 129, 224, + 207, 31, 31, 0, 255, 0, 255, 3, 31, 0, 255, 3, 255, 255, 7, 128, + 255, 127, 31, 0, 15, 0, 0, 0, 255, 127, 0, 0, 255, 195, 0, 0, + 255, 63, 63, 0, 63, 0, 255, 251, 251, 255, 255, 224, 255, 255, 0, 0, + 31, 0, 255, 255, 0, 128, 255, 255, 3, 0, 0, 0, 255, 7, 255, 31, + 255, 1, 255, 243, 127, 254, 255, 255, 63, 0, 0, 0, 100, 222, 255, 235, + 239, 255, 255, 255, 191, 231, 223, 223, 255, 255, 255, 123, 95, 252, 253, 255, + 63, 255, 255, 255, 255, 207, 255, 255, 255, 15, 0, 248, 254, 255, 0, 0, 
+ 159, 255, 127, 0, 150, 254, 247, 10, 132, 234, 150, 170, 150, 247, 247, 94, + 255, 251, 255, 15, 238, 251, 255, 15, 0, 0, 3, 0, 255, 127, 254, 255, + 254, 255, 254, 255, 192, 255, 255, 255, 7, 0, 255, 255, 255, 1, 3, 0, + 255, 31, 15, 0, 255, 63, 0, 0, 0, 0, 255, 1, 31, 0, 0, 0, + 2, 0, 0, 0, +}; + +/* Graph: 2334 bytes. */ + +RE_UINT32 re_get_graph(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 15; + code = ch ^ (f << 15); + pos = (RE_UINT32)re_graph_stage_1[f] << 4; + f = code >> 11; + code ^= f << 11; + pos = (RE_UINT32)re_graph_stage_2[pos + f] << 3; + f = code >> 8; + code ^= f << 8; + pos = (RE_UINT32)re_graph_stage_3[pos + f] << 3; + f = code >> 5; + code ^= f << 5; + pos = (RE_UINT32)re_graph_stage_4[pos + f] << 5; + pos += code; + value = (re_graph_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Print. */ + +static RE_UINT8 re_print_stage_1[] = { + 0, 1, 2, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 6, 4, 8, + 4, 8, +}; + +static RE_UINT8 re_print_stage_2[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 9, 10, 11, 7, 7, 7, 7, 12, 13, 7, 7, 7, 14, + 15, 16, 17, 18, 19, 13, 20, 13, 21, 13, 13, 13, 13, 22, 13, 13, + 13, 13, 13, 13, 13, 13, 23, 24, 13, 13, 25, 26, 13, 27, 28, 29, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 30, 7, 31, 32, 7, 33, 13, 13, 13, 13, 13, 34, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 35, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 36, +}; + +static RE_UINT8 re_print_stage_3[] = { + 0, 1, 1, 2, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 1, 15, 16, 1, 1, 17, 18, 19, 20, 21, 22, 23, 24, 1, 25, + 26, 27, 1, 28, 29, 1, 1, 1, 1, 1, 1, 30, 31, 32, 33, 34, + 35, 36, 37, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 38, 1, 1, 1, 1, 1, 1, 1, 1, 1, 39, + 1, 1, 1, 1, 40, 1, 41, 42, 43, 44, 45, 46, 1, 
1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 47, 48, 48, 48, 48, 48, 48, 48, 48, + 1, 1, 49, 50, 1, 51, 52, 53, 54, 55, 56, 57, 58, 59, 1, 60, + 61, 62, 63, 64, 65, 48, 66, 48, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 48, 76, 48, 48, 48, 48, 48, 1, 1, 1, 77, 78, 79, 48, 48, + 1, 1, 1, 1, 80, 48, 48, 48, 48, 48, 48, 48, 1, 1, 81, 48, + 1, 1, 82, 83, 48, 48, 48, 84, 85, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 86, 48, 48, 48, 87, 88, 89, 90, 91, 92, 93, 94, + 1, 1, 95, 48, 48, 48, 48, 48, 96, 48, 48, 48, 48, 48, 97, 48, + 98, 99, 100, 1, 1, 101, 102, 103, 104, 105, 48, 48, 48, 48, 48, 48, + 1, 1, 1, 1, 1, 1, 106, 1, 1, 1, 1, 1, 1, 1, 1, 107, + 108, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 109, 48, + 1, 1, 110, 48, 48, 48, 48, 48, 111, 112, 48, 48, 48, 48, 48, 48, + 1, 1, 1, 1, 1, 1, 1, 113, +}; + +static RE_UINT8 re_print_stage_4[] = { + 0, 1, 1, 2, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 3, 4, 5, 1, 1, 1, 6, 7, 8, 9, 1, 10, 11, + 12, 1, 1, 1, 1, 1, 1, 1, 13, 1, 14, 1, 1, 15, 1, 16, + 1, 17, 18, 0, 0, 19, 0, 20, 1, 1, 1, 1, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, 33, 30, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, 44, 48, 49, 50, 51, 52, 53, 54, + 8, 55, 56, 0, 57, 58, 59, 0, 1, 1, 60, 61, 62, 12, 63, 0, + 1, 1, 1, 1, 1, 1, 64, 1, 1, 1, 65, 1, 66, 67, 68, 1, + 69, 1, 48, 70, 71, 1, 1, 72, 1, 1, 1, 1, 70, 1, 1, 73, + 74, 75, 76, 77, 1, 1, 78, 79, 80, 1, 1, 81, 1, 82, 1, 83, + 2, 84, 85, 86, 1, 87, 88, 1, 89, 1, 2, 90, 79, 17, 0, 0, + 1, 1, 87, 70, 1, 1, 1, 91, 1, 92, 93, 1, 0, 0, 10, 94, + 1, 1, 1, 1, 1, 1, 1, 95, 72, 1, 96, 78, 1, 97, 98, 99, + 1, 100, 1, 101, 102, 2, 103, 104, 1, 1, 1, 1, 87, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 16, 1, 105, 106, 1, 1, 1, 1, 1, + 1, 1, 1, 107, 108, 109, 110, 111, 1, 112, 2, 1, 1, 1, 1, 113, + 1, 64, 1, 114, 75, 115, 115, 1, 1, 1, 116, 0, 117, 1, 1, 76, + 1, 1, 1, 1, 1, 1, 83, 118, 1, 1, 8, 1, 7, 1, 1, 1, + 119, 120, 1, 1, 112, 16, 1, 121, 2, 1, 1, 1, 1, 1, 1, 2, + 1, 1, 1, 1, 1, 83, 1, 1, 1, 1, 1, 1, 1, 1, 83, 0, + 1, 1, 
1, 1, 122, 1, 123, 1, 1, 124, 1, 1, 1, 1, 1, 81, + 1, 1, 1, 1, 1, 125, 0, 126, 1, 127, 1, 81, 1, 1, 128, 78, + 1, 1, 129, 70, 1, 1, 130, 2, 1, 75, 131, 1, 1, 1, 132, 75, + 133, 134, 1, 135, 1, 1, 1, 136, 1, 1, 1, 1, 1, 121, 137, 56, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 138, 1, 1, 71, 0, + 139, 140, 141, 1, 1, 1, 142, 1, 1, 1, 103, 1, 143, 1, 144, 145, + 71, 1, 146, 147, 1, 1, 1, 90, 8, 1, 1, 1, 1, 2, 148, 149, + 150, 151, 152, 0, 1, 1, 1, 16, 153, 154, 1, 1, 155, 156, 103, 78, + 0, 0, 0, 0, 70, 1, 104, 56, 1, 121, 82, 16, 157, 1, 158, 0, + 1, 1, 1, 1, 78, 159, 0, 0, 1, 10, 1, 160, 0, 0, 0, 0, + 1, 75, 83, 144, 0, 0, 0, 0, 161, 162, 163, 1, 2, 164, 0, 165, + 166, 167, 0, 0, 1, 168, 143, 1, 169, 170, 171, 1, 1, 0, 1, 172, + 1, 173, 108, 174, 175, 176, 0, 0, 1, 1, 177, 0, 1, 178, 1, 179, + 0, 0, 0, 2, 0, 0, 0, 0, 1, 1, 180, 181, 1, 1, 182, 183, + 1, 97, 121, 75, 1, 1, 138, 184, 185, 78, 0, 0, 186, 187, 1, 188, + 21, 30, 189, 190, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 191, 0, + 0, 0, 0, 0, 1, 108, 78, 0, 1, 1, 192, 0, 1, 81, 159, 0, + 109, 87, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 193, + 0, 0, 0, 0, 0, 0, 1, 73, 1, 1, 1, 1, 71, 0, 0, 0, + 1, 1, 1, 194, 1, 1, 1, 1, 1, 1, 195, 0, 0, 0, 0, 0, + 1, 196, 0, 0, 0, 0, 0, 0, 1, 1, 105, 0, 0, 0, 0, 0, + 1, 73, 2, 197, 0, 0, 103, 198, 1, 1, 199, 200, 201, 0, 0, 0, + 1, 1, 202, 2, 203, 0, 0, 0, 204, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 205, 206, 195, 0, 0, 1, 1, 1, 1, 1, 1, 1, 83, + 1, 207, 1, 1, 1, 1, 1, 177, 1, 1, 208, 0, 0, 0, 0, 0, + 1, 1, 75, 15, 0, 0, 0, 0, 1, 1, 97, 1, 12, 209, 210, 1, + 211, 212, 213, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 214, 1, 1, + 1, 1, 1, 1, 1, 1, 215, 1, 1, 1, 1, 1, 216, 217, 0, 0, + 1, 1, 1, 1, 1, 1, 218, 0, 210, 219, 220, 221, 222, 223, 0, 224, + 1, 87, 1, 1, 76, 225, 226, 83, 122, 112, 1, 87, 16, 0, 0, 227, + 228, 16, 229, 0, 0, 0, 0, 0, 1, 1, 1, 117, 1, 210, 1, 1, + 1, 1, 1, 1, 1, 1, 104, 230, 1, 1, 1, 76, 1, 1, 19, 0, + 87, 1, 191, 1, 10, 231, 0, 0, 232, 0, 0, 0, 233, 0, 156, 0, + 1, 1, 1, 1, 1, 1, 75, 0, 1, 19, 1, 
1, 1, 1, 1, 1, + 78, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 204, 0, 0, + 78, 0, 0, 0, 0, 0, 0, 0, 234, 1, 1, 1, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 201, 1, 1, 1, 1, 1, 1, 1, 78, +}; + +static RE_UINT8 re_print_stage_5[] = { + 0, 0, 0, 0, 255, 255, 255, 255, 255, 255, 255, 127, 255, 255, 255, 252, + 240, 215, 255, 255, 251, 255, 255, 255, 255, 255, 254, 255, 255, 255, 127, 254, + 254, 255, 255, 255, 255, 230, 254, 255, 255, 0, 255, 255, 255, 7, 31, 0, + 255, 255, 255, 223, 255, 191, 255, 255, 255, 231, 255, 255, 255, 255, 3, 0, + 255, 255, 255, 7, 255, 63, 255, 127, 255, 255, 255, 79, 255, 255, 31, 0, + 248, 255, 255, 255, 239, 159, 249, 255, 255, 253, 197, 243, 159, 121, 128, 176, + 207, 255, 255, 15, 238, 135, 249, 255, 255, 253, 109, 211, 135, 57, 2, 94, + 192, 255, 63, 0, 238, 191, 251, 255, 255, 253, 237, 243, 191, 59, 1, 0, + 207, 255, 3, 2, 238, 159, 249, 255, 159, 57, 192, 176, 207, 255, 255, 0, + 236, 199, 61, 214, 24, 199, 255, 195, 199, 61, 129, 0, 192, 255, 255, 7, + 239, 223, 253, 255, 255, 253, 255, 227, 223, 61, 96, 7, 207, 255, 0, 255, + 238, 223, 253, 255, 255, 253, 239, 243, 223, 61, 96, 64, 207, 255, 6, 0, + 255, 255, 255, 231, 223, 125, 128, 128, 207, 255, 63, 254, 236, 255, 127, 252, + 255, 255, 251, 47, 127, 132, 95, 255, 192, 255, 28, 0, 255, 255, 255, 135, + 255, 255, 255, 15, 150, 37, 240, 254, 174, 236, 255, 59, 95, 63, 255, 243, + 255, 254, 255, 255, 255, 31, 254, 255, 255, 255, 255, 254, 255, 223, 255, 7, + 191, 32, 255, 255, 255, 61, 127, 61, 255, 61, 255, 255, 255, 255, 61, 127, + 61, 255, 127, 255, 255, 255, 61, 255, 255, 255, 255, 31, 255, 255, 255, 3, + 255, 255, 63, 63, 255, 255, 255, 1, 255, 223, 31, 0, 255, 255, 127, 0, + 255, 255, 15, 0, 255, 223, 13, 0, 255, 255, 255, 63, 255, 3, 255, 3, + 255, 127, 255, 3, 255, 255, 255, 0, 255, 7, 255, 255, 255, 255, 63, 0, + 255, 15, 255, 15, 241, 255, 255, 255, 255, 63, 31, 0, 255, 15, 255, 255, + 255, 3, 255, 199, 255, 255, 255, 207, 255, 255, 255, 159, 255, 255, 15, 240, + 255, 255, 255, 248, 
255, 227, 255, 255, 255, 255, 127, 3, 255, 255, 63, 240, + 63, 63, 255, 170, 255, 255, 223, 255, 223, 255, 207, 239, 255, 255, 220, 127, + 255, 252, 255, 255, 223, 255, 243, 255, 255, 127, 255, 31, 0, 0, 255, 255, + 255, 255, 1, 0, 127, 0, 0, 0, 255, 7, 0, 0, 255, 255, 207, 255, + 255, 255, 63, 255, 255, 255, 255, 227, 255, 253, 3, 0, 0, 240, 0, 0, + 255, 127, 255, 255, 255, 255, 15, 254, 255, 128, 1, 128, 127, 127, 127, 127, + 7, 0, 0, 0, 255, 255, 255, 251, 0, 0, 255, 15, 224, 255, 255, 255, + 255, 63, 254, 255, 15, 0, 255, 255, 255, 31, 255, 255, 127, 0, 255, 255, + 255, 15, 0, 0, 255, 63, 255, 0, 0, 0, 128, 255, 255, 15, 255, 3, + 31, 192, 255, 3, 255, 255, 15, 128, 255, 191, 255, 195, 255, 63, 255, 243, + 7, 0, 0, 248, 126, 126, 126, 0, 127, 127, 255, 255, 63, 0, 255, 255, + 255, 63, 255, 3, 127, 248, 255, 255, 255, 63, 255, 255, 127, 0, 248, 224, + 255, 255, 127, 95, 219, 255, 255, 255, 3, 0, 248, 255, 255, 255, 252, 255, + 255, 0, 0, 0, 0, 0, 255, 63, 255, 255, 247, 255, 127, 15, 223, 255, + 252, 252, 252, 28, 127, 127, 0, 62, 255, 239, 255, 255, 127, 255, 255, 183, + 255, 63, 255, 63, 135, 255, 255, 255, 255, 255, 143, 255, 255, 31, 255, 15, + 1, 0, 0, 0, 255, 255, 255, 191, 15, 255, 63, 0, 255, 3, 0, 0, + 15, 128, 0, 0, 63, 253, 255, 255, 255, 255, 191, 145, 255, 255, 191, 255, + 128, 255, 0, 0, 255, 255, 55, 248, 255, 255, 255, 143, 255, 255, 255, 131, + 255, 255, 255, 240, 111, 240, 239, 254, 255, 255, 15, 135, 255, 0, 255, 1, + 127, 248, 127, 0, 255, 255, 63, 254, 255, 255, 7, 255, 255, 255, 3, 30, + 0, 254, 0, 0, 255, 1, 0, 0, 255, 255, 7, 0, 255, 255, 7, 252, + 255, 63, 252, 255, 255, 255, 0, 128, 3, 0, 255, 255, 255, 1, 255, 3, + 254, 255, 31, 0, 255, 255, 251, 255, 127, 189, 255, 191, 255, 3, 255, 255, + 255, 7, 255, 3, 159, 57, 129, 224, 207, 31, 31, 0, 255, 0, 255, 3, + 31, 0, 255, 3, 255, 255, 7, 128, 255, 127, 31, 0, 15, 0, 0, 0, + 255, 127, 0, 0, 255, 195, 0, 0, 255, 63, 63, 0, 63, 0, 255, 251, + 251, 255, 255, 224, 255, 255, 0, 0, 31, 0, 255, 
255, 0, 128, 255, 255, + 3, 0, 0, 0, 255, 7, 255, 31, 255, 1, 255, 243, 127, 254, 255, 255, + 63, 0, 0, 0, 100, 222, 255, 235, 239, 255, 255, 255, 191, 231, 223, 223, + 255, 255, 255, 123, 95, 252, 253, 255, 63, 255, 255, 255, 255, 207, 255, 255, + 255, 15, 0, 248, 254, 255, 0, 0, 159, 255, 127, 0, 150, 254, 247, 10, + 132, 234, 150, 170, 150, 247, 247, 94, 255, 251, 255, 15, 238, 251, 255, 15, + 0, 0, 3, 0, 255, 127, 254, 255, 254, 255, 254, 255, 192, 255, 255, 255, + 7, 0, 255, 255, 255, 1, 3, 0, 255, 31, 15, 0, 255, 63, 0, 0, + 0, 0, 255, 1, 31, 0, 0, 0, 2, 0, 0, 0, +}; + +/* Print: 2326 bytes. */ + +RE_UINT32 re_get_print(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 15; + code = ch ^ (f << 15); + pos = (RE_UINT32)re_print_stage_1[f] << 4; + f = code >> 11; + code ^= f << 11; + pos = (RE_UINT32)re_print_stage_2[pos + f] << 3; + f = code >> 8; + code ^= f << 8; + pos = (RE_UINT32)re_print_stage_3[pos + f] << 3; + f = code >> 5; + code ^= f << 5; + pos = (RE_UINT32)re_print_stage_4[pos + f] << 5; + pos += code; + value = (re_print_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Word. 
*/ + +static RE_UINT8 re_word_stage_1[] = { + 0, 1, 2, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 6, 6, 6, + 6, 6, +}; + +static RE_UINT8 re_word_stage_2[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 9, 10, 11, 7, 7, 7, 7, 12, 13, 13, 13, 13, 14, + 15, 16, 17, 18, 19, 13, 20, 13, 21, 13, 13, 13, 13, 22, 13, 13, + 13, 13, 13, 13, 13, 13, 23, 24, 13, 13, 25, 26, 13, 27, 28, 13, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 29, 7, 30, 31, 7, 32, 13, 13, 13, 13, 13, 33, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 34, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, +}; + +static RE_UINT8 re_word_stage_3[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 1, 17, 18, 19, 1, 20, 21, 22, 23, 24, 25, 26, 27, 1, 28, + 29, 30, 31, 31, 32, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, 31, + 36, 37, 31, 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 38, 1, 1, 1, 1, 1, 1, 1, 1, 1, 39, + 1, 1, 1, 1, 40, 1, 41, 42, 43, 44, 45, 46, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 47, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 1, 48, 49, 1, 50, 51, 52, 53, 54, 55, 56, 57, 58, 1, 59, + 60, 61, 62, 63, 64, 31, 31, 31, 65, 66, 67, 68, 69, 70, 71, 72, + 73, 31, 74, 31, 31, 31, 31, 31, 1, 1, 1, 75, 76, 77, 31, 31, + 1, 1, 1, 1, 78, 31, 31, 31, 31, 31, 31, 31, 1, 1, 79, 31, + 1, 1, 80, 81, 31, 31, 31, 82, 83, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 84, 31, 31, 31, 31, 85, 86, 31, 87, 88, 89, 90, + 31, 31, 91, 31, 31, 31, 31, 31, 92, 31, 31, 31, 31, 31, 93, 31, + 31, 94, 31, 31, 31, 31, 31, 31, 1, 1, 1, 1, 1, 1, 95, 1, + 1, 1, 1, 1, 1, 1, 1, 96, 97, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 98, 31, 1, 1, 99, 31, 31, 31, 31, 31, + 31, 100, 31, 31, 31, 31, 31, 31, +}; + +static RE_UINT8 re_word_stage_4[] = { + 0, 1, 2, 3, 0, 4, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 7, 8, 6, 6, 6, 9, 10, 11, 6, 12, + 6, 6, 6, 6, 11, 6, 6, 6, 6, 13, 14, 15, 16, 17, 18, 19, + 20, 6, 
6, 21, 6, 6, 22, 23, 24, 6, 25, 6, 6, 26, 6, 27, + 6, 28, 29, 0, 0, 30, 0, 31, 6, 6, 6, 32, 33, 34, 35, 36, + 37, 38, 39, 40, 41, 42, 43, 44, 45, 42, 46, 47, 48, 49, 50, 51, + 52, 53, 54, 55, 56, 57, 58, 59, 56, 60, 61, 62, 63, 64, 65, 66, + 15, 67, 68, 0, 69, 70, 71, 0, 72, 73, 74, 75, 76, 77, 78, 0, + 6, 6, 79, 6, 80, 6, 81, 82, 6, 6, 83, 6, 84, 85, 86, 6, + 87, 6, 60, 0, 88, 6, 6, 89, 15, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 90, 3, 6, 6, 91, 92, 30, 93, 94, 6, 6, 95, 96, + 97, 6, 6, 98, 6, 99, 6, 100, 101, 102, 103, 104, 6, 105, 106, 0, + 29, 6, 101, 107, 106, 108, 0, 0, 6, 6, 109, 110, 6, 6, 6, 93, + 6, 98, 111, 80, 0, 0, 112, 113, 6, 6, 6, 6, 6, 6, 6, 114, + 89, 6, 115, 80, 6, 116, 117, 118, 119, 120, 121, 122, 123, 0, 24, 124, + 125, 126, 127, 6, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 129, 6, 96, 6, 130, 101, 6, 6, 6, 6, 131, + 6, 81, 6, 132, 133, 134, 134, 6, 0, 135, 0, 0, 0, 0, 0, 0, + 136, 137, 15, 6, 138, 15, 6, 82, 139, 140, 6, 6, 141, 67, 0, 24, + 6, 6, 6, 6, 6, 100, 0, 0, 6, 6, 6, 6, 6, 6, 100, 0, + 6, 6, 6, 6, 142, 0, 24, 80, 143, 144, 6, 145, 6, 6, 6, 26, + 146, 147, 6, 6, 148, 149, 0, 146, 6, 150, 6, 93, 6, 6, 151, 152, + 6, 153, 93, 77, 6, 6, 154, 101, 6, 133, 155, 156, 6, 6, 157, 158, + 159, 160, 82, 161, 6, 6, 6, 162, 6, 6, 6, 6, 6, 163, 164, 29, + 6, 6, 6, 153, 6, 6, 165, 0, 166, 167, 168, 6, 6, 26, 169, 6, + 6, 80, 24, 6, 170, 6, 150, 171, 88, 172, 173, 174, 6, 6, 6, 77, + 1, 2, 3, 103, 6, 101, 175, 0, 176, 177, 178, 0, 6, 6, 6, 67, + 0, 0, 6, 30, 0, 0, 0, 179, 0, 0, 0, 0, 77, 6, 124, 180, + 6, 24, 99, 67, 80, 6, 181, 0, 6, 6, 6, 6, 80, 96, 0, 0, + 6, 182, 6, 183, 0, 0, 0, 0, 6, 133, 100, 150, 0, 0, 0, 0, + 184, 185, 100, 133, 101, 0, 0, 186, 100, 165, 0, 0, 6, 187, 0, 0, + 188, 189, 0, 77, 77, 0, 74, 190, 6, 100, 100, 191, 26, 0, 0, 0, + 6, 6, 128, 0, 6, 191, 6, 191, 6, 6, 190, 192, 6, 67, 24, 193, + 6, 194, 24, 195, 6, 6, 196, 0, 197, 98, 0, 0, 198, 199, 6, 200, + 33, 42, 201, 202, 0, 0, 0, 0, 0, 0, 0, 0, 6, 6, 203, 0, + 0, 0, 0, 
0, 6, 204, 205, 0, 6, 6, 206, 0, 6, 98, 96, 0, + 207, 109, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 6, 208, + 0, 0, 0, 0, 0, 0, 6, 209, 6, 6, 6, 6, 165, 0, 0, 0, + 6, 6, 6, 141, 6, 6, 6, 6, 6, 6, 183, 0, 0, 0, 0, 0, + 6, 141, 0, 0, 0, 0, 0, 0, 6, 6, 190, 0, 0, 0, 0, 0, + 6, 209, 101, 96, 0, 0, 24, 104, 6, 133, 210, 211, 88, 0, 0, 0, + 6, 6, 212, 101, 213, 0, 0, 0, 214, 0, 0, 0, 0, 0, 0, 0, + 6, 6, 6, 215, 216, 0, 0, 0, 0, 0, 0, 217, 218, 219, 0, 0, + 0, 0, 220, 0, 0, 0, 0, 0, 6, 6, 194, 6, 221, 222, 223, 6, + 224, 225, 226, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 227, 228, 82, + 194, 194, 130, 130, 229, 229, 230, 6, 6, 231, 6, 232, 233, 234, 0, 0, + 6, 6, 6, 6, 6, 6, 235, 0, 223, 236, 237, 238, 239, 240, 0, 0, + 0, 24, 79, 79, 96, 0, 0, 0, 6, 6, 6, 6, 6, 6, 133, 0, + 6, 30, 6, 6, 6, 6, 6, 6, 80, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 214, 0, 0, 80, 0, 0, 0, 0, 0, 0, 0, + 6, 6, 6, 6, 6, 6, 6, 88, +}; + +static RE_UINT8 re_word_stage_5[] = { + 0, 0, 0, 0, 0, 0, 255, 3, 254, 255, 255, 135, 254, 255, 255, 7, + 0, 4, 32, 4, 255, 255, 127, 255, 255, 255, 255, 255, 195, 255, 3, 0, + 31, 80, 0, 0, 255, 255, 223, 188, 64, 215, 255, 255, 251, 255, 255, 255, + 255, 255, 191, 255, 255, 255, 254, 255, 255, 255, 127, 2, 254, 255, 255, 255, + 255, 0, 254, 255, 255, 255, 255, 191, 182, 0, 255, 255, 255, 7, 7, 0, + 0, 0, 255, 7, 255, 195, 255, 255, 255, 255, 239, 159, 255, 253, 255, 159, + 0, 0, 255, 255, 255, 231, 255, 255, 255, 255, 3, 0, 255, 255, 63, 4, + 255, 63, 0, 0, 255, 255, 255, 15, 255, 255, 31, 0, 248, 255, 255, 255, + 207, 255, 254, 255, 239, 159, 249, 255, 255, 253, 197, 243, 159, 121, 128, 176, + 207, 255, 3, 0, 238, 135, 249, 255, 255, 253, 109, 211, 135, 57, 2, 94, + 192, 255, 63, 0, 238, 191, 251, 255, 255, 253, 237, 243, 191, 59, 1, 0, + 207, 255, 0, 2, 238, 159, 249, 255, 159, 57, 192, 176, 207, 255, 2, 0, + 236, 199, 61, 214, 24, 199, 255, 195, 199, 61, 129, 0, 192, 255, 0, 0, + 239, 223, 253, 255, 255, 253, 255, 227, 223, 61, 96, 7, 207, 255, 0, 0, + 238, 223, 253, 255, 255, 
253, 239, 243, 223, 61, 96, 64, 207, 255, 6, 0, + 255, 255, 255, 231, 223, 125, 128, 128, 207, 255, 0, 252, 236, 255, 127, 252, + 255, 255, 251, 47, 127, 132, 95, 255, 192, 255, 12, 0, 255, 255, 255, 7, + 255, 127, 255, 3, 150, 37, 240, 254, 174, 236, 255, 59, 95, 63, 255, 243, + 1, 0, 0, 3, 255, 3, 160, 194, 255, 254, 255, 255, 255, 31, 254, 255, + 223, 255, 255, 254, 255, 255, 255, 31, 64, 0, 0, 0, 255, 3, 255, 255, + 255, 255, 255, 63, 191, 32, 255, 255, 255, 255, 255, 247, 255, 61, 127, 61, + 255, 61, 255, 255, 255, 255, 61, 127, 61, 255, 127, 255, 255, 255, 61, 255, + 255, 255, 0, 0, 255, 255, 63, 63, 255, 159, 255, 255, 255, 199, 255, 1, + 255, 223, 31, 0, 255, 255, 15, 0, 255, 223, 13, 0, 255, 255, 143, 48, + 255, 3, 0, 0, 0, 56, 255, 3, 255, 255, 255, 0, 255, 7, 255, 255, + 255, 255, 63, 0, 255, 255, 255, 127, 255, 15, 255, 15, 192, 255, 255, 255, + 255, 63, 31, 0, 255, 15, 255, 255, 255, 3, 255, 3, 255, 255, 255, 159, + 128, 0, 255, 127, 255, 15, 255, 3, 0, 248, 15, 0, 255, 227, 255, 255, + 0, 0, 247, 255, 255, 255, 127, 3, 255, 255, 63, 240, 63, 63, 255, 170, + 255, 255, 223, 95, 220, 31, 207, 15, 255, 31, 220, 31, 0, 48, 0, 0, + 0, 0, 0, 128, 1, 0, 16, 0, 0, 0, 2, 128, 0, 0, 255, 31, + 255, 255, 1, 0, 132, 252, 47, 62, 80, 189, 255, 243, 224, 67, 0, 0, + 255, 1, 0, 0, 0, 0, 192, 255, 255, 127, 255, 255, 31, 248, 15, 0, + 255, 128, 0, 128, 255, 255, 127, 0, 127, 127, 127, 127, 0, 128, 0, 0, + 224, 0, 0, 0, 254, 255, 62, 31, 255, 255, 127, 230, 224, 255, 255, 255, + 255, 63, 254, 255, 255, 127, 0, 0, 255, 31, 0, 0, 255, 31, 255, 255, + 255, 15, 0, 0, 255, 255, 247, 191, 0, 0, 128, 255, 252, 255, 255, 255, + 255, 249, 255, 255, 255, 63, 255, 0, 255, 0, 0, 0, 31, 0, 255, 3, + 255, 255, 255, 40, 255, 63, 255, 255, 1, 128, 255, 3, 255, 63, 255, 3, + 255, 255, 127, 252, 7, 0, 0, 56, 255, 255, 124, 0, 126, 126, 126, 0, + 127, 127, 255, 255, 63, 0, 255, 255, 255, 55, 255, 3, 15, 0, 255, 255, + 127, 248, 255, 255, 255, 255, 255, 3, 127, 0, 248, 224, 255, 253, 127, 
95, + 219, 255, 255, 255, 0, 0, 248, 255, 255, 255, 252, 255, 0, 0, 255, 15, + 255, 255, 24, 0, 0, 224, 0, 0, 0, 0, 223, 255, 252, 252, 252, 28, + 255, 239, 255, 255, 127, 255, 255, 183, 255, 63, 255, 63, 0, 0, 0, 32, + 1, 0, 0, 0, 15, 255, 62, 0, 255, 0, 255, 255, 15, 0, 0, 0, + 63, 253, 255, 255, 255, 255, 191, 145, 255, 255, 55, 0, 255, 255, 255, 192, + 111, 240, 239, 254, 255, 255, 15, 135, 127, 0, 0, 0, 255, 255, 7, 0, + 192, 255, 0, 128, 255, 1, 255, 3, 255, 255, 223, 255, 255, 255, 79, 0, + 31, 28, 255, 23, 255, 255, 251, 255, 127, 189, 255, 191, 255, 1, 255, 255, + 255, 7, 255, 3, 159, 57, 129, 224, 207, 31, 31, 0, 191, 0, 255, 3, + 255, 255, 63, 255, 1, 0, 0, 63, 17, 0, 255, 3, 255, 255, 255, 227, + 255, 3, 0, 128, 255, 255, 255, 1, 15, 0, 255, 3, 248, 255, 255, 224, + 31, 0, 255, 255, 0, 128, 255, 255, 3, 0, 0, 0, 255, 7, 255, 31, + 255, 1, 255, 99, 224, 227, 7, 248, 231, 15, 0, 0, 0, 60, 0, 0, + 28, 0, 0, 0, 255, 255, 255, 223, 100, 222, 255, 235, 239, 255, 255, 255, + 191, 231, 223, 223, 255, 255, 255, 123, 95, 252, 253, 255, 63, 255, 255, 255, + 253, 255, 255, 247, 255, 253, 255, 255, 247, 207, 255, 255, 255, 255, 127, 248, + 255, 31, 32, 0, 16, 0, 0, 248, 254, 255, 0, 0, 31, 0, 127, 0, + 150, 254, 247, 10, 132, 234, 150, 170, 150, 247, 247, 94, 255, 251, 255, 15, + 238, 251, 255, 15, +}; + +/* Word: 2214 bytes. */ + +RE_UINT32 re_get_word(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 15; + code = ch ^ (f << 15); + pos = (RE_UINT32)re_word_stage_1[f] << 4; + f = code >> 11; + code ^= f << 11; + pos = (RE_UINT32)re_word_stage_2[pos + f] << 3; + f = code >> 8; + code ^= f << 8; + pos = (RE_UINT32)re_word_stage_3[pos + f] << 3; + f = code >> 5; + code ^= f << 5; + pos = (RE_UINT32)re_word_stage_4[pos + f] << 5; + pos += code; + value = (re_word_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* XDigit. 
*/ + +static RE_UINT8 re_xdigit_stage_1[] = { + 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, +}; + +static RE_UINT8 re_xdigit_stage_2[] = { + 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 4, + 5, 6, 2, 2, 2, 2, 7, 2, 2, 2, 2, 2, 2, 8, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +}; + +static RE_UINT8 re_xdigit_stage_3[] = { + 0, 1, 1, 1, 1, 1, 2, 3, 1, 4, 4, 4, 4, 4, 5, 6, + 7, 1, 1, 1, 1, 1, 1, 8, 9, 10, 11, 12, 13, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 6, 1, 14, 15, 16, 17, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 18, + 1, 1, 1, 1, 19, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 20, 21, 17, 1, 14, 1, 22, 23, 8, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 24, 16, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 25, 1, 1, 1, 1, 1, 1, 1, 1, +}; + +static RE_UINT8 re_xdigit_stage_4[] = { + 0, 1, 2, 2, 2, 2, 2, 2, 2, 3, 2, 0, 2, 2, 2, 4, + 2, 5, 2, 5, 2, 6, 2, 6, 3, 2, 2, 2, 2, 4, 6, 2, + 2, 2, 2, 3, 6, 2, 2, 2, 2, 7, 2, 6, 2, 2, 8, 2, + 2, 6, 0, 2, 2, 8, 2, 2, 2, 2, 2, 6, 4, 2, 2, 9, + 2, 6, 2, 2, 2, 2, 2, 0, 10, 11, 2, 2, 2, 2, 3, 2, + 2, 5, 2, 0, 12, 2, 2, 6, 2, 6, 2, 4, 0, 2, 2, 2, + 2, 3, 2, 2, 2, 2, 2, 13, +}; + +static RE_UINT8 re_xdigit_stage_5[] = { + 0, 0, 0, 0, 0, 0, 255, 3, 126, 0, 0, 0, 126, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 255, 3, 0, 0, + 255, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 192, 255, 0, 0, + 0, 0, 255, 3, 0, 0, 0, 0, 192, 255, 0, 0, 0, 0, 0, 0, + 255, 3, 255, 3, 0, 0, 0, 0, 0, 0, 255, 3, 0, 0, 255, 3, + 0, 0, 255, 3, 126, 0, 0, 0, 126, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 192, 255, 0, 192, 255, 255, 255, 255, 255, 255, +}; + +/* XDigit: 425 bytes. 
*/ + +RE_UINT32 re_get_xdigit(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 16; + code = ch ^ (f << 16); + pos = (RE_UINT32)re_xdigit_stage_1[f] << 4; + f = code >> 12; + code ^= f << 12; + pos = (RE_UINT32)re_xdigit_stage_2[pos + f] << 4; + f = code >> 8; + code ^= f << 8; + pos = (RE_UINT32)re_xdigit_stage_3[pos + f] << 2; + f = code >> 6; + code ^= f << 6; + pos = (RE_UINT32)re_xdigit_stage_4[pos + f] << 6; + pos += code; + value = (re_xdigit_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Posix_Digit. */ + +static RE_UINT8 re_posix_digit_stage_1[] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, +}; + +static RE_UINT8 re_posix_digit_stage_2[] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +}; + +static RE_UINT8 re_posix_digit_stage_3[] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +}; + +static RE_UINT8 re_posix_digit_stage_4[] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +}; + +static RE_UINT8 re_posix_digit_stage_5[] = { + 0, 0, 0, 0, 0, 0, 255, 3, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +/* Posix_Digit: 97 bytes. */ + +RE_UINT32 re_get_posix_digit(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 16; + code = ch ^ (f << 16); + pos = (RE_UINT32)re_posix_digit_stage_1[f] << 4; + f = code >> 12; + code ^= f << 12; + pos = (RE_UINT32)re_posix_digit_stage_2[pos + f] << 3; + f = code >> 9; + code ^= f << 9; + pos = (RE_UINT32)re_posix_digit_stage_3[pos + f] << 3; + f = code >> 6; + code ^= f << 6; + pos = (RE_UINT32)re_posix_digit_stage_4[pos + f] << 6; + pos += code; + value = (re_posix_digit_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Posix_AlNum. 
*/ + +static RE_UINT8 re_posix_alnum_stage_1[] = { + 0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, +}; + +static RE_UINT8 re_posix_alnum_stage_2[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 9, 10, 11, 7, 7, 7, 7, 12, 13, 13, 13, 13, 14, + 15, 16, 17, 18, 19, 13, 20, 13, 21, 13, 13, 13, 13, 22, 13, 13, + 13, 13, 13, 13, 13, 13, 23, 24, 13, 13, 25, 13, 13, 26, 27, 13, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 28, 7, 29, 30, 7, 31, 13, 13, 13, 13, 13, 32, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, +}; + +static RE_UINT8 re_posix_alnum_stage_3[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 1, 17, 18, 19, 1, 20, 21, 22, 23, 24, 25, 26, 27, 1, 28, + 29, 30, 31, 31, 32, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, 31, + 36, 37, 31, 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 38, 1, 1, 1, 1, 1, 1, 1, 1, 1, 39, + 1, 1, 1, 1, 40, 1, 41, 42, 43, 44, 45, 46, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 47, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 1, 48, 49, 1, 50, 51, 52, 53, 54, 55, 56, 57, 58, 1, 59, + 60, 61, 62, 63, 64, 31, 31, 31, 65, 66, 67, 68, 69, 70, 71, 72, + 73, 31, 74, 31, 31, 31, 31, 31, 1, 1, 1, 75, 76, 77, 31, 31, + 1, 1, 1, 1, 78, 31, 31, 31, 31, 31, 31, 31, 1, 1, 79, 31, + 1, 1, 80, 81, 31, 31, 31, 82, 83, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 84, 31, 31, 31, 31, 31, 31, 31, 85, 86, 87, 88, + 89, 31, 31, 31, 31, 31, 90, 31, 31, 91, 31, 31, 31, 31, 31, 31, + 1, 1, 1, 1, 1, 1, 92, 1, 1, 1, 1, 1, 1, 1, 1, 93, + 94, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 95, 31, + 1, 1, 96, 31, 31, 31, 31, 31, +}; + +static RE_UINT8 re_posix_alnum_stage_4[] = { + 0, 1, 2, 2, 0, 3, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 6, 7, 0, 0, 8, 9, 10, 11, 5, 12, + 5, 5, 5, 5, 13, 5, 5, 5, 5, 14, 15, 16, 17, 18, 19, 20, + 21, 5, 22, 23, 5, 5, 24, 25, 26, 5, 27, 5, 5, 28, 29, 30, + 31, 32, 33, 0, 0, 34, 0, 35, 5, 36, 37, 
38, 39, 40, 41, 42, + 43, 44, 45, 46, 47, 48, 49, 50, 51, 48, 52, 53, 54, 55, 56, 0, + 57, 58, 59, 60, 61, 62, 63, 64, 61, 65, 66, 67, 68, 69, 70, 71, + 16, 72, 73, 0, 74, 75, 76, 0, 77, 0, 78, 79, 80, 81, 0, 0, + 5, 82, 26, 83, 84, 5, 85, 86, 5, 5, 87, 5, 88, 89, 90, 5, + 91, 5, 92, 0, 93, 5, 5, 94, 16, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 95, 2, 5, 5, 96, 97, 98, 98, 99, 5, 100, 101, 0, + 0, 5, 5, 102, 5, 103, 5, 104, 105, 106, 26, 107, 5, 108, 109, 0, + 110, 5, 105, 111, 0, 112, 0, 0, 5, 113, 114, 0, 5, 115, 5, 116, + 5, 104, 117, 118, 0, 0, 0, 119, 5, 5, 5, 5, 5, 5, 0, 120, + 94, 5, 121, 118, 5, 122, 123, 124, 0, 0, 0, 125, 126, 0, 0, 0, + 127, 128, 129, 5, 130, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 131, 5, 109, 5, 132, 105, 5, 5, 5, 5, 133, + 5, 85, 5, 134, 135, 136, 136, 5, 0, 137, 0, 0, 0, 0, 0, 0, + 138, 139, 16, 5, 140, 16, 5, 86, 141, 142, 5, 5, 143, 72, 0, 26, + 5, 5, 5, 5, 5, 104, 0, 0, 5, 5, 5, 5, 5, 5, 104, 0, + 5, 5, 5, 5, 32, 0, 26, 118, 144, 145, 5, 146, 5, 5, 5, 93, + 147, 148, 5, 5, 149, 150, 0, 147, 151, 17, 5, 98, 5, 5, 60, 152, + 29, 103, 153, 81, 5, 154, 137, 155, 5, 135, 156, 157, 5, 105, 158, 159, + 160, 161, 86, 162, 5, 5, 5, 163, 5, 5, 5, 5, 5, 164, 165, 110, + 5, 5, 5, 166, 5, 5, 167, 0, 168, 169, 170, 5, 5, 28, 171, 5, + 5, 118, 26, 5, 172, 5, 17, 173, 0, 0, 0, 174, 5, 5, 5, 81, + 0, 2, 2, 175, 5, 105, 176, 0, 177, 178, 179, 0, 5, 5, 5, 72, + 0, 0, 5, 34, 0, 0, 0, 0, 0, 0, 0, 0, 81, 5, 180, 0, + 5, 26, 103, 72, 118, 5, 181, 0, 5, 5, 5, 5, 118, 0, 0, 0, + 5, 182, 5, 60, 0, 0, 0, 0, 5, 135, 104, 17, 0, 0, 0, 0, + 183, 184, 104, 135, 105, 0, 0, 185, 104, 167, 0, 0, 5, 186, 0, 0, + 187, 98, 0, 81, 81, 0, 78, 188, 5, 104, 104, 153, 28, 0, 0, 0, + 5, 5, 130, 0, 5, 153, 5, 153, 5, 5, 189, 0, 148, 33, 26, 130, + 5, 153, 26, 190, 5, 5, 191, 0, 192, 193, 0, 0, 194, 195, 5, 130, + 39, 48, 196, 60, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 197, 0, + 0, 0, 0, 0, 5, 198, 199, 0, 5, 105, 200, 0, 5, 104, 0, 0, + 201, 163, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 
5, 202, + 0, 0, 0, 0, 0, 0, 5, 33, 5, 5, 5, 5, 167, 0, 0, 0, + 5, 5, 5, 143, 5, 5, 5, 5, 5, 5, 60, 0, 0, 0, 0, 0, + 5, 143, 0, 0, 0, 0, 0, 0, 5, 5, 203, 0, 0, 0, 0, 0, + 5, 33, 105, 0, 0, 0, 26, 156, 5, 135, 60, 204, 93, 0, 0, 0, + 5, 5, 205, 105, 171, 0, 0, 0, 206, 0, 0, 0, 0, 0, 0, 0, + 5, 5, 5, 207, 208, 0, 0, 0, 5, 5, 209, 5, 210, 211, 212, 5, + 213, 214, 215, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 216, 217, 86, + 209, 209, 132, 132, 218, 218, 219, 0, 5, 5, 5, 5, 5, 5, 188, 0, + 212, 220, 221, 222, 223, 224, 0, 0, 0, 26, 225, 225, 109, 0, 0, 0, + 5, 5, 5, 5, 5, 5, 135, 0, 5, 34, 5, 5, 5, 5, 5, 5, + 118, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 206, 0, 0, + 118, 0, 0, 0, 0, 0, 0, 0, +}; + +static RE_UINT8 re_posix_alnum_stage_5[] = { + 0, 0, 0, 0, 0, 0, 255, 3, 254, 255, 255, 7, 0, 4, 32, 4, + 255, 255, 127, 255, 255, 255, 255, 255, 195, 255, 3, 0, 31, 80, 0, 0, + 32, 0, 0, 0, 0, 0, 223, 188, 64, 215, 255, 255, 251, 255, 255, 255, + 255, 255, 191, 255, 3, 252, 255, 255, 255, 255, 254, 255, 255, 255, 127, 2, + 254, 255, 255, 255, 255, 0, 0, 0, 0, 0, 255, 191, 182, 0, 255, 255, + 255, 7, 7, 0, 0, 0, 255, 7, 255, 255, 255, 254, 0, 192, 255, 255, + 255, 255, 239, 31, 254, 225, 0, 156, 0, 0, 255, 255, 0, 224, 255, 255, + 255, 255, 3, 0, 0, 252, 255, 255, 255, 7, 48, 4, 255, 255, 255, 252, + 255, 31, 0, 0, 255, 255, 255, 1, 255, 255, 31, 0, 248, 3, 255, 255, + 255, 255, 255, 239, 255, 223, 225, 255, 15, 0, 254, 255, 239, 159, 249, 255, + 255, 253, 197, 227, 159, 89, 128, 176, 15, 0, 3, 0, 238, 135, 249, 255, + 255, 253, 109, 195, 135, 25, 2, 94, 0, 0, 63, 0, 238, 191, 251, 255, + 255, 253, 237, 227, 191, 27, 1, 0, 15, 0, 0, 2, 238, 159, 249, 255, + 159, 25, 192, 176, 15, 0, 2, 0, 236, 199, 61, 214, 24, 199, 255, 195, + 199, 29, 129, 0, 239, 223, 253, 255, 255, 253, 255, 227, 223, 29, 96, 7, + 15, 0, 0, 0, 238, 223, 253, 255, 255, 253, 239, 227, 223, 29, 96, 64, + 15, 0, 6, 0, 255, 255, 255, 231, 223, 93, 128, 128, 15, 0, 0, 252, + 236, 255, 127, 252, 255, 255, 251, 47, 127, 128, 
95, 255, 0, 0, 12, 0, + 255, 255, 255, 7, 127, 32, 0, 0, 150, 37, 240, 254, 174, 236, 255, 59, + 95, 32, 0, 240, 1, 0, 0, 0, 255, 254, 255, 255, 255, 31, 254, 255, + 3, 255, 255, 254, 255, 255, 255, 31, 255, 255, 127, 249, 231, 193, 255, 255, + 127, 64, 0, 48, 191, 32, 255, 255, 255, 255, 255, 247, 255, 61, 127, 61, + 255, 61, 255, 255, 255, 255, 61, 127, 61, 255, 127, 255, 255, 255, 61, 255, + 255, 255, 255, 135, 255, 255, 0, 0, 255, 255, 63, 63, 255, 159, 255, 255, + 255, 199, 255, 1, 255, 223, 15, 0, 255, 255, 15, 0, 255, 223, 13, 0, + 255, 255, 207, 255, 255, 1, 128, 16, 255, 255, 255, 0, 255, 7, 255, 255, + 255, 255, 63, 0, 255, 255, 255, 127, 255, 15, 255, 1, 255, 63, 31, 0, + 255, 15, 255, 255, 255, 3, 0, 0, 255, 255, 255, 15, 254, 255, 31, 0, + 128, 0, 0, 0, 255, 255, 239, 255, 239, 15, 0, 0, 255, 243, 0, 252, + 191, 255, 3, 0, 0, 224, 0, 252, 255, 255, 255, 63, 0, 222, 111, 0, + 128, 255, 31, 0, 63, 63, 255, 170, 255, 255, 223, 95, 220, 31, 207, 15, + 255, 31, 220, 31, 0, 0, 2, 128, 0, 0, 255, 31, 132, 252, 47, 62, + 80, 189, 255, 243, 224, 67, 0, 0, 255, 1, 0, 0, 0, 0, 192, 255, + 255, 127, 255, 255, 31, 120, 12, 0, 255, 128, 0, 0, 255, 255, 127, 0, + 127, 127, 127, 127, 0, 128, 0, 0, 224, 0, 0, 0, 254, 3, 62, 31, + 255, 255, 127, 224, 224, 255, 255, 255, 255, 63, 254, 255, 255, 127, 0, 0, + 255, 31, 255, 255, 0, 12, 0, 0, 255, 127, 240, 143, 0, 0, 128, 255, + 252, 255, 255, 255, 255, 249, 255, 255, 255, 63, 255, 0, 187, 247, 255, 255, + 0, 0, 252, 40, 255, 255, 7, 0, 255, 255, 247, 255, 223, 255, 0, 124, + 255, 63, 0, 0, 255, 255, 127, 196, 5, 0, 0, 56, 255, 255, 60, 0, + 126, 126, 126, 0, 127, 127, 255, 255, 63, 0, 255, 255, 255, 7, 0, 0, + 15, 0, 255, 255, 127, 248, 255, 255, 255, 63, 255, 255, 255, 255, 255, 3, + 127, 0, 248, 224, 255, 253, 127, 95, 219, 255, 255, 255, 0, 0, 248, 255, + 255, 255, 252, 255, 0, 0, 255, 15, 0, 0, 223, 255, 192, 255, 255, 255, + 252, 252, 252, 28, 255, 239, 255, 255, 127, 255, 255, 183, 255, 63, 255, 63, + 255, 255, 1, 0, 
15, 255, 62, 0, 255, 0, 255, 255, 63, 253, 255, 255, + 255, 255, 191, 145, 255, 255, 55, 0, 255, 255, 255, 192, 111, 240, 239, 254, + 31, 0, 0, 0, 63, 0, 0, 0, 255, 255, 71, 0, 30, 0, 0, 20, + 255, 255, 251, 255, 255, 255, 159, 0, 127, 189, 255, 191, 255, 1, 255, 255, + 159, 25, 129, 224, 179, 0, 0, 0, 255, 255, 63, 127, 0, 0, 0, 63, + 17, 0, 0, 0, 255, 255, 255, 227, 0, 0, 0, 128, 127, 0, 0, 0, + 248, 255, 255, 224, 31, 0, 255, 255, 3, 0, 0, 0, 255, 7, 255, 31, + 255, 1, 255, 67, 255, 255, 223, 255, 255, 255, 255, 223, 100, 222, 255, 235, + 239, 255, 255, 255, 191, 231, 223, 223, 255, 255, 255, 123, 95, 252, 253, 255, + 63, 255, 255, 255, 253, 255, 255, 247, 255, 253, 255, 255, 247, 15, 0, 0, + 150, 254, 247, 10, 132, 234, 150, 170, 150, 247, 247, 94, 255, 251, 255, 15, + 238, 251, 255, 15, 255, 3, 255, 255, +}; + +/* Posix_AlNum: 2089 bytes. */ + +RE_UINT32 re_get_posix_alnum(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 16; + code = ch ^ (f << 16); + pos = (RE_UINT32)re_posix_alnum_stage_1[f] << 5; + f = code >> 11; + code ^= f << 11; + pos = (RE_UINT32)re_posix_alnum_stage_2[pos + f] << 3; + f = code >> 8; + code ^= f << 8; + pos = (RE_UINT32)re_posix_alnum_stage_3[pos + f] << 3; + f = code >> 5; + code ^= f << 5; + pos = (RE_UINT32)re_posix_alnum_stage_4[pos + f] << 5; + pos += code; + value = (re_posix_alnum_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Posix_Punct. 
*/ + +static RE_UINT8 re_posix_punct_stage_1[] = { + 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, +}; + +static RE_UINT8 re_posix_punct_stage_2[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 9, 10, 7, 7, 7, 7, 7, 7, 7, 7, 7, 11, + 12, 13, 14, 7, 15, 7, 7, 7, 7, 7, 7, 7, 7, 16, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 17, 7, 7, 18, 19, 7, 20, 21, 22, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, +}; + +static RE_UINT8 re_posix_punct_stage_3[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 1, 1, 17, 18, 1, 19, 20, 21, 22, 23, 24, 25, 1, 1, 26, + 27, 28, 29, 30, 31, 29, 29, 32, 29, 29, 29, 33, 34, 35, 36, 37, + 38, 39, 40, 29, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 41, 1, 1, 1, 1, 1, 1, 42, 1, 43, 44, + 45, 46, 47, 48, 1, 1, 1, 1, 1, 1, 1, 49, 1, 50, 51, 52, + 1, 53, 1, 54, 1, 55, 1, 1, 56, 57, 58, 59, 1, 1, 1, 1, + 60, 61, 62, 1, 63, 64, 65, 66, 1, 1, 1, 1, 67, 1, 1, 1, + 1, 1, 68, 69, 1, 1, 1, 1, 1, 1, 1, 1, 70, 1, 1, 1, + 71, 72, 73, 74, 1, 1, 75, 76, 29, 29, 77, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 10, 1, 78, 79, 80, 29, 29, 81, 82, 83, + 84, 85, 1, 1, 1, 1, 1, 1, +}; + +static RE_UINT8 re_posix_punct_stage_4[] = { + 0, 1, 2, 3, 0, 4, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 6, 7, 0, 0, 0, 8, 9, 0, 0, 10, + 0, 0, 0, 0, 11, 0, 0, 0, 0, 0, 12, 0, 13, 14, 15, 16, + 17, 0, 0, 18, 0, 0, 19, 20, 21, 0, 0, 0, 0, 0, 0, 22, + 0, 23, 14, 0, 0, 0, 0, 0, 0, 0, 0, 24, 0, 0, 0, 25, + 0, 0, 0, 0, 0, 0, 0, 26, 0, 0, 0, 27, 0, 0, 0, 28, + 0, 0, 0, 29, 0, 0, 0, 0, 0, 0, 0, 30, 0, 0, 0, 31, + 0, 29, 32, 0, 0, 0, 0, 0, 33, 34, 0, 0, 35, 36, 37, 0, + 0, 0, 38, 0, 36, 0, 0, 39, 0, 0, 0, 40, 41, 0, 0, 0, + 42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 43, 44, 0, 0, 45, + 0, 46, 0, 0, 0, 0, 47, 0, 48, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 49, 0, 0, 0, 36, 50, 36, 0, 0, 0, 0, 51, 0, 0, + 0, 0, 12, 52, 0, 0, 0, 53, 0, 54, 0, 36, 0, 0, 55, 0, + 0, 0, 0, 0, 0, 56, 57, 58, 59, 60, 61, 62, 63, 61, 0, 0, 
+ 64, 65, 66, 0, 67, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 68, 50, 69, 48, 0, 53, 70, 0, 0, + 50, 50, 50, 70, 71, 50, 50, 50, 50, 50, 50, 72, 73, 74, 75, 76, + 0, 0, 0, 0, 0, 0, 0, 77, 0, 0, 0, 27, 0, 0, 0, 0, + 50, 78, 79, 0, 80, 50, 50, 81, 50, 50, 50, 50, 50, 50, 70, 82, + 83, 84, 0, 0, 44, 42, 0, 39, 0, 0, 0, 0, 85, 0, 50, 86, + 61, 87, 88, 50, 87, 89, 50, 61, 0, 0, 0, 0, 0, 0, 50, 50, + 0, 0, 0, 0, 59, 50, 69, 36, 90, 0, 0, 91, 0, 0, 0, 92, + 93, 94, 0, 0, 95, 0, 0, 0, 0, 96, 0, 97, 0, 0, 98, 99, + 0, 98, 29, 0, 0, 0, 100, 0, 0, 0, 53, 101, 0, 0, 36, 26, + 0, 0, 39, 0, 0, 0, 0, 102, 0, 103, 0, 0, 0, 104, 94, 0, + 0, 36, 0, 0, 0, 0, 0, 105, 41, 59, 106, 107, 0, 0, 0, 0, + 1, 2, 2, 108, 0, 0, 0, 109, 79, 110, 0, 111, 112, 42, 59, 113, + 0, 0, 0, 0, 29, 0, 27, 0, 0, 0, 0, 114, 0, 0, 0, 0, + 0, 0, 5, 115, 0, 0, 0, 0, 29, 29, 0, 0, 0, 0, 0, 0, + 0, 0, 116, 29, 0, 0, 117, 118, 0, 111, 0, 0, 119, 0, 0, 0, + 0, 0, 120, 0, 0, 121, 94, 0, 0, 0, 86, 122, 0, 0, 123, 0, + 0, 124, 0, 0, 0, 103, 0, 0, 0, 0, 0, 0, 0, 0, 125, 0, + 0, 0, 0, 0, 0, 0, 126, 0, 0, 0, 127, 0, 0, 0, 0, 0, + 0, 53, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, + 0, 0, 0, 98, 0, 0, 0, 129, 0, 110, 130, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 131, 0, 0, 0, 50, 50, 50, 50, 50, 50, 50, 70, + 50, 132, 50, 133, 134, 135, 50, 40, 50, 50, 136, 0, 0, 0, 0, 0, + 50, 50, 93, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 137, 39, + 129, 129, 114, 114, 103, 103, 138, 0, 0, 139, 0, 140, 141, 0, 0, 0, + 50, 142, 50, 50, 81, 143, 144, 70, 59, 145, 38, 146, 147, 0, 0, 148, + 149, 68, 150, 0, 0, 0, 0, 0, 50, 50, 50, 80, 50, 151, 50, 50, + 50, 50, 50, 50, 50, 50, 89, 152, 50, 50, 50, 81, 50, 50, 153, 0, + 142, 50, 154, 50, 60, 21, 0, 0, 116, 0, 0, 0, 155, 0, 42, 0, +}; + +static RE_UINT8 re_posix_punct_stage_5[] = { + 0, 0, 0, 0, 254, 255, 0, 252, 1, 0, 0, 248, 1, 0, 0, 120, + 254, 219, 211, 137, 0, 0, 128, 0, 60, 0, 252, 255, 224, 175, 255, 255, + 0, 0, 32, 64, 176, 0, 0, 0, 0, 0, 64, 0, 4, 0, 0, 0, + 0, 0, 
0, 252, 0, 230, 0, 0, 0, 0, 0, 64, 73, 0, 0, 0, + 0, 0, 24, 0, 192, 255, 0, 200, 0, 60, 0, 0, 0, 0, 16, 64, + 0, 2, 0, 96, 255, 63, 0, 0, 0, 0, 192, 3, 0, 0, 255, 127, + 48, 0, 1, 0, 0, 0, 12, 12, 0, 0, 3, 0, 0, 0, 1, 0, + 0, 0, 248, 7, 0, 0, 0, 128, 0, 0, 0, 2, 0, 0, 16, 0, + 0, 128, 0, 12, 254, 255, 255, 252, 0, 0, 80, 61, 32, 0, 0, 0, + 0, 0, 0, 192, 191, 223, 255, 7, 0, 252, 0, 0, 0, 0, 0, 8, + 255, 1, 0, 0, 0, 0, 255, 3, 1, 0, 0, 0, 0, 96, 0, 0, + 0, 0, 0, 24, 0, 56, 0, 0, 0, 0, 96, 0, 0, 0, 112, 15, + 255, 7, 0, 0, 49, 0, 0, 0, 255, 255, 255, 255, 127, 63, 0, 0, + 255, 7, 240, 31, 0, 0, 0, 240, 0, 0, 0, 248, 255, 0, 8, 0, + 0, 0, 0, 160, 3, 224, 0, 224, 0, 224, 0, 96, 0, 0, 255, 255, + 255, 0, 255, 255, 255, 255, 255, 127, 0, 0, 0, 124, 0, 124, 0, 0, + 123, 3, 208, 193, 175, 66, 0, 12, 31, 188, 0, 0, 0, 12, 255, 255, + 255, 255, 255, 7, 127, 0, 0, 0, 255, 255, 63, 0, 0, 0, 240, 255, + 255, 255, 207, 255, 255, 255, 63, 255, 255, 255, 255, 227, 255, 253, 3, 0, + 0, 240, 0, 0, 224, 7, 0, 222, 255, 127, 255, 255, 7, 0, 0, 0, + 255, 255, 255, 251, 255, 255, 15, 0, 0, 0, 255, 15, 30, 255, 255, 255, + 1, 0, 193, 224, 0, 0, 195, 255, 15, 0, 0, 0, 0, 252, 255, 255, + 255, 0, 1, 0, 255, 255, 1, 0, 0, 224, 0, 0, 0, 0, 8, 64, + 0, 0, 252, 0, 255, 255, 127, 0, 3, 0, 0, 0, 0, 6, 0, 0, + 0, 15, 192, 3, 0, 0, 240, 0, 0, 192, 0, 0, 0, 0, 0, 23, + 254, 63, 0, 192, 0, 0, 128, 3, 0, 8, 0, 0, 0, 2, 0, 0, + 0, 0, 252, 255, 0, 0, 0, 48, 255, 255, 247, 255, 127, 15, 0, 0, + 63, 0, 0, 0, 127, 127, 0, 48, 0, 0, 128, 255, 0, 0, 0, 254, + 255, 19, 255, 15, 255, 255, 255, 31, 0, 128, 0, 0, 0, 0, 128, 1, + 0, 0, 255, 1, 0, 1, 0, 0, 0, 0, 127, 0, 0, 0, 0, 30, + 128, 63, 0, 0, 0, 0, 0, 216, 0, 0, 48, 0, 224, 35, 0, 232, + 0, 0, 0, 63, 64, 0, 0, 0, 254, 255, 255, 0, 14, 0, 0, 0, + 0, 0, 31, 0, 0, 0, 32, 0, 48, 0, 0, 0, 0, 0, 0, 144, + 127, 254, 255, 255, 31, 28, 0, 0, 24, 240, 255, 255, 255, 195, 255, 255, + 35, 0, 0, 0, 2, 0, 0, 8, 8, 0, 0, 0, 0, 0, 128, 7, + 0, 224, 223, 255, 239, 15, 0, 
0, 255, 15, 255, 255, 255, 127, 254, 255, + 254, 255, 254, 255, 255, 127, 0, 0, 0, 12, 0, 0, 0, 252, 255, 7, + 192, 255, 255, 255, 7, 0, 255, 255, 255, 1, 3, 0, 239, 255, 255, 255, + 255, 31, 15, 0, 255, 255, 31, 0, 255, 0, 255, 3, 31, 0, 0, 0, +}; + +/* Posix_Punct: 1609 bytes. */ + +RE_UINT32 re_get_posix_punct(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 16; + code = ch ^ (f << 16); + pos = (RE_UINT32)re_posix_punct_stage_1[f] << 5; + f = code >> 11; + code ^= f << 11; + pos = (RE_UINT32)re_posix_punct_stage_2[pos + f] << 3; + f = code >> 8; + code ^= f << 8; + pos = (RE_UINT32)re_posix_punct_stage_3[pos + f] << 3; + f = code >> 5; + code ^= f << 5; + pos = (RE_UINT32)re_posix_punct_stage_4[pos + f] << 5; + pos += code; + value = (re_posix_punct_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* Posix_XDigit. */ + +static RE_UINT8 re_posix_xdigit_stage_1[] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, +}; + +static RE_UINT8 re_posix_xdigit_stage_2[] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +}; + +static RE_UINT8 re_posix_xdigit_stage_3[] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +}; + +static RE_UINT8 re_posix_xdigit_stage_4[] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +}; + +static RE_UINT8 re_posix_xdigit_stage_5[] = { + 0, 0, 0, 0, 0, 0, 255, 3, 126, 0, 0, 0, 126, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +/* Posix_XDigit: 97 bytes. 
*/ + +RE_UINT32 re_get_posix_xdigit(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + + f = ch >> 16; + code = ch ^ (f << 16); + pos = (RE_UINT32)re_posix_xdigit_stage_1[f] << 3; + f = code >> 13; + code ^= f << 13; + pos = (RE_UINT32)re_posix_xdigit_stage_2[pos + f] << 3; + f = code >> 10; + code ^= f << 10; + pos = (RE_UINT32)re_posix_xdigit_stage_3[pos + f] << 3; + f = code >> 7; + code ^= f << 7; + pos = (RE_UINT32)re_posix_xdigit_stage_4[pos + f] << 7; + pos += code; + value = (re_posix_xdigit_stage_5[pos >> 3] >> (pos & 0x7)) & 0x1; + + return value; +} + +/* All_Cases. */ + +static RE_UINT8 re_all_cases_stage_1[] = { + 0, 1, 2, 2, 2, 3, 2, 4, 5, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, +}; + +static RE_UINT8 re_all_cases_stage_2[] = { + 0, 1, 2, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 7, 6, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 9, 10, 11, + 6, 12, 6, 6, 13, 6, 6, 6, 6, 6, 6, 6, 14, 15, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 16, 17, 6, 6, 6, 18, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 19, 6, 6, 6, 20, + 6, 6, 6, 6, 21, 6, 6, 6, 6, 6, 6, 6, 22, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 23, 6, 6, 6, 6, 6, 6, 6, +}; + +static RE_UINT8 re_all_cases_stage_3[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, + 0, 0, 0, 0, 0, 0, 9, 0, 10, 11, 12, 13, 14, 15, 16, 17, + 18, 18, 18, 18, 18, 18, 19, 20, 21, 22, 18, 18, 18, 18, 18, 23, + 24, 25, 26, 
27, 28, 29, 30, 31, 32, 33, 21, 34, 18, 18, 35, 18, + 18, 18, 18, 18, 36, 18, 37, 38, 39, 18, 40, 41, 42, 43, 44, 45, + 46, 47, 48, 49, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 50, 0, 0, 0, 0, 0, 51, 52, + 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 18, 18, 18, 64, 65, + 66, 66, 11, 11, 11, 11, 15, 15, 15, 15, 67, 67, 18, 18, 18, 18, + 68, 69, 18, 18, 18, 18, 18, 18, 70, 71, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 72, 73, 73, 73, 74, 0, 75, 76, 76, 76, + 77, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 78, 78, 78, 78, 79, 80, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 82, 83, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 84, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 85, 18, 18, 18, + 18, 18, 86, 87, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, + 88, 89, 82, 83, 88, 89, 88, 89, 82, 83, 90, 91, 88, 89, 92, 93, + 88, 89, 88, 89, 88, 89, 94, 95, 96, 97, 98, 99, 100, 101, 96, 102, + 0, 0, 0, 0, 103, 104, 105, 0, 0, 106, 0, 0, 107, 107, 108, 108, + 109, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 110, 111, 111, 111, 112, 112, 112, 113, 0, 0, + 73, 73, 73, 73, 73, 74, 76, 76, 76, 76, 76, 77, 114, 115, 116, 117, + 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 37, 118, 119, 0, + 120, 120, 120, 120, 121, 122, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 18, 18, 18, 18, 18, 86, 0, 0, + 18, 18, 18, 37, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 69, 18, 69, 18, 18, 18, 18, 18, 18, 18, 0, 123, + 18, 124, 51, 18, 18, 125, 126, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127, 0, 0, 0, 128, 128, + 
128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0, + 129, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 11, 11, 4, 5, 15, 15, 8, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 130, 130, 130, 130, 130, 131, 131, 131, 131, 131, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 132, 132, 132, 132, 132, 132, 133, 0, 134, 134, 134, 134, 134, 134, 135, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 11, 11, 11, 11, 15, 15, 15, 15, 0, 0, 0, 0, +}; + +static RE_UINT8 re_all_cases_stage_4[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 1, 3, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, + 1, 1, 1, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, + 5, 6, 5, 7, 5, 5, 5, 5, 5, 5, 5, 8, 5, 5, 5, 5, + 5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, + 1, 1, 1, 1, 1, 10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 11, + 5, 5, 5, 5, 5, 12, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 0, 5, 5, 5, 5, 5, 5, 5, 13, + 14, 15, 14, 15, 14, 15, 14, 15, 16, 17, 14, 15, 14, 15, 14, 15, + 0, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, + 15, 0, 14, 15, 14, 15, 14, 15, 18, 14, 15, 14, 15, 14, 15, 19, + 20, 21, 14, 15, 14, 15, 22, 14, 15, 23, 23, 14, 15, 0, 24, 25, + 26, 14, 15, 23, 27, 28, 29, 30, 14, 15, 31, 0, 29, 32, 33, 34, + 14, 15, 14, 15, 14, 15, 35, 14, 15, 35, 0, 0, 14, 15, 35, 14, + 15, 36, 36, 14, 15, 14, 15, 37, 14, 15, 0, 0, 14, 15, 0, 38, + 0, 0, 0, 0, 39, 40, 41, 39, 40, 41, 39, 40, 41, 14, 15, 14, + 15, 14, 15, 14, 15, 42, 14, 15, 0, 39, 40, 41, 14, 15, 43, 44, + 45, 0, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 0, 0, 0, 0, + 0, 0, 46, 14, 15, 47, 48, 49, 49, 14, 15, 50, 51, 52, 14, 15, + 53, 54, 55, 56, 57, 0, 58, 58, 0, 59, 0, 60, 61, 0, 0, 0, + 58, 62, 0, 63, 0, 64, 65, 0, 66, 67, 0, 68, 69, 0, 0, 67, + 0, 70, 71, 0, 0, 72, 0, 0, 0, 0, 0, 0, 0, 
73, 0, 0, + 74, 0, 0, 74, 0, 0, 0, 75, 74, 76, 77, 77, 78, 0, 0, 0, + 0, 0, 79, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 80, 81, 0, + 0, 0, 0, 0, 0, 82, 0, 0, 14, 15, 14, 15, 0, 0, 14, 15, + 0, 0, 0, 33, 33, 33, 0, 83, 0, 0, 0, 0, 0, 0, 84, 0, + 85, 85, 85, 0, 86, 0, 87, 87, 88, 1, 89, 1, 1, 90, 1, 1, + 91, 92, 93, 1, 94, 1, 1, 1, 95, 96, 0, 97, 1, 1, 98, 1, + 1, 99, 1, 1, 100, 101, 101, 101, 102, 5, 103, 5, 5, 104, 5, 5, + 105, 106, 107, 5, 108, 5, 5, 5, 109, 110, 111, 112, 5, 5, 113, 5, + 5, 114, 5, 5, 115, 116, 116, 117, 118, 119, 0, 0, 0, 120, 121, 122, + 123, 124, 125, 126, 127, 128, 0, 14, 15, 129, 14, 15, 0, 45, 45, 45, + 130, 130, 130, 130, 130, 130, 130, 130, 131, 131, 131, 131, 131, 131, 131, 131, + 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 14, 15, 14, 15, 14, 15, + 132, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 133, + 0, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, + 134, 134, 134, 134, 134, 134, 134, 0, 0, 135, 135, 135, 135, 135, 135, 135, + 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 0, + 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 0, 136, + 0, 0, 0, 0, 0, 136, 0, 0, 137, 137, 137, 137, 137, 137, 137, 137, + 117, 117, 117, 117, 117, 117, 0, 0, 122, 122, 122, 122, 122, 122, 0, 0, + 0, 138, 0, 0, 0, 139, 0, 0, 140, 141, 14, 15, 14, 15, 14, 15, + 14, 15, 14, 15, 14, 15, 0, 0, 0, 0, 0, 142, 0, 0, 143, 0, + 117, 117, 117, 117, 117, 117, 117, 117, 122, 122, 122, 122, 122, 122, 122, 122, + 0, 117, 0, 117, 0, 117, 0, 117, 0, 122, 0, 122, 0, 122, 0, 122, + 144, 144, 145, 145, 145, 145, 146, 146, 147, 147, 148, 148, 149, 149, 0, 0, + 117, 117, 0, 150, 0, 0, 0, 0, 122, 122, 151, 151, 152, 0, 153, 0, + 0, 0, 0, 150, 0, 0, 0, 0, 154, 154, 154, 154, 152, 0, 0, 0, + 117, 117, 0, 155, 0, 0, 0, 0, 122, 122, 156, 156, 0, 0, 0, 0, + 117, 117, 0, 157, 0, 125, 0, 0, 122, 122, 158, 158, 129, 0, 0, 0, + 159, 159, 160, 160, 152, 0, 0, 0, 0, 0, 0, 0, 0, 0, 161, 0, + 0, 0, 162, 163, 0, 0, 0, 0, 0, 
0, 164, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 165, 0, 166, 166, 166, 166, 166, 166, 166, 166, + 167, 167, 167, 167, 167, 167, 167, 167, 0, 0, 0, 14, 15, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 168, 168, 168, 168, 168, 168, 168, 168, 168, 168, + 169, 169, 169, 169, 169, 169, 169, 169, 169, 169, 0, 0, 0, 0, 0, 0, + 14, 15, 170, 171, 172, 173, 174, 14, 15, 14, 15, 14, 15, 175, 176, 177, + 178, 0, 14, 15, 0, 14, 15, 0, 0, 0, 0, 0, 0, 0, 179, 179, + 0, 0, 0, 14, 15, 14, 15, 0, 0, 0, 14, 15, 0, 0, 0, 0, + 180, 180, 180, 180, 180, 180, 180, 180, 180, 180, 180, 180, 180, 180, 0, 180, + 0, 0, 0, 0, 0, 180, 0, 0, 0, 14, 15, 14, 15, 181, 14, 15, + 0, 0, 0, 14, 15, 182, 0, 0, 14, 15, 183, 184, 185, 186, 0, 0, + 187, 188, 189, 190, 14, 15, 14, 15, 0, 0, 0, 191, 0, 0, 0, 0, + 192, 192, 192, 192, 192, 192, 192, 192, 0, 0, 0, 0, 0, 14, 15, 0, + 193, 193, 193, 193, 193, 193, 193, 193, 194, 194, 194, 194, 194, 194, 194, 194, + 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 0, 0, 0, 0, 0, + 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 0, 0, 0, 0, 0, +}; + +/* All_Cases: 2184 bytes. 
*/ + +static RE_AllCases re_all_cases_table[] = { + {{ 0, 0, 0}}, + {{ 32, 0, 0}}, + {{ 32, 232, 0}}, + {{ 32, 8415, 0}}, + {{ 32, 300, 0}}, + {{ -32, 0, 0}}, + {{ -32, 199, 0}}, + {{ -32, 8383, 0}}, + {{ -32, 268, 0}}, + {{ 743, 775, 0}}, + {{ 32, 8294, 0}}, + {{ 7615, 0, 0}}, + {{ -32, 8262, 0}}, + {{ 121, 0, 0}}, + {{ 1, 0, 0}}, + {{ -1, 0, 0}}, + {{ -199, 0, 0}}, + {{ -232, 0, 0}}, + {{ -121, 0, 0}}, + {{ -300, -268, 0}}, + {{ 195, 0, 0}}, + {{ 210, 0, 0}}, + {{ 206, 0, 0}}, + {{ 205, 0, 0}}, + {{ 79, 0, 0}}, + {{ 202, 0, 0}}, + {{ 203, 0, 0}}, + {{ 207, 0, 0}}, + {{ 97, 0, 0}}, + {{ 211, 0, 0}}, + {{ 209, 0, 0}}, + {{ 163, 0, 0}}, + {{ 213, 0, 0}}, + {{ 130, 0, 0}}, + {{ 214, 0, 0}}, + {{ 218, 0, 0}}, + {{ 217, 0, 0}}, + {{ 219, 0, 0}}, + {{ 56, 0, 0}}, + {{ 1, 2, 0}}, + {{ -1, 1, 0}}, + {{ -2, -1, 0}}, + {{ -79, 0, 0}}, + {{ -97, 0, 0}}, + {{ -56, 0, 0}}, + {{ -130, 0, 0}}, + {{ 10795, 0, 0}}, + {{ -163, 0, 0}}, + {{ 10792, 0, 0}}, + {{ 10815, 0, 0}}, + {{ -195, 0, 0}}, + {{ 69, 0, 0}}, + {{ 71, 0, 0}}, + {{ 10783, 0, 0}}, + {{ 10780, 0, 0}}, + {{ 10782, 0, 0}}, + {{ -210, 0, 0}}, + {{ -206, 0, 0}}, + {{ -205, 0, 0}}, + {{ -202, 0, 0}}, + {{ -203, 0, 0}}, + {{ 42319, 0, 0}}, + {{ 42315, 0, 0}}, + {{ -207, 0, 0}}, + {{ 42280, 0, 0}}, + {{ 42308, 0, 0}}, + {{ -209, 0, 0}}, + {{ -211, 0, 0}}, + {{ 10743, 0, 0}}, + {{ 42305, 0, 0}}, + {{ 10749, 0, 0}}, + {{ -213, 0, 0}}, + {{ -214, 0, 0}}, + {{ 10727, 0, 0}}, + {{ -218, 0, 0}}, + {{ 42282, 0, 0}}, + {{ -69, 0, 0}}, + {{ -217, 0, 0}}, + {{ -71, 0, 0}}, + {{ -219, 0, 0}}, + {{ 42261, 0, 0}}, + {{ 42258, 0, 0}}, + {{ 84, 116, 7289}}, + {{ 116, 0, 0}}, + {{ 38, 0, 0}}, + {{ 37, 0, 0}}, + {{ 64, 0, 0}}, + {{ 63, 0, 0}}, + {{ 7235, 0, 0}}, + {{ 32, 62, 0}}, + {{ 32, 96, 0}}, + {{ 32, 57, 92}}, + {{ -84, 32, 7205}}, + {{ 32, 86, 0}}, + {{ -743, 32, 0}}, + {{ 32, 54, 0}}, + {{ 32, 80, 0}}, + {{ 31, 32, 0}}, + {{ 32, 47, 0}}, + {{ 32, 7549, 0}}, + {{ -38, 0, 0}}, + {{ -37, 0, 0}}, + {{ 7219, 0, 0}}, + {{ -32, 30, 0}}, + 
{{ -32, 64, 0}}, + {{ -32, 25, 60}}, + {{ -116, -32, 7173}}, + {{ -32, 54, 0}}, + {{ -775, -32, 0}}, + {{ -32, 22, 0}}, + {{ -32, 48, 0}}, + {{ -31, 1, 0}}, + {{ -32, -1, 0}}, + {{ -32, 15, 0}}, + {{ -32, 7517, 0}}, + {{ -64, 0, 0}}, + {{ -63, 0, 0}}, + {{ 8, 0, 0}}, + {{ -62, -30, 0}}, + {{ -57, -25, 35}}, + {{ -47, -15, 0}}, + {{ -54, -22, 0}}, + {{ -8, 0, 0}}, + {{ -86, -54, 0}}, + {{ -80, -48, 0}}, + {{ 7, 0, 0}}, + {{ -116, 0, 0}}, + {{ -92, -60, -35}}, + {{ -96, -64, 0}}, + {{ -7, 0, 0}}, + {{ 80, 0, 0}}, + {{ -80, 0, 0}}, + {{ 15, 0, 0}}, + {{ -15, 0, 0}}, + {{ 48, 0, 0}}, + {{ -48, 0, 0}}, + {{ 7264, 0, 0}}, + {{ 38864, 0, 0}}, + {{ 35332, 0, 0}}, + {{ 3814, 0, 0}}, + {{ 1, 59, 0}}, + {{ -1, 58, 0}}, + {{ -59, -58, 0}}, + {{ -7615, 0, 0}}, + {{ 74, 0, 0}}, + {{ 86, 0, 0}}, + {{ 100, 0, 0}}, + {{ 128, 0, 0}}, + {{ 112, 0, 0}}, + {{ 126, 0, 0}}, + {{ 9, 0, 0}}, + {{ -74, 0, 0}}, + {{ -9, 0, 0}}, + {{ -7289, -7205, -7173}}, + {{ -86, 0, 0}}, + {{ -7235, 0, 0}}, + {{ -100, 0, 0}}, + {{ -7219, 0, 0}}, + {{ -112, 0, 0}}, + {{ -128, 0, 0}}, + {{ -126, 0, 0}}, + {{ -7549, -7517, 0}}, + {{ -8415, -8383, 0}}, + {{ -8294, -8262, 0}}, + {{ 28, 0, 0}}, + {{ -28, 0, 0}}, + {{ 16, 0, 0}}, + {{ -16, 0, 0}}, + {{ 26, 0, 0}}, + {{ -26, 0, 0}}, + {{-10743, 0, 0}}, + {{ -3814, 0, 0}}, + {{-10727, 0, 0}}, + {{-10795, 0, 0}}, + {{-10792, 0, 0}}, + {{-10780, 0, 0}}, + {{-10749, 0, 0}}, + {{-10783, 0, 0}}, + {{-10782, 0, 0}}, + {{-10815, 0, 0}}, + {{ -7264, 0, 0}}, + {{-35332, 0, 0}}, + {{-42280, 0, 0}}, + {{-42308, 0, 0}}, + {{-42319, 0, 0}}, + {{-42315, 0, 0}}, + {{-42305, 0, 0}}, + {{-42258, 0, 0}}, + {{-42282, 0, 0}}, + {{-42261, 0, 0}}, + {{ 928, 0, 0}}, + {{ -928, 0, 0}}, + {{-38864, 0, 0}}, + {{ 40, 0, 0}}, + {{ -40, 0, 0}}, +}; + +/* All_Cases: 2340 bytes. 
*/ + +int re_get_all_cases(RE_UINT32 ch, RE_UINT32* codepoints) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + RE_AllCases* all_cases; + int count; + + f = ch >> 13; + code = ch ^ (f << 13); + pos = (RE_UINT32)re_all_cases_stage_1[f] << 5; + f = code >> 8; + code ^= f << 8; + pos = (RE_UINT32)re_all_cases_stage_2[pos + f] << 5; + f = code >> 3; + code ^= f << 3; + pos = (RE_UINT32)re_all_cases_stage_3[pos + f] << 3; + value = re_all_cases_stage_4[pos + code]; + + all_cases = &re_all_cases_table[value]; + + codepoints[0] = ch; + count = 1; + + while (count < RE_MAX_CASES && all_cases->diffs[count - 1] != 0) { + codepoints[count] = (RE_UINT32)((RE_INT32)ch + all_cases->diffs[count - + 1]); + ++count; + } + + return count; +} + +/* Simple_Case_Folding. */ + +static RE_UINT8 re_simple_case_folding_stage_1[] = { + 0, 1, 2, 2, 2, 3, 2, 4, 5, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, +}; + +static RE_UINT8 re_simple_case_folding_stage_2[] = { + 0, 1, 2, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 7, 6, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 9, 10, + 6, 11, 6, 6, 12, 6, 6, 6, 6, 6, 6, 6, 13, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 14, 15, 6, 6, 6, 16, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 17, + 6, 6, 6, 6, 18, 6, 6, 6, 6, 6, 6, 6, 19, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 20, 6, 6, 6, 6, 6, 6, 6, +}; + +static RE_UINT8 re_simple_case_folding_stage_3[] = 
{ + 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 3, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 4, 0, 2, 2, 5, 5, 0, 0, 0, 0, + 6, 6, 6, 6, 6, 6, 7, 8, 8, 7, 6, 6, 6, 6, 6, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 8, 20, 6, 6, 21, 6, + 6, 6, 6, 6, 22, 6, 23, 24, 25, 6, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 26, 0, 0, 0, 0, 0, 27, 28, + 29, 30, 1, 2, 31, 32, 0, 0, 33, 34, 35, 6, 6, 6, 36, 37, + 38, 38, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 6, 6, 6, 6, + 39, 7, 6, 6, 6, 6, 6, 6, 40, 41, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 42, 43, 43, 43, 44, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 45, 45, 45, 45, 46, 47, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 48, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 49, 50, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 0, 51, 0, 48, 0, 51, 0, 51, 0, 48, 0, 52, 0, 51, 0, 0, + 0, 51, 0, 51, 0, 51, 0, 53, 0, 54, 0, 55, 0, 56, 0, 57, + 0, 0, 0, 0, 58, 59, 60, 0, 0, 0, 0, 0, 61, 61, 0, 0, + 62, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 63, 64, 64, 64, 0, 0, 0, 0, 0, 0, + 43, 43, 43, 43, 43, 44, 0, 0, 0, 0, 0, 0, 65, 66, 67, 68, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 23, 69, 33, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 6, 6, 6, 6, 6, 49, 0, 0, + 6, 6, 6, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 7, 6, 7, 6, 6, 6, 6, 6, 6, 6, 0, 70, + 6, 71, 27, 6, 6, 72, 73, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 74, 74, + 74, 74, 74, 74, 74, 74, 74, 74, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 2, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 75, 75, 75, 75, 75, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 76, 76, 76, 76, 76, 76, 77, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +static RE_UINT8 re_simple_case_folding_stage_4[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, + 3, 0, 3, 0, 3, 0, 3, 0, 0, 0, 3, 0, 3, 0, 3, 0, + 0, 3, 0, 3, 0, 3, 0, 3, 4, 3, 0, 3, 0, 3, 0, 5, + 0, 6, 3, 0, 3, 0, 7, 3, 0, 8, 8, 3, 0, 0, 9, 10, + 11, 3, 0, 8, 12, 0, 13, 14, 3, 0, 0, 0, 13, 15, 0, 16, + 3, 0, 3, 0, 3, 0, 17, 3, 0, 17, 0, 0, 3, 0, 17, 3, + 0, 18, 18, 3, 0, 3, 0, 19, 3, 0, 0, 0, 3, 0, 0, 0, + 0, 0, 0, 0, 20, 3, 0, 20, 3, 0, 20, 3, 0, 3, 0, 3, + 0, 3, 0, 3, 0, 0, 3, 0, 0, 20, 3, 0, 3, 0, 21, 22, + 23, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 0, 0, 0, 0, + 0, 0, 24, 3, 0, 25, 26, 0, 0, 3, 0, 27, 28, 29, 3, 0, + 0, 0, 0, 0, 0, 30, 0, 0, 3, 0, 3, 0, 0, 0, 3, 0, + 0, 0, 0, 0, 0, 0, 0, 30, 0, 0, 0, 0, 0, 0, 31, 0, + 32, 32, 32, 0, 33, 0, 34, 34, 1, 1, 0, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 35, 36, 37, 0, 0, 0, 38, 39, 0, + 40, 41, 0, 0, 42, 43, 0, 3, 0, 44, 3, 0, 0, 23, 23, 23, + 45, 45, 45, 45, 45, 45, 45, 45, 3, 0, 0, 0, 0, 0, 0, 0, + 46, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 0, + 0, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 0, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 0, 48, 0, 0, 0, 0, 0, 48, 0, 0, + 49, 49, 49, 49, 49, 49, 0, 0, 3, 0, 3, 0, 3, 0, 0, 0, + 0, 0, 0, 50, 0, 0, 51, 0, 49, 49, 49, 49, 49, 49, 49, 49, + 0, 49, 0, 49, 0, 49, 0, 49, 49, 49, 52, 52, 53, 0, 54, 0, + 55, 55, 55, 55, 53, 0, 0, 0, 49, 49, 56, 56, 0, 0, 0, 0, + 49, 49, 57, 57, 44, 0, 0, 0, 58, 58, 59, 59, 53, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 60, 0, 0, 0, 61, 62, 0, 0, 0, 0, + 0, 0, 63, 0, 0, 0, 0, 0, 64, 64, 64, 64, 64, 64, 64, 64, + 0, 0, 0, 3, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 65, 65, + 65, 65, 65, 65, 65, 65, 65, 65, 3, 0, 66, 67, 68, 0, 0, 3, + 0, 3, 0, 3, 0, 69, 70, 71, 72, 0, 3, 0, 0, 3, 0, 0, + 0, 0, 0, 0, 0, 0, 73, 73, 0, 0, 0, 3, 0, 3, 0, 0, + 0, 3, 0, 3, 0, 74, 3, 0, 0, 0, 0, 3, 0, 75, 0, 0, + 3, 0, 76, 77, 78, 79, 0, 0, 80, 81, 82, 83, 3, 0, 3, 0, + 84, 84, 84, 84, 84, 84, 84, 84, 85, 85, 85, 85, 85, 85, 85, 85, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 0, 0, 0, 0, 0, +}; + +/* Simple_Case_Folding: 1624 bytes. */ + +static RE_INT32 re_simple_case_folding_table[] = { + 0, + 32, + 775, + 1, + -121, + -268, + 210, + 206, + 205, + 79, + 202, + 203, + 207, + 211, + 209, + 213, + 214, + 218, + 217, + 219, + 2, + -97, + -56, + -130, + 10795, + -163, + 10792, + -195, + 69, + 71, + 116, + 38, + 37, + 64, + 63, + 8, + -30, + -25, + -15, + -22, + -54, + -48, + -60, + -64, + -7, + 80, + 15, + 48, + 7264, + -8, + -58, + -7615, + -74, + -9, + -7173, + -86, + -100, + -112, + -128, + -126, + -7517, + -8383, + -8262, + 28, + 16, + 26, + -10743, + -3814, + -10727, + -10780, + -10749, + -10783, + -10782, + -10815, + -35332, + -42280, + -42308, + -42319, + -42315, + -42305, + -42258, + -42282, + -42261, + 928, + -38864, + 40, +}; + +/* Simple_Case_Folding: 344 bytes. */ + +RE_UINT32 re_get_simple_case_folding(RE_UINT32 ch) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + RE_INT32 diff; + + f = ch >> 13; + code = ch ^ (f << 13); + pos = (RE_UINT32)re_simple_case_folding_stage_1[f] << 5; + f = code >> 8; + code ^= f << 8; + pos = (RE_UINT32)re_simple_case_folding_stage_2[pos + f] << 5; + f = code >> 3; + code ^= f << 3; + pos = (RE_UINT32)re_simple_case_folding_stage_3[pos + f] << 3; + value = re_simple_case_folding_stage_4[pos + code]; + + diff = re_simple_case_folding_table[value]; + + return (RE_UINT32)((RE_INT32)ch + diff); +} + +/* Full_Case_Folding. 
*/ + +static RE_UINT8 re_full_case_folding_stage_1[] = { + 0, 1, 2, 2, 2, 3, 2, 4, 5, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, +}; + +static RE_UINT8 re_full_case_folding_stage_2[] = { + 0, 1, 2, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 7, 6, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 9, 10, + 6, 11, 6, 6, 12, 6, 6, 6, 6, 6, 6, 6, 13, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 14, 15, 6, 6, 6, 16, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 17, 6, 6, 6, 18, + 6, 6, 6, 6, 19, 6, 6, 6, 6, 6, 6, 6, 20, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 21, 6, 6, 6, 6, 6, 6, 6, +}; + +static RE_UINT8 re_full_case_folding_stage_3[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 3, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 4, 0, 2, 2, 5, 6, 0, 0, 0, 0, + 7, 7, 7, 7, 7, 7, 8, 9, 9, 10, 7, 7, 7, 7, 7, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 9, 22, 7, 7, 23, 7, + 7, 7, 7, 7, 24, 7, 25, 26, 27, 7, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 28, 0, 0, 0, 0, 0, 29, 30, + 31, 32, 33, 2, 34, 35, 36, 0, 37, 38, 39, 7, 7, 7, 40, 41, + 42, 42, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 7, 7, 7, 7, + 43, 44, 7, 7, 7, 7, 7, 7, 45, 46, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 47, 48, 48, 48, 49, 0, 0, 0, 0, 0, + 50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, + 0, 0, 0, 0, 51, 51, 51, 51, 52, 53, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 54, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 55, 56, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 0, 57, 0, 54, 0, 57, 0, 57, 0, 54, 58, 59, 0, 57, 0, 0, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, + 0, 0, 0, 0, 76, 77, 78, 0, 0, 0, 0, 0, 79, 79, 0, 0, + 80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 81, 82, 82, 82, 0, 0, 0, 0, 0, 0, + 48, 48, 48, 48, 48, 49, 0, 0, 0, 0, 0, 0, 83, 84, 85, 86, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 25, 87, 37, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 7, 7, 7, 7, 7, 88, 0, 0, + 7, 7, 7, 25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 44, 7, 44, 7, 7, 7, 7, 7, 7, 7, 0, 89, + 7, 90, 29, 7, 7, 91, 92, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 93, 93, + 93, 93, 93, 93, 93, 93, 93, 93, 0, 0, 0, 0, 0, 0, 0, 0, + 94, 0, 95, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 2, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 96, 96, 96, 96, 96, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 97, 97, 97, 97, 97, 97, 98, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +static RE_UINT8 re_full_case_folding_stage_4[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, + 1, 1, 1, 1, 1, 1, 1, 3, 4, 0, 4, 0, 4, 0, 4, 0, + 5, 0, 4, 0, 4, 0, 4, 0, 0, 4, 0, 4, 0, 4, 0, 4, + 0, 6, 4, 0, 4, 0, 4, 0, 7, 4, 0, 4, 0, 4, 0, 8, + 0, 9, 4, 0, 4, 0, 10, 4, 0, 11, 11, 4, 0, 0, 12, 13, + 14, 4, 0, 11, 15, 0, 16, 17, 4, 0, 0, 
0, 16, 18, 0, 19, + 4, 0, 4, 0, 4, 0, 20, 4, 0, 20, 0, 0, 4, 0, 20, 4, + 0, 21, 21, 4, 0, 4, 0, 22, 4, 0, 0, 0, 4, 0, 0, 0, + 0, 0, 0, 0, 23, 4, 0, 23, 4, 0, 23, 4, 0, 4, 0, 4, + 0, 4, 0, 4, 0, 0, 4, 0, 24, 23, 4, 0, 4, 0, 25, 26, + 27, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 0, 0, 0, 0, + 0, 0, 28, 4, 0, 29, 30, 0, 0, 4, 0, 31, 32, 33, 4, 0, + 0, 0, 0, 0, 0, 34, 0, 0, 4, 0, 4, 0, 0, 0, 4, 0, + 0, 0, 0, 0, 0, 0, 0, 34, 0, 0, 0, 0, 0, 0, 35, 0, + 36, 36, 36, 0, 37, 0, 38, 38, 39, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, + 40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 41, 42, 43, 0, 0, 0, 44, 45, 0, + 46, 47, 0, 0, 48, 49, 0, 4, 0, 50, 4, 0, 0, 27, 27, 27, + 51, 51, 51, 51, 51, 51, 51, 51, 4, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 4, 0, 4, 0, 4, 0, 52, 4, 0, 4, 0, 4, 0, 4, + 0, 4, 0, 4, 0, 4, 0, 0, 0, 53, 53, 53, 53, 53, 53, 53, + 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 0, + 0, 0, 0, 0, 0, 0, 0, 54, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 0, 55, 0, 0, 0, 0, 0, 55, 0, 0, + 56, 56, 56, 56, 56, 56, 0, 0, 4, 0, 4, 0, 4, 0, 57, 58, + 59, 60, 61, 62, 0, 0, 63, 0, 56, 56, 56, 56, 56, 56, 56, 56, + 64, 0, 65, 0, 66, 0, 67, 0, 0, 56, 0, 56, 0, 56, 0, 56, + 68, 68, 68, 68, 68, 68, 68, 68, 69, 69, 69, 69, 69, 69, 69, 69, + 70, 70, 70, 70, 70, 70, 70, 70, 71, 71, 71, 71, 71, 71, 71, 71, + 72, 72, 72, 72, 72, 72, 72, 72, 73, 73, 73, 73, 73, 73, 73, 73, + 0, 0, 74, 75, 76, 0, 77, 78, 56, 56, 79, 79, 80, 0, 81, 0, + 0, 0, 82, 83, 84, 0, 85, 86, 87, 87, 87, 87, 88, 0, 0, 0, + 0, 0, 89, 90, 0, 0, 91, 92, 56, 56, 93, 93, 0, 0, 0, 0, + 0, 0, 94, 95, 96, 0, 97, 98, 56, 56, 99, 99, 50, 0, 0, 0, + 0, 0, 100, 101, 102, 0, 103, 104, 105, 105, 106, 106, 107, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 108, 0, 0, 0, 109, 110, 0, 0, 0, 0, + 0, 0, 111, 0, 0, 0, 0, 0, 112, 112, 112, 112, 112, 112, 112, 112, + 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 113, 113, + 113, 113, 113, 113, 113, 113, 113, 113, 4, 0, 114, 115, 116, 0, 0, 4, + 
0, 4, 0, 4, 0, 117, 118, 119, 120, 0, 4, 0, 0, 4, 0, 0, + 0, 0, 0, 0, 0, 0, 121, 121, 0, 0, 0, 4, 0, 4, 0, 0, + 4, 0, 4, 0, 4, 0, 0, 0, 0, 4, 0, 4, 0, 122, 4, 0, + 0, 0, 0, 4, 0, 123, 0, 0, 4, 0, 124, 125, 126, 127, 0, 0, + 128, 129, 130, 131, 4, 0, 4, 0, 132, 132, 132, 132, 132, 132, 132, 132, + 133, 134, 135, 136, 137, 138, 139, 0, 0, 0, 0, 140, 141, 142, 143, 144, + 145, 145, 145, 145, 145, 145, 145, 145, 37, 37, 37, 37, 37, 37, 37, 37, + 37, 37, 37, 0, 0, 0, 0, 0, +}; + +/* Full_Case_Folding: 1824 bytes. */ + +static RE_FullCaseFolding re_full_case_folding_table[] = { + { 0, { 0, 0}}, + { 32, { 0, 0}}, + { 775, { 0, 0}}, + { -108, { 115, 0}}, + { 1, { 0, 0}}, + { -199, { 775, 0}}, + { 371, { 110, 0}}, + { -121, { 0, 0}}, + { -268, { 0, 0}}, + { 210, { 0, 0}}, + { 206, { 0, 0}}, + { 205, { 0, 0}}, + { 79, { 0, 0}}, + { 202, { 0, 0}}, + { 203, { 0, 0}}, + { 207, { 0, 0}}, + { 211, { 0, 0}}, + { 209, { 0, 0}}, + { 213, { 0, 0}}, + { 214, { 0, 0}}, + { 218, { 0, 0}}, + { 217, { 0, 0}}, + { 219, { 0, 0}}, + { 2, { 0, 0}}, + { -390, { 780, 0}}, + { -97, { 0, 0}}, + { -56, { 0, 0}}, + { -130, { 0, 0}}, + { 10795, { 0, 0}}, + { -163, { 0, 0}}, + { 10792, { 0, 0}}, + { -195, { 0, 0}}, + { 69, { 0, 0}}, + { 71, { 0, 0}}, + { 116, { 0, 0}}, + { 38, { 0, 0}}, + { 37, { 0, 0}}, + { 64, { 0, 0}}, + { 63, { 0, 0}}, + { 41, { 776, 769}}, + { 21, { 776, 769}}, + { 8, { 0, 0}}, + { -30, { 0, 0}}, + { -25, { 0, 0}}, + { -15, { 0, 0}}, + { -22, { 0, 0}}, + { -54, { 0, 0}}, + { -48, { 0, 0}}, + { -60, { 0, 0}}, + { -64, { 0, 0}}, + { -7, { 0, 0}}, + { 80, { 0, 0}}, + { 15, { 0, 0}}, + { 48, { 0, 0}}, + { -34, {1410, 0}}, + { 7264, { 0, 0}}, + { -8, { 0, 0}}, + { -7726, { 817, 0}}, + { -7715, { 776, 0}}, + { -7713, { 778, 0}}, + { -7712, { 778, 0}}, + { -7737, { 702, 0}}, + { -58, { 0, 0}}, + { -7723, { 115, 0}}, + { -7051, { 787, 0}}, + { -7053, { 787, 768}}, + { -7055, { 787, 769}}, + { -7057, { 787, 834}}, + { -128, { 953, 0}}, + { -136, { 953, 0}}, + { -112, { 953, 0}}, + { 
-120, { 953, 0}}, + { -64, { 953, 0}}, + { -72, { 953, 0}}, + { -66, { 953, 0}}, + { -7170, { 953, 0}}, + { -7176, { 953, 0}}, + { -7173, { 834, 0}}, + { -7174, { 834, 953}}, + { -74, { 0, 0}}, + { -7179, { 953, 0}}, + { -7173, { 0, 0}}, + { -78, { 953, 0}}, + { -7180, { 953, 0}}, + { -7190, { 953, 0}}, + { -7183, { 834, 0}}, + { -7184, { 834, 953}}, + { -86, { 0, 0}}, + { -7189, { 953, 0}}, + { -7193, { 776, 768}}, + { -7194, { 776, 769}}, + { -7197, { 834, 0}}, + { -7198, { 776, 834}}, + { -100, { 0, 0}}, + { -7197, { 776, 768}}, + { -7198, { 776, 769}}, + { -7203, { 787, 0}}, + { -7201, { 834, 0}}, + { -7202, { 776, 834}}, + { -112, { 0, 0}}, + { -118, { 953, 0}}, + { -7210, { 953, 0}}, + { -7206, { 953, 0}}, + { -7213, { 834, 0}}, + { -7214, { 834, 953}}, + { -128, { 0, 0}}, + { -126, { 0, 0}}, + { -7219, { 953, 0}}, + { -7517, { 0, 0}}, + { -8383, { 0, 0}}, + { -8262, { 0, 0}}, + { 28, { 0, 0}}, + { 16, { 0, 0}}, + { 26, { 0, 0}}, + {-10743, { 0, 0}}, + { -3814, { 0, 0}}, + {-10727, { 0, 0}}, + {-10780, { 0, 0}}, + {-10749, { 0, 0}}, + {-10783, { 0, 0}}, + {-10782, { 0, 0}}, + {-10815, { 0, 0}}, + {-35332, { 0, 0}}, + {-42280, { 0, 0}}, + {-42308, { 0, 0}}, + {-42319, { 0, 0}}, + {-42315, { 0, 0}}, + {-42305, { 0, 0}}, + {-42258, { 0, 0}}, + {-42282, { 0, 0}}, + {-42261, { 0, 0}}, + { 928, { 0, 0}}, + {-38864, { 0, 0}}, + {-64154, { 102, 0}}, + {-64155, { 105, 0}}, + {-64156, { 108, 0}}, + {-64157, { 102, 105}}, + {-64158, { 102, 108}}, + {-64146, { 116, 0}}, + {-64147, { 116, 0}}, + {-62879, {1398, 0}}, + {-62880, {1381, 0}}, + {-62881, {1387, 0}}, + {-62872, {1398, 0}}, + {-62883, {1389, 0}}, + { 40, { 0, 0}}, +}; + +/* Full_Case_Folding: 1168 bytes. 
*/ + +int re_get_full_case_folding(RE_UINT32 ch, RE_UINT32* codepoints) { + RE_UINT32 code; + RE_UINT32 f; + RE_UINT32 pos; + RE_UINT32 value; + RE_FullCaseFolding* case_folding; + int count; + + f = ch >> 13; + code = ch ^ (f << 13); + pos = (RE_UINT32)re_full_case_folding_stage_1[f] << 5; + f = code >> 8; + code ^= f << 8; + pos = (RE_UINT32)re_full_case_folding_stage_2[pos + f] << 5; + f = code >> 3; + code ^= f << 3; + pos = (RE_UINT32)re_full_case_folding_stage_3[pos + f] << 3; + value = re_full_case_folding_stage_4[pos + code]; + + case_folding = &re_full_case_folding_table[value]; + + codepoints[0] = (RE_UINT32)((RE_INT32)ch + case_folding->diff); + count = 1; + + while (count < RE_MAX_FOLDED && case_folding->codepoints[count - 1] != 0) { + codepoints[count] = case_folding->codepoints[count - 1]; + ++count; + } + + return count; +} + +/* Property function table. */ + +RE_GetPropertyFunc re_get_property[] = { + re_get_general_category, + re_get_block, + re_get_script, + re_get_word_break, + re_get_grapheme_cluster_break, + re_get_sentence_break, + re_get_math, + re_get_alphabetic, + re_get_lowercase, + re_get_uppercase, + re_get_cased, + re_get_case_ignorable, + re_get_changes_when_lowercased, + re_get_changes_when_uppercased, + re_get_changes_when_titlecased, + re_get_changes_when_casefolded, + re_get_changes_when_casemapped, + re_get_id_start, + re_get_id_continue, + re_get_xid_start, + re_get_xid_continue, + re_get_default_ignorable_code_point, + re_get_grapheme_extend, + re_get_grapheme_base, + re_get_grapheme_link, + re_get_white_space, + re_get_bidi_control, + re_get_join_control, + re_get_dash, + re_get_hyphen, + re_get_quotation_mark, + re_get_terminal_punctuation, + re_get_other_math, + re_get_hex_digit, + re_get_ascii_hex_digit, + re_get_other_alphabetic, + re_get_ideographic, + re_get_diacritic, + re_get_extender, + re_get_other_lowercase, + re_get_other_uppercase, + re_get_noncharacter_code_point, + re_get_other_grapheme_extend, + 
re_get_ids_binary_operator, + re_get_ids_trinary_operator, + re_get_radical, + re_get_unified_ideograph, + re_get_other_default_ignorable_code_point, + re_get_deprecated, + re_get_soft_dotted, + re_get_logical_order_exception, + re_get_other_id_start, + re_get_other_id_continue, + re_get_sterm, + re_get_variation_selector, + re_get_pattern_white_space, + re_get_pattern_syntax, + re_get_hangul_syllable_type, + re_get_bidi_class, + re_get_canonical_combining_class, + re_get_decomposition_type, + re_get_east_asian_width, + re_get_joining_group, + re_get_joining_type, + re_get_line_break, + re_get_numeric_type, + re_get_numeric_value, + re_get_bidi_mirrored, + re_get_indic_positional_category, + re_get_indic_syllabic_category, + re_get_alphanumeric, + re_get_any, + re_get_blank, + re_get_graph, + re_get_print, + re_get_word, + re_get_xdigit, + re_get_posix_digit, + re_get_posix_alnum, + re_get_posix_punct, + re_get_posix_xdigit, +}; diff --git a/lib/regex/_regex_unicode.h b/lib/regex/_regex_unicode.h new file mode 100644 index 0000000000000000000000000000000000000000..0d2fd62db8bc86e4a27bee46edc6f1df637bc64d --- /dev/null +++ b/lib/regex/_regex_unicode.h @@ -0,0 +1,226 @@ +typedef unsigned char RE_UINT8; +typedef signed char RE_INT8; +typedef unsigned short RE_UINT16; +typedef signed short RE_INT16; +typedef unsigned int RE_UINT32; +typedef signed int RE_INT32; + +typedef unsigned char BOOL; +enum {FALSE, TRUE}; + +#define RE_ASCII_MAX 0x7F +#define RE_LOCALE_MAX 0xFF +#define RE_UNICODE_MAX 0x10FFFF + +#define RE_MAX_CASES 4 +#define RE_MAX_FOLDED 3 + +typedef struct RE_Property { + RE_UINT16 name; + RE_UINT8 id; + RE_UINT8 value_set; +} RE_Property; + +typedef struct RE_PropertyValue { + RE_UINT16 name; + RE_UINT8 value_set; + RE_UINT16 id; +} RE_PropertyValue; + +typedef RE_UINT32 (*RE_GetPropertyFunc)(RE_UINT32 ch); + +#define RE_PROP_GC 0x0 +#define RE_PROP_CASED 0xA +#define RE_PROP_UPPERCASE 0x9 +#define RE_PROP_LOWERCASE 0x8 + +#define RE_PROP_C 30 +#define 
RE_PROP_L 31 +#define RE_PROP_M 32 +#define RE_PROP_N 33 +#define RE_PROP_P 34 +#define RE_PROP_S 35 +#define RE_PROP_Z 36 +#define RE_PROP_ASSIGNED 38 +#define RE_PROP_CASEDLETTER 37 + +#define RE_PROP_CN 0 +#define RE_PROP_LU 1 +#define RE_PROP_LL 2 +#define RE_PROP_LT 3 +#define RE_PROP_LM 4 +#define RE_PROP_LO 5 +#define RE_PROP_MN 6 +#define RE_PROP_ME 7 +#define RE_PROP_MC 8 +#define RE_PROP_ND 9 +#define RE_PROP_NL 10 +#define RE_PROP_NO 11 +#define RE_PROP_ZS 12 +#define RE_PROP_ZL 13 +#define RE_PROP_ZP 14 +#define RE_PROP_CC 15 +#define RE_PROP_CF 16 +#define RE_PROP_CO 17 +#define RE_PROP_CS 18 +#define RE_PROP_PD 19 +#define RE_PROP_PS 20 +#define RE_PROP_PE 21 +#define RE_PROP_PC 22 +#define RE_PROP_PO 23 +#define RE_PROP_SM 24 +#define RE_PROP_SC 25 +#define RE_PROP_SK 26 +#define RE_PROP_SO 27 +#define RE_PROP_PI 28 +#define RE_PROP_PF 29 + +#define RE_PROP_C_MASK 0x00078001 +#define RE_PROP_L_MASK 0x0000003E +#define RE_PROP_M_MASK 0x000001C0 +#define RE_PROP_N_MASK 0x00000E00 +#define RE_PROP_P_MASK 0x30F80000 +#define RE_PROP_S_MASK 0x0F000000 +#define RE_PROP_Z_MASK 0x00007000 + +#define RE_PROP_ALNUM 0x460001 +#define RE_PROP_ALPHA 0x070001 +#define RE_PROP_ANY 0x470001 +#define RE_PROP_ASCII 0x010001 +#define RE_PROP_BLANK 0x480001 +#define RE_PROP_CNTRL 0x00000F +#define RE_PROP_DIGIT 0x000009 +#define RE_PROP_GRAPH 0x490001 +#define RE_PROP_LOWER 0x080001 +#define RE_PROP_PRINT 0x4A0001 +#define RE_PROP_SPACE 0x190001 +#define RE_PROP_UPPER 0x090001 +#define RE_PROP_WORD 0x4B0001 +#define RE_PROP_XDIGIT 0x4C0001 +#define RE_PROP_POSIX_ALNUM 0x4E0001 +#define RE_PROP_POSIX_DIGIT 0x4D0001 +#define RE_PROP_POSIX_PUNCT 0x4F0001 +#define RE_PROP_POSIX_XDIGIT 0x500001 + +#define RE_BREAK_OTHER 0 +#define RE_BREAK_DOUBLEQUOTE 1 +#define RE_BREAK_SINGLEQUOTE 2 +#define RE_BREAK_HEBREWLETTER 3 +#define RE_BREAK_CR 4 +#define RE_BREAK_LF 5 +#define RE_BREAK_NEWLINE 6 +#define RE_BREAK_EXTEND 7 +#define RE_BREAK_REGIONALINDICATOR 8 +#define 
RE_BREAK_FORMAT 9 +#define RE_BREAK_KATAKANA 10 +#define RE_BREAK_ALETTER 11 +#define RE_BREAK_MIDLETTER 12 +#define RE_BREAK_MIDNUM 13 +#define RE_BREAK_MIDNUMLET 14 +#define RE_BREAK_NUMERIC 15 +#define RE_BREAK_EXTENDNUMLET 16 + +#define RE_GBREAK_OTHER 0 +#define RE_GBREAK_CR 1 +#define RE_GBREAK_LF 2 +#define RE_GBREAK_CONTROL 3 +#define RE_GBREAK_EXTEND 4 +#define RE_GBREAK_REGIONALINDICATOR 5 +#define RE_GBREAK_SPACINGMARK 6 +#define RE_GBREAK_L 7 +#define RE_GBREAK_V 8 +#define RE_GBREAK_T 9 +#define RE_GBREAK_LV 10 +#define RE_GBREAK_LVT 11 +#define RE_GBREAK_PREPEND 12 + +extern char* re_strings[1296]; +extern RE_Property re_properties[147]; +extern RE_PropertyValue re_property_values[1412]; +extern RE_UINT16 re_expand_on_folding[104]; +extern RE_GetPropertyFunc re_get_property[81]; + +RE_UINT32 re_get_general_category(RE_UINT32 ch); +RE_UINT32 re_get_block(RE_UINT32 ch); +RE_UINT32 re_get_script(RE_UINT32 ch); +RE_UINT32 re_get_word_break(RE_UINT32 ch); +RE_UINT32 re_get_grapheme_cluster_break(RE_UINT32 ch); +RE_UINT32 re_get_sentence_break(RE_UINT32 ch); +RE_UINT32 re_get_math(RE_UINT32 ch); +RE_UINT32 re_get_alphabetic(RE_UINT32 ch); +RE_UINT32 re_get_lowercase(RE_UINT32 ch); +RE_UINT32 re_get_uppercase(RE_UINT32 ch); +RE_UINT32 re_get_cased(RE_UINT32 ch); +RE_UINT32 re_get_case_ignorable(RE_UINT32 ch); +RE_UINT32 re_get_changes_when_lowercased(RE_UINT32 ch); +RE_UINT32 re_get_changes_when_uppercased(RE_UINT32 ch); +RE_UINT32 re_get_changes_when_titlecased(RE_UINT32 ch); +RE_UINT32 re_get_changes_when_casefolded(RE_UINT32 ch); +RE_UINT32 re_get_changes_when_casemapped(RE_UINT32 ch); +RE_UINT32 re_get_id_start(RE_UINT32 ch); +RE_UINT32 re_get_id_continue(RE_UINT32 ch); +RE_UINT32 re_get_xid_start(RE_UINT32 ch); +RE_UINT32 re_get_xid_continue(RE_UINT32 ch); +RE_UINT32 re_get_default_ignorable_code_point(RE_UINT32 ch); +RE_UINT32 re_get_grapheme_extend(RE_UINT32 ch); +RE_UINT32 re_get_grapheme_base(RE_UINT32 ch); +RE_UINT32 re_get_grapheme_link(RE_UINT32 
ch); +RE_UINT32 re_get_white_space(RE_UINT32 ch); +RE_UINT32 re_get_bidi_control(RE_UINT32 ch); +RE_UINT32 re_get_join_control(RE_UINT32 ch); +RE_UINT32 re_get_dash(RE_UINT32 ch); +RE_UINT32 re_get_hyphen(RE_UINT32 ch); +RE_UINT32 re_get_quotation_mark(RE_UINT32 ch); +RE_UINT32 re_get_terminal_punctuation(RE_UINT32 ch); +RE_UINT32 re_get_other_math(RE_UINT32 ch); +RE_UINT32 re_get_hex_digit(RE_UINT32 ch); +RE_UINT32 re_get_ascii_hex_digit(RE_UINT32 ch); +RE_UINT32 re_get_other_alphabetic(RE_UINT32 ch); +RE_UINT32 re_get_ideographic(RE_UINT32 ch); +RE_UINT32 re_get_diacritic(RE_UINT32 ch); +RE_UINT32 re_get_extender(RE_UINT32 ch); +RE_UINT32 re_get_other_lowercase(RE_UINT32 ch); +RE_UINT32 re_get_other_uppercase(RE_UINT32 ch); +RE_UINT32 re_get_noncharacter_code_point(RE_UINT32 ch); +RE_UINT32 re_get_other_grapheme_extend(RE_UINT32 ch); +RE_UINT32 re_get_ids_binary_operator(RE_UINT32 ch); +RE_UINT32 re_get_ids_trinary_operator(RE_UINT32 ch); +RE_UINT32 re_get_radical(RE_UINT32 ch); +RE_UINT32 re_get_unified_ideograph(RE_UINT32 ch); +RE_UINT32 re_get_other_default_ignorable_code_point(RE_UINT32 ch); +RE_UINT32 re_get_deprecated(RE_UINT32 ch); +RE_UINT32 re_get_soft_dotted(RE_UINT32 ch); +RE_UINT32 re_get_logical_order_exception(RE_UINT32 ch); +RE_UINT32 re_get_other_id_start(RE_UINT32 ch); +RE_UINT32 re_get_other_id_continue(RE_UINT32 ch); +RE_UINT32 re_get_sterm(RE_UINT32 ch); +RE_UINT32 re_get_variation_selector(RE_UINT32 ch); +RE_UINT32 re_get_pattern_white_space(RE_UINT32 ch); +RE_UINT32 re_get_pattern_syntax(RE_UINT32 ch); +RE_UINT32 re_get_hangul_syllable_type(RE_UINT32 ch); +RE_UINT32 re_get_bidi_class(RE_UINT32 ch); +RE_UINT32 re_get_canonical_combining_class(RE_UINT32 ch); +RE_UINT32 re_get_decomposition_type(RE_UINT32 ch); +RE_UINT32 re_get_east_asian_width(RE_UINT32 ch); +RE_UINT32 re_get_joining_group(RE_UINT32 ch); +RE_UINT32 re_get_joining_type(RE_UINT32 ch); +RE_UINT32 re_get_line_break(RE_UINT32 ch); +RE_UINT32 re_get_numeric_type(RE_UINT32 ch); 
+RE_UINT32 re_get_numeric_value(RE_UINT32 ch); +RE_UINT32 re_get_bidi_mirrored(RE_UINT32 ch); +RE_UINT32 re_get_indic_positional_category(RE_UINT32 ch); +RE_UINT32 re_get_indic_syllabic_category(RE_UINT32 ch); +RE_UINT32 re_get_alphanumeric(RE_UINT32 ch); +RE_UINT32 re_get_any(RE_UINT32 ch); +RE_UINT32 re_get_blank(RE_UINT32 ch); +RE_UINT32 re_get_graph(RE_UINT32 ch); +RE_UINT32 re_get_print(RE_UINT32 ch); +RE_UINT32 re_get_word(RE_UINT32 ch); +RE_UINT32 re_get_xdigit(RE_UINT32 ch); +RE_UINT32 re_get_posix_digit(RE_UINT32 ch); +RE_UINT32 re_get_posix_alnum(RE_UINT32 ch); +RE_UINT32 re_get_posix_punct(RE_UINT32 ch); +RE_UINT32 re_get_posix_xdigit(RE_UINT32 ch); +int re_get_all_cases(RE_UINT32 ch, RE_UINT32* codepoints); +RE_UINT32 re_get_simple_case_folding(RE_UINT32 ch); +int re_get_full_case_folding(RE_UINT32 ch, RE_UINT32* codepoints); diff --git a/lib/regex/test_regex.py b/lib/regex/test_regex.py new file mode 100644 index 0000000000000000000000000000000000000000..050c3c004d03e2eb6c2f336fffb02561da7cdf0d --- /dev/null +++ b/lib/regex/test_regex.py @@ -0,0 +1,3585 @@ +from __future__ import with_statement +import regex +import string +from weakref import proxy +import unittest +import copy +from test.test_support import run_unittest +import re + +# _AssertRaisesContext is defined here because the class doesn't exist before +# Python 2.7. 
+class _AssertRaisesContext(object): + """A context manager used to implement TestCase.assertRaises* methods.""" + + def __init__(self, expected, test_case, expected_regexp=None): + self.expected = expected + self.failureException = test_case.failureException + self.expected_regexp = expected_regexp + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, tb): + if exc_type is None: + try: + exc_name = self.expected.__name__ + except AttributeError: + exc_name = str(self.expected) + raise self.failureException( + "%s not raised" % exc_name) + if not issubclass(exc_type, self.expected): + # let unexpected exceptions pass through + return False + self.exception = exc_value # store for later retrieval + if self.expected_regexp is None: + return True + + expected_regexp = self.expected_regexp + if isinstance(expected_regexp, basestring): + expected_regexp = re.compile(expected_regexp) + if not expected_regexp.search(str(exc_value)): + raise self.failureException('"%s" does not match "%s"' % + (expected_regexp.pattern, str(exc_value))) + return True + +class RegexTests(unittest.TestCase): + PATTERN_CLASS = "<type '_regex.Pattern'>" + FLAGS_WITH_COMPILED_PAT = "cannot process flags argument with a compiled pattern" + INVALID_GROUP_REF = "invalid group reference" + MISSING_GT = "missing >" + BAD_GROUP_NAME = "bad character in group name" + MISSING_GROUP_NAME = "missing group name" + MISSING_LT = "missing <" + UNKNOWN_GROUP_I = "unknown group" + UNKNOWN_GROUP = "unknown group" + BAD_ESCAPE = r"bad escape \(end of pattern\)" + BAD_OCTAL_ESCAPE = r"bad escape \\" + BAD_SET = "unterminated character set" + STR_PAT_ON_BYTES = "cannot use a string pattern on a bytes-like object" + BYTES_PAT_ON_STR = "cannot use a bytes pattern on a string-like object" + STR_PAT_BYTES_TEMPL = "expected str instance, bytes found" + BYTES_PAT_STR_TEMPL = "expected a bytes-like object, str found" + BYTES_PAT_UNI_FLAG = "cannot use UNICODE flag with a bytes pattern" + 
MIXED_FLAGS = "ASCII, LOCALE and UNICODE flags are mutually incompatible" + MISSING_RPAREN = "missing \\)" + TRAILING_CHARS = "unbalanced parenthesis" + BAD_CHAR_RANGE = "bad character range" + NOTHING_TO_REPEAT = "nothing to repeat" + MULTIPLE_REPEAT = "multiple repeat" + OPEN_GROUP = "cannot refer to an open group" + DUPLICATE_GROUP = "duplicate group" + CANT_TURN_OFF = "bad inline flags: cannot turn flags off" + UNDEF_CHAR_NAME = "undefined character name" + + # assertRaisesRegex is defined here because the method isn't in the + # superclass before Python 2.7. + def assertRaisesRegex(self, expected_exception, expected_regexp, + callable_obj=None, *args, **kwargs): + """Asserts that the message in a raised exception matches a regexp. + + Args: + expected_exception: Exception class expected to be raised. + expected_regexp: Regexp (re pattern object or string) expected + to be found in error message. + callable_obj: Function to be called. + args: Extra args. + kwargs: Extra kwargs. + """ + context = _AssertRaisesContext(expected_exception, self, expected_regexp) + if callable_obj is None: + return context + with context: + callable_obj(*args, **kwargs) + + def assertTypedEqual(self, actual, expect, msg=None): + self.assertEqual(actual, expect, msg) + + def recurse(actual, expect): + if isinstance(expect, (tuple, list)): + for x, y in zip(actual, expect): + recurse(x, y) + else: + self.assertIs(type(actual), type(expect), msg) + + recurse(actual, expect) + + def test_weakref(self): + s = 'QabbbcR' + x = regex.compile('ab+c') + y = proxy(x) + if x.findall('QabbbcR') != y.findall('QabbbcR'): + self.fail() + + def test_search_star_plus(self): + self.assertEqual(regex.search('a*', 'xxx').span(0), (0, 0)) + self.assertEqual(regex.search('x*', 'axx').span(), (0, 0)) + self.assertEqual(regex.search('x+', 'axx').span(0), (1, 3)) + self.assertEqual(regex.search('x+', 'axx').span(), (1, 3)) + self.assertEqual(regex.search('x', 'aaa'), None) + 
self.assertEqual(regex.match('a*', 'xxx').span(0), (0, 0)) + self.assertEqual(regex.match('a*', 'xxx').span(), (0, 0)) + self.assertEqual(regex.match('x*', 'xxxa').span(0), (0, 3)) + self.assertEqual(regex.match('x*', 'xxxa').span(), (0, 3)) + self.assertEqual(regex.match('a+', 'xxx'), None) + + def bump_num(self, matchobj): + int_value = int(matchobj[0]) + return str(int_value + 1) + + def test_basic_regex_sub(self): + self.assertEqual(regex.sub("(?i)b+", "x", "bbbb BBBB"), 'x x') + self.assertEqual(regex.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'), + '9.3 -3 24x100y') + self.assertEqual(regex.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3), + '9.3 -3 23x99y') + + self.assertEqual(regex.sub('.', lambda m: r"\n", 'x'), "\\n") + self.assertEqual(regex.sub('.', r"\n", 'x'), "\n") + + self.assertEqual(regex.sub('(?P<a>x)', r'\g<a>\g<a>', 'xx'), 'xxxx') + self.assertEqual(regex.sub('(?P<a>x)', r'\g<a>\g<1>', 'xx'), 'xxxx') + self.assertEqual(regex.sub('(?P<unk>x)', r'\g<unk>\g<unk>', 'xx'), + 'xxxx') + self.assertEqual(regex.sub('(?P<unk>x)', r'\g<1>\g<1>', 'xx'), 'xxxx') + + self.assertEqual(regex.sub('a', r'\t\n\v\r\f\a\b\B\Z\a\A\w\W\s\S\d\D', + 'a'), "\t\n\v\r\f\a\b\\B\\Z\a\\A\\w\\W\\s\\S\\d\\D") + self.assertEqual(regex.sub('a', '\t\n\v\r\f\a', 'a'), "\t\n\v\r\f\a") + self.assertEqual(regex.sub('a', '\t\n\v\r\f\a', 'a'), chr(9) + chr(10) + + chr(11) + chr(13) + chr(12) + chr(7)) + + self.assertEqual(regex.sub(r'^\s*', 'X', 'test'), 'Xtest') + + self.assertEqual(regex.sub(ur"x", ur"\x0A", u"x"), u"\n") + self.assertEqual(regex.sub(ur"x", ur"\u000A", u"x"), u"\n") + self.assertEqual(regex.sub(ur"x", ur"\U0000000A", u"x"), u"\n") + self.assertEqual(regex.sub(ur"x", ur"\N{LATIN CAPITAL LETTER A}", + u"x"), u"A") + + self.assertEqual(regex.sub(r"x", r"\x0A", "x"), "\n") + self.assertEqual(regex.sub(r"x", r"\u000A", "x"), "\\u000A") + self.assertEqual(regex.sub(r"x", r"\U0000000A", "x"), + "\\U0000000A") + self.assertEqual(regex.sub(r"x", r"\N{LATIN CAPITAL LETTER A}", 
+ "x"), "\\N{LATIN CAPITAL LETTER A}") + + def test_bug_449964(self): + # Fails for group followed by other escape. + self.assertEqual(regex.sub(r'(?P<unk>x)', r'\g<1>\g<1>\b', 'xx'), + "xx\bxx\b") + + def test_bug_449000(self): + # Test for sub() on escaped characters. + self.assertEqual(regex.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'), + "abc\ndef\n") + self.assertEqual(regex.sub('\r\n', r'\n', 'abc\r\ndef\r\n'), + "abc\ndef\n") + self.assertEqual(regex.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'), + "abc\ndef\n") + self.assertEqual(regex.sub('\r\n', '\n', 'abc\r\ndef\r\n'), + "abc\ndef\n") + + def test_bug_1140(self): + # regex.sub(x, y, u'') should return u'', not '', and + # regex.sub(x, y, '') should return '', not u''. + # Also: + # regex.sub(x, y, unicode(x)) should return unicode(y), and + # regex.sub(x, y, str(x)) should return + # str(y) if isinstance(y, str) else unicode(y). + for x in 'x', u'x': + for y in 'y', u'y': + z = regex.sub(x, y, u'') + self.assertEqual((type(z), z), (unicode, u'')) + z = regex.sub(x, y, '') + self.assertEqual((type(z), z), (str, '')) + z = regex.sub(x, y, unicode(x)) + self.assertEqual((type(z), z), (unicode, unicode(y))) + z = regex.sub(x, y, str(x)) + self.assertEqual((type(z), z), (type(y), y)) + + def test_bug_1661(self): + # Verify that flags do not get silently ignored with compiled patterns + pattern = regex.compile('.') + self.assertRaisesRegex(ValueError, self.FLAGS_WITH_COMPILED_PAT, + lambda: regex.match(pattern, 'A', regex.I)) + self.assertRaisesRegex(ValueError, self.FLAGS_WITH_COMPILED_PAT, + lambda: regex.search(pattern, 'A', regex.I)) + self.assertRaisesRegex(ValueError, self.FLAGS_WITH_COMPILED_PAT, + lambda: regex.findall(pattern, 'A', regex.I)) + self.assertRaisesRegex(ValueError, self.FLAGS_WITH_COMPILED_PAT, + lambda: regex.compile(pattern, regex.I)) + + def test_bug_3629(self): + # A regex that triggered a bug in the sre-code validator + self.assertEqual(repr(type(regex.compile("(?P<quote>)(?(quote))"))), + 
self.PATTERN_CLASS) + + def test_sub_template_numeric_escape(self): + # Bug 776311 and friends. + self.assertEqual(regex.sub('x', r'\0', 'x'), "\0") + self.assertEqual(regex.sub('x', r'\000', 'x'), "\000") + self.assertEqual(regex.sub('x', r'\001', 'x'), "\001") + self.assertEqual(regex.sub('x', r'\008', 'x'), "\0" + "8") + self.assertEqual(regex.sub('x', r'\009', 'x'), "\0" + "9") + self.assertEqual(regex.sub('x', r'\111', 'x'), "\111") + self.assertEqual(regex.sub('x', r'\117', 'x'), "\117") + + self.assertEqual(regex.sub('x', r'\1111', 'x'), "\1111") + self.assertEqual(regex.sub('x', r'\1111', 'x'), "\111" + "1") + + self.assertEqual(regex.sub('x', r'\00', 'x'), '\x00') + self.assertEqual(regex.sub('x', r'\07', 'x'), '\x07') + self.assertEqual(regex.sub('x', r'\08', 'x'), "\0" + "8") + self.assertEqual(regex.sub('x', r'\09', 'x'), "\0" + "9") + self.assertEqual(regex.sub('x', r'\0a', 'x'), "\0" + "a") + + self.assertEqual(regex.sub(u'x', ur'\400', u'x'), u"\u0100") + self.assertEqual(regex.sub(u'x', ur'\777', u'x'), u"\u01FF") + self.assertEqual(regex.sub('x', r'\400', 'x'), "\x00") + self.assertEqual(regex.sub('x', r'\777', 'x'), "\xFF") + + self.assertRaisesRegex(regex.error, self.INVALID_GROUP_REF, lambda: + regex.sub('x', r'\1', 'x')) + self.assertRaisesRegex(regex.error, self.INVALID_GROUP_REF, lambda: + regex.sub('x', r'\8', 'x')) + self.assertRaisesRegex(regex.error, self.INVALID_GROUP_REF, lambda: + regex.sub('x', r'\9', 'x')) + self.assertRaisesRegex(regex.error, self.INVALID_GROUP_REF, lambda: + regex.sub('x', r'\11', 'x')) + self.assertRaisesRegex(regex.error, self.INVALID_GROUP_REF, lambda: + regex.sub('x', r'\18', 'x')) + self.assertRaisesRegex(regex.error, self.INVALID_GROUP_REF, lambda: + regex.sub('x', r'\1a', 'x')) + self.assertRaisesRegex(regex.error, self.INVALID_GROUP_REF, lambda: + regex.sub('x', r'\90', 'x')) + self.assertRaisesRegex(regex.error, self.INVALID_GROUP_REF, lambda: + regex.sub('x', r'\99', 'x')) + 
self.assertRaisesRegex(regex.error, self.INVALID_GROUP_REF, lambda: + regex.sub('x', r'\118', 'x')) # r'\11' + '8' + self.assertRaisesRegex(regex.error, self.INVALID_GROUP_REF, lambda: + regex.sub('x', r'\11a', 'x')) + self.assertRaisesRegex(regex.error, self.INVALID_GROUP_REF, lambda: + regex.sub('x', r'\181', 'x')) # r'\18' + '1' + self.assertRaisesRegex(regex.error, self.INVALID_GROUP_REF, lambda: + regex.sub('x', r'\800', 'x')) # r'\80' + '0' + + # In Python 2.3 (etc), these loop endlessly in sre_parser.py. + self.assertEqual(regex.sub('(((((((((((x)))))))))))', r'\11', 'x'), + 'x') + self.assertEqual(regex.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'), + 'xz8') + self.assertEqual(regex.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'), + 'xza') + + def test_qualified_re_sub(self): + self.assertEqual(regex.sub('a', 'b', 'aaaaa'), 'bbbbb') + self.assertEqual(regex.sub('a', 'b', 'aaaaa', 1), 'baaaa') + + def test_bug_114660(self): + self.assertEqual(regex.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'), + 'hello there') + + def test_bug_462270(self): + # Test for empty sub() behaviour, see SF bug #462270 + self.assertEqual(regex.sub('(?V0)x*', '-', 'abxd'), '-a-b-d-') + self.assertEqual(regex.sub('(?V1)x*', '-', 'abxd'), '-a-b--d-') + self.assertEqual(regex.sub('x+', '-', 'abxd'), 'ab-d') + + def test_bug_14462(self): + # chr(255) is not a valid identifier in Python 2. 
+ group_name = u'\xFF' + self.assertRaisesRegex(regex.error, self.BAD_GROUP_NAME, lambda: + regex.search(ur'(?P<' + group_name + '>a)', u'a')) + + def test_symbolic_refs(self): + self.assertRaisesRegex(regex.error, self.MISSING_GT, lambda: + regex.sub('(?P<a>x)', r'\g<a', 'xx')) + self.assertRaisesRegex(regex.error, self.MISSING_GROUP_NAME, lambda: + regex.sub('(?P<a>x)', r'\g<', 'xx')) + self.assertRaisesRegex(regex.error, self.MISSING_LT, lambda: + regex.sub('(?P<a>x)', r'\g', 'xx')) + self.assertRaisesRegex(regex.error, self.BAD_GROUP_NAME, lambda: + regex.sub('(?P<a>x)', r'\g<a a>', 'xx')) + self.assertRaisesRegex(regex.error, self.BAD_GROUP_NAME, lambda: + regex.sub('(?P<a>x)', r'\g<1a1>', 'xx')) + self.assertRaisesRegex(IndexError, self.UNKNOWN_GROUP_I, lambda: + regex.sub('(?P<a>x)', r'\g<ab>', 'xx')) + + # The new behaviour of unmatched but valid groups is to treat them like + # empty matches in the replacement template, like in Perl. + self.assertEqual(regex.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '') + self.assertEqual(regex.sub('(?P<a>x)|(?P<b>y)', r'\2', 'xx'), '') + + # The old behaviour was to raise it as an IndexError. 
+ self.assertRaisesRegex(regex.error, self.BAD_GROUP_NAME, lambda: + regex.sub('(?P<a>x)', r'\g<-1>', 'xx')) + + def test_re_subn(self): + self.assertEqual(regex.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2)) + self.assertEqual(regex.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1)) + self.assertEqual(regex.subn("b+", "x", "xyz"), ('xyz', 0)) + self.assertEqual(regex.subn("b*", "x", "xyz"), ('xxxyxzx', 4)) + self.assertEqual(regex.subn("b*", "x", "xyz", 2), ('xxxyz', 2)) + + def test_re_split(self): + self.assertEqual(regex.split(":", ":a:b::c"), ['', 'a', 'b', '', 'c']) + self.assertEqual(regex.split(":*", ":a:b::c"), ['', 'a', 'b', 'c']) + self.assertEqual(regex.split("(:*)", ":a:b::c"), ['', ':', 'a', ':', + 'b', '::', 'c']) + self.assertEqual(regex.split("(?::*)", ":a:b::c"), ['', 'a', 'b', 'c']) + self.assertEqual(regex.split("(:)*", ":a:b::c"), ['', ':', 'a', ':', + 'b', ':', 'c']) + self.assertEqual(regex.split("([b:]+)", ":a:b::c"), ['', ':', 'a', + ':b::', 'c']) + self.assertEqual(regex.split("(b)|(:+)", ":a:b::c"), ['', None, ':', + 'a', None, ':', '', 'b', None, '', None, '::', 'c']) + self.assertEqual(regex.split("(?:b)|(?::+)", ":a:b::c"), ['', 'a', '', + '', 'c']) + + self.assertEqual(regex.split("x", "xaxbxc"), ['', 'a', 'b', 'c']) + self.assertEqual([m for m in regex.splititer("x", "xaxbxc")], ['', 'a', + 'b', 'c']) + + self.assertEqual(regex.split("(?r)x", "xaxbxc"), ['c', 'b', 'a', '']) + self.assertEqual([m for m in regex.splititer("(?r)x", "xaxbxc")], ['c', + 'b', 'a', '']) + + self.assertEqual(regex.split("(x)|(y)", "xaxbxc"), ['', 'x', None, 'a', + 'x', None, 'b', 'x', None, 'c']) + self.assertEqual([m for m in regex.splititer("(x)|(y)", "xaxbxc")], + ['', 'x', None, 'a', 'x', None, 'b', 'x', None, 'c']) + + self.assertEqual(regex.split("(?r)(x)|(y)", "xaxbxc"), ['c', 'x', None, + 'b', 'x', None, 'a', 'x', None, '']) + self.assertEqual([m for m in regex.splititer("(?r)(x)|(y)", "xaxbxc")], + ['c', 'x', None, 'b', 'x', None, 'a', 'x', None, '']) + + 
self.assertEqual(regex.split(r"(?V1)\b", "a b c"), ['', 'a', ' ', 'b', + ' ', 'c', '']) + self.assertEqual(regex.split(r"(?V1)\m", "a b c"), ['', 'a ', 'b ', + 'c']) + self.assertEqual(regex.split(r"(?V1)\M", "a b c"), ['a', ' b', ' c', + '']) + + def test_qualified_re_split(self): + self.assertEqual(regex.split(":", ":a:b::c", 2), ['', 'a', 'b::c']) + self.assertEqual(regex.split(':', 'a:b:c:d', 2), ['a', 'b', 'c:d']) + self.assertEqual(regex.split("(:)", ":a:b::c", 2), ['', ':', 'a', ':', + 'b::c']) + self.assertEqual(regex.split("(:*)", ":a:b::c", 2), ['', ':', 'a', ':', + 'b::c']) + + def test_re_findall(self): + self.assertEqual(regex.findall(":+", "abc"), []) + self.assertEqual(regex.findall(":+", "a:b::c:::d"), [':', '::', ':::']) + self.assertEqual(regex.findall("(:+)", "a:b::c:::d"), [':', '::', + ':::']) + self.assertEqual(regex.findall("(:)(:*)", "a:b::c:::d"), [(':', ''), + (':', ':'), (':', '::')]) + + self.assertEqual(regex.findall(r"\((?P<test>.{0,5}?TEST)\)", + "(MY TEST)"), ["MY TEST"]) + self.assertEqual(regex.findall(r"\((?P<test>.{0,3}?TEST)\)", + "(MY TEST)"), ["MY TEST"]) + self.assertEqual(regex.findall(r"\((?P<test>.{0,3}?T)\)", "(MY T)"), + ["MY T"]) + + self.assertEqual(regex.findall(r"[^a]{2}[A-Z]", "\n S"), [' S']) + self.assertEqual(regex.findall(r"[^a]{2,3}[A-Z]", "\n S"), ['\n S']) + self.assertEqual(regex.findall(r"[^a]{2,3}[A-Z]", "\n S"), [' S']) + + self.assertEqual(regex.findall(r"X(Y[^Y]+?){1,2}( |Q)+DEF", + "XYABCYPPQ\nQ DEF"), [('YPPQ\n', ' ')]) + + self.assertEqual(regex.findall(r"(\nTest(\n+.+?){0,2}?)?\n+End", + "\nTest\nxyz\nxyz\nEnd"), [('\nTest\nxyz\nxyz', '\nxyz')]) + + def test_bug_117612(self): + self.assertEqual(regex.findall(r"(a|(b))", "aba"), [('a', ''), ('b', + 'b'), ('a', '')]) + + def test_re_match(self): + self.assertEqual(regex.match('a', 'a')[:], ('a',)) + self.assertEqual(regex.match('(a)', 'a')[:], ('a', 'a')) + self.assertEqual(regex.match(r'(a)', 'a')[0], 'a') + self.assertEqual(regex.match(r'(a)', 
'a')[1], 'a') + self.assertEqual(regex.match(r'(a)', 'a').group(1, 1), ('a', 'a')) + + pat = regex.compile('((a)|(b))(c)?') + self.assertEqual(pat.match('a')[:], ('a', 'a', 'a', None, None)) + self.assertEqual(pat.match('b')[:], ('b', 'b', None, 'b', None)) + self.assertEqual(pat.match('ac')[:], ('ac', 'a', 'a', None, 'c')) + self.assertEqual(pat.match('bc')[:], ('bc', 'b', None, 'b', 'c')) + self.assertEqual(pat.match('bc')[:], ('bc', 'b', None, 'b', 'c')) + + # A single group. + m = regex.match('(a)', 'a') + self.assertEqual(m.group(), 'a') + self.assertEqual(m.group(0), 'a') + self.assertEqual(m.group(1), 'a') + self.assertEqual(m.group(1, 1), ('a', 'a')) + + pat = regex.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?') + self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None)) + self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'), (None, 'b', + None)) + self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c')) + + def test_re_groupref_exists(self): + self.assertEqual(regex.match(r'^(\()?([^()]+)(?(1)\))$', '(a)')[:], + ('(a)', '(', 'a')) + self.assertEqual(regex.match(r'^(\()?([^()]+)(?(1)\))$', 'a')[:], ('a', + None, 'a')) + self.assertEqual(regex.match(r'^(\()?([^()]+)(?(1)\))$', 'a)'), None) + self.assertEqual(regex.match(r'^(\()?([^()]+)(?(1)\))$', '(a'), None) + self.assertEqual(regex.match('^(?:(a)|c)((?(1)b|d))$', 'ab')[:], ('ab', + 'a', 'b')) + self.assertEqual(regex.match('^(?:(a)|c)((?(1)b|d))$', 'cd')[:], ('cd', + None, 'd')) + self.assertEqual(regex.match('^(?:(a)|c)((?(1)|d))$', 'cd')[:], ('cd', + None, 'd')) + self.assertEqual(regex.match('^(?:(a)|c)((?(1)|d))$', 'a')[:], ('a', + 'a', '')) + + # Tests for bug #1177831: exercise groups other than the first group. 
+ p = regex.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))') + self.assertEqual(p.match('abc')[:], ('abc', 'a', 'b', 'c')) + self.assertEqual(p.match('ad')[:], ('ad', 'a', None, 'd')) + self.assertEqual(p.match('abd'), None) + self.assertEqual(p.match('ac'), None) + + def test_re_groupref(self): + self.assertEqual(regex.match(r'^(\|)?([^()]+)\1$', '|a|')[:], ('|a|', + '|', 'a')) + self.assertEqual(regex.match(r'^(\|)?([^()]+)\1?$', 'a')[:], ('a', + None, 'a')) + self.assertEqual(regex.match(r'^(\|)?([^()]+)\1$', 'a|'), None) + self.assertEqual(regex.match(r'^(\|)?([^()]+)\1$', '|a'), None) + self.assertEqual(regex.match(r'^(?:(a)|c)(\1)$', 'aa')[:], ('aa', 'a', + 'a')) + self.assertEqual(regex.match(r'^(?:(a)|c)(\1)?$', 'c')[:], ('c', None, + None)) + + self.assertEqual(regex.findall("(?i)(.{1,40}?),(.{1,40}?)(?:;)+(.{1,80}).{1,40}?\\3(\ |;)+(.{1,80}?)\\1", + "TEST, BEST; LEST ; Lest 123 Test, Best"), [('TEST', ' BEST', + ' LEST', ' ', '123 ')]) + + def test_groupdict(self): + self.assertEqual(regex.match('(?P<first>first) (?P<second>second)', + 'first second').groupdict(), {'first': 'first', 'second': 'second'}) + + def test_expand(self): + self.assertEqual(regex.match("(?P<first>first) (?P<second>second)", + "first second").expand(r"\2 \1 \g<second> \g<first>"), + 'second first second first') + + def test_repeat_minmax(self): + self.assertEqual(regex.match(r"^(\w){1}$", "abc"), None) + self.assertEqual(regex.match(r"^(\w){1}?$", "abc"), None) + self.assertEqual(regex.match(r"^(\w){1,2}$", "abc"), None) + self.assertEqual(regex.match(r"^(\w){1,2}?$", "abc"), None) + + self.assertEqual(regex.match(r"^(\w){3}$", "abc")[1], 'c') + self.assertEqual(regex.match(r"^(\w){1,3}$", "abc")[1], 'c') + self.assertEqual(regex.match(r"^(\w){1,4}$", "abc")[1], 'c') + self.assertEqual(regex.match(r"^(\w){3,4}?$", "abc")[1], 'c') + self.assertEqual(regex.match(r"^(\w){3}?$", "abc")[1], 'c') + self.assertEqual(regex.match(r"^(\w){1,3}?$", "abc")[1], 'c') + 
self.assertEqual(regex.match(r"^(\w){1,4}?$", "abc")[1], 'c') + self.assertEqual(regex.match(r"^(\w){3,4}?$", "abc")[1], 'c') + + self.assertEqual(regex.match("^x{1}$", "xxx"), None) + self.assertEqual(regex.match("^x{1}?$", "xxx"), None) + self.assertEqual(regex.match("^x{1,2}$", "xxx"), None) + self.assertEqual(regex.match("^x{1,2}?$", "xxx"), None) + + self.assertEqual(regex.match("^x{1}", "xxx")[0], 'x') + self.assertEqual(regex.match("^x{1}?", "xxx")[0], 'x') + self.assertEqual(regex.match("^x{0,1}", "xxx")[0], 'x') + self.assertEqual(regex.match("^x{0,1}?", "xxx")[0], '') + + self.assertEqual(bool(regex.match("^x{3}$", "xxx")), True) + self.assertEqual(bool(regex.match("^x{1,3}$", "xxx")), True) + self.assertEqual(bool(regex.match("^x{1,4}$", "xxx")), True) + self.assertEqual(bool(regex.match("^x{3,4}?$", "xxx")), True) + self.assertEqual(bool(regex.match("^x{3}?$", "xxx")), True) + self.assertEqual(bool(regex.match("^x{1,3}?$", "xxx")), True) + self.assertEqual(bool(regex.match("^x{1,4}?$", "xxx")), True) + self.assertEqual(bool(regex.match("^x{3,4}?$", "xxx")), True) + + self.assertEqual(regex.match("^x{}$", "xxx"), None) + self.assertEqual(bool(regex.match("^x{}$", "x{}")), True) + + def test_getattr(self): + self.assertEqual(regex.compile("(?i)(a)(b)").pattern, '(?i)(a)(b)') + self.assertEqual(regex.compile("(?i)(a)(b)").flags, regex.A | regex.I | + regex.DEFAULT_VERSION) + self.assertEqual(regex.compile(u"(?i)(a)(b)").flags, regex.I | regex.U + | regex.DEFAULT_VERSION) + self.assertEqual(regex.compile("(?i)(a)(b)").groups, 2) + self.assertEqual(regex.compile("(?i)(a)(b)").groupindex, {}) + + self.assertEqual(regex.compile("(?i)(?P<first>a)(?P<other>b)").groupindex, + {'first': 1, 'other': 2}) + + self.assertEqual(regex.match("(a)", "a").pos, 0) + self.assertEqual(regex.match("(a)", "a").endpos, 1) + + self.assertEqual(regex.search("b(c)", "abcdef").pos, 0) + self.assertEqual(regex.search("b(c)", "abcdef").endpos, 6) + 
self.assertEqual(regex.search("b(c)", "abcdef").span(), (1, 3)) + self.assertEqual(regex.search("b(c)", "abcdef").span(1), (2, 3)) + + self.assertEqual(regex.match("(a)", "a").string, 'a') + self.assertEqual(regex.match("(a)", "a").regs, ((0, 1), (0, 1))) + self.assertEqual(repr(type(regex.match("(a)", "a").re)), + self.PATTERN_CLASS) + + # Issue 14260. + p = regex.compile(r'abc(?P<n>def)') + p.groupindex["n"] = 0 + self.assertEqual(p.groupindex["n"], 1) + + def test_special_escapes(self): + self.assertEqual(regex.search(r"\b(b.)\b", "abcd abc bcd bx")[1], 'bx') + self.assertEqual(regex.search(r"\B(b.)\B", "abc bcd bc abxd")[1], 'bx') + self.assertEqual(regex.search(r"\b(b.)\b", "abcd abc bcd bx", + regex.LOCALE)[1], 'bx') + self.assertEqual(regex.search(r"\B(b.)\B", "abc bcd bc abxd", + regex.LOCALE)[1], 'bx') + self.assertEqual(regex.search(ur"\b(b.)\b", u"abcd abc bcd bx", + regex.UNICODE)[1], u'bx') + self.assertEqual(regex.search(ur"\B(b.)\B", u"abc bcd bc abxd", + regex.UNICODE)[1], u'bx') + + self.assertEqual(regex.search(r"^abc$", "\nabc\n", regex.M)[0], 'abc') + self.assertEqual(regex.search(r"^\Aabc\Z$", "abc", regex.M)[0], 'abc') + self.assertEqual(regex.search(r"^\Aabc\Z$", "\nabc\n", regex.M), None) + + self.assertEqual(regex.search(ur"\b(b.)\b", u"abcd abc bcd bx")[1], + u'bx') + self.assertEqual(regex.search(ur"\B(b.)\B", u"abc bcd bc abxd")[1], + u'bx') + self.assertEqual(regex.search(ur"^abc$", u"\nabc\n", regex.M)[0], + u'abc') + self.assertEqual(regex.search(ur"^\Aabc\Z$", u"abc", regex.M)[0], + u'abc') + self.assertEqual(regex.search(ur"^\Aabc\Z$", u"\nabc\n", regex.M), + None) + + self.assertEqual(regex.search(r"\d\D\w\W\s\S", "1aa! a")[0], '1aa! a') + self.assertEqual(regex.search(r"\d\D\w\W\s\S", "1aa! a", + regex.LOCALE)[0], '1aa! a') + self.assertEqual(regex.search(ur"\d\D\w\W\s\S", u"1aa! a", + regex.UNICODE)[0], u'1aa! 
a') + + def test_bigcharset(self): + self.assertEqual(regex.match(ur"(?u)([\u2222\u2223])", u"\u2222")[1], + u'\u2222') + self.assertEqual(regex.match(ur"(?u)([\u2222\u2223])", u"\u2222", + regex.UNICODE)[1], u'\u2222') + self.assertEqual(u"".join(regex.findall(u".", + u"e\xe8\xe9\xea\xeb\u0113\u011b\u0117", flags=regex.UNICODE)), + u'e\xe8\xe9\xea\xeb\u0113\u011b\u0117') + self.assertEqual(u"".join(regex.findall(ur"[e\xe8\xe9\xea\xeb\u0113\u011b\u0117]", + u"e\xe8\xe9\xea\xeb\u0113\u011b\u0117", flags=regex.UNICODE)), + u'e\xe8\xe9\xea\xeb\u0113\u011b\u0117') + self.assertEqual(u"".join(regex.findall(ur"e|\xe8|\xe9|\xea|\xeb|\u0113|\u011b|\u0117", + u"e\xe8\xe9\xea\xeb\u0113\u011b\u0117", flags=regex.UNICODE)), + u'e\xe8\xe9\xea\xeb\u0113\u011b\u0117') + + def test_anyall(self): + self.assertEqual(regex.match("a.b", "a\nb", regex.DOTALL)[0], "a\nb") + self.assertEqual(regex.match("a.*b", "a\n\nb", regex.DOTALL)[0], + "a\n\nb") + + def test_non_consuming(self): + self.assertEqual(regex.match(r"(a(?=\s[^a]))", "a b")[1], 'a') + self.assertEqual(regex.match(r"(a(?=\s[^a]*))", "a b")[1], 'a') + self.assertEqual(regex.match(r"(a(?=\s[abc]))", "a b")[1], 'a') + self.assertEqual(regex.match(r"(a(?=\s[abc]*))", "a bc")[1], 'a') + self.assertEqual(regex.match(r"(a)(?=\s\1)", "a a")[1], 'a') + self.assertEqual(regex.match(r"(a)(?=\s\1*)", "a aa")[1], 'a') + self.assertEqual(regex.match(r"(a)(?=\s(abc|a))", "a a")[1], 'a') + + self.assertEqual(regex.match(r"(a(?!\s[^a]))", "a a")[1], 'a') + self.assertEqual(regex.match(r"(a(?!\s[abc]))", "a d")[1], 'a') + self.assertEqual(regex.match(r"(a)(?!\s\1)", "a b")[1], 'a') + self.assertEqual(regex.match(r"(a)(?!\s(abc|a))", "a b")[1], 'a') + + def test_ignore_case(self): + self.assertEqual(regex.match("abc", "ABC", regex.I)[0], 'ABC') + self.assertEqual(regex.match(u"abc", u"ABC", regex.I)[0], u'ABC') + + self.assertEqual(regex.match(r"(a\s[^a]*)", "a bb", regex.I)[1], + 'a bb') + self.assertEqual(regex.match(r"(a\s[abc])", "a b", 
regex.I)[1], 'a b') + self.assertEqual(regex.match(r"(a\s[abc]*)", "a bb", regex.I)[1], + 'a bb') + self.assertEqual(regex.match(r"((a)\s\2)", "a a", regex.I)[1], 'a a') + self.assertEqual(regex.match(r"((a)\s\2*)", "a aa", regex.I)[1], + 'a aa') + self.assertEqual(regex.match(r"((a)\s(abc|a))", "a a", regex.I)[1], + 'a a') + self.assertEqual(regex.match(r"((a)\s(abc|a)*)", "a aa", regex.I)[1], + 'a aa') + + # Issue 3511. + self.assertEqual(regex.match(r"[Z-a]", "_").span(), (0, 1)) + self.assertEqual(regex.match(r"(?i)[Z-a]", "_").span(), (0, 1)) + + self.assertEqual(bool(regex.match(ur"(?iu)nao", u"nAo")), True) + self.assertEqual(bool(regex.match(ur"(?iu)n\xE3o", u"n\xC3o")), True) + self.assertEqual(bool(regex.match(ur"(?iu)n\xE3o", u"N\xC3O")), True) + self.assertEqual(bool(regex.match(ur"(?iu)s", u"\u017F")), True) + + def test_case_folding(self): + self.assertEqual(regex.search(ur"(?fiu)ss", u"SS").span(), (0, 2)) + self.assertEqual(regex.search(ur"(?fiu)SS", u"ss").span(), (0, 2)) + self.assertEqual(regex.search(ur"(?fiu)SS", + u"\N{LATIN SMALL LETTER SHARP S}").span(), (0, 1)) + self.assertEqual(regex.search(ur"(?fi)\N{LATIN SMALL LETTER SHARP S}", + u"SS").span(), (0, 2)) + + self.assertEqual(regex.search(ur"(?fiu)\N{LATIN SMALL LIGATURE ST}", + u"ST").span(), (0, 2)) + self.assertEqual(regex.search(ur"(?fiu)ST", + u"\N{LATIN SMALL LIGATURE ST}").span(), (0, 1)) + self.assertEqual(regex.search(ur"(?fiu)ST", + u"\N{LATIN SMALL LIGATURE LONG S T}").span(), (0, 1)) + + self.assertEqual(regex.search(ur"(?fiu)SST", + u"\N{LATIN SMALL LETTER SHARP S}t").span(), (0, 2)) + self.assertEqual(regex.search(ur"(?fiu)SST", + u"s\N{LATIN SMALL LIGATURE LONG S T}").span(), (0, 2)) + self.assertEqual(regex.search(ur"(?fiu)SST", + u"s\N{LATIN SMALL LIGATURE ST}").span(), (0, 2)) + self.assertEqual(regex.search(ur"(?fiu)\N{LATIN SMALL LIGATURE ST}", + u"SST").span(), (1, 3)) + self.assertEqual(regex.search(ur"(?fiu)SST", + u"s\N{LATIN SMALL LIGATURE ST}").span(), (0, 2)) + 
+ self.assertEqual(regex.search(ur"(?fiu)FFI", + u"\N{LATIN SMALL LIGATURE FFI}").span(), (0, 1)) + self.assertEqual(regex.search(ur"(?fiu)FFI", + u"\N{LATIN SMALL LIGATURE FF}i").span(), (0, 2)) + self.assertEqual(regex.search(ur"(?fiu)FFI", + u"f\N{LATIN SMALL LIGATURE FI}").span(), (0, 2)) + self.assertEqual(regex.search(ur"(?fiu)\N{LATIN SMALL LIGATURE FFI}", + u"FFI").span(), (0, 3)) + self.assertEqual(regex.search(ur"(?fiu)\N{LATIN SMALL LIGATURE FF}i", + u"FFI").span(), (0, 3)) + self.assertEqual(regex.search(ur"(?fiu)f\N{LATIN SMALL LIGATURE FI}", + u"FFI").span(), (0, 3)) + + sigma = u"\u03A3\u03C3\u03C2" + for ch1 in sigma: + for ch2 in sigma: + if not regex.match(ur"(?fiu)" + ch1, ch2): + self.fail() + + self.assertEqual(bool(regex.search(ur"(?iuV1)ff", u"\uFB00\uFB01")), + True) + self.assertEqual(bool(regex.search(ur"(?iuV1)ff", u"\uFB01\uFB00")), + True) + self.assertEqual(bool(regex.search(ur"(?iuV1)fi", u"\uFB00\uFB01")), + True) + self.assertEqual(bool(regex.search(ur"(?iuV1)fi", u"\uFB01\uFB00")), + True) + self.assertEqual(bool(regex.search(ur"(?iuV1)fffi", u"\uFB00\uFB01")), + True) + self.assertEqual(bool(regex.search(ur"(?iuV1)f\uFB03", + u"\uFB00\uFB01")), True) + self.assertEqual(bool(regex.search(ur"(?iuV1)ff", u"\uFB00\uFB01")), + True) + self.assertEqual(bool(regex.search(ur"(?iuV1)fi", u"\uFB00\uFB01")), + True) + self.assertEqual(bool(regex.search(ur"(?iuV1)fffi", u"\uFB00\uFB01")), + True) + self.assertEqual(bool(regex.search(ur"(?iuV1)f\uFB03", + u"\uFB00\uFB01")), True) + self.assertEqual(bool(regex.search(ur"(?iuV1)f\uFB01", u"\uFB00i")), + True) + self.assertEqual(bool(regex.search(ur"(?iuV1)f\uFB01", u"\uFB00i")), + True) + + self.assertEqual(regex.findall(ur"(?iuV0)\m(?:word){e<=3}\M(?<!\m(?:word){e<=1}\M)", + u"word word2 word word3 word word234 word23 word"), [u"word234", + u"word23"]) + self.assertEqual(regex.findall(ur"(?iuV1)\m(?:word){e<=3}\M(?<!\m(?:word){e<=1}\M)", + u"word word2 word word3 word word234 word23 word"), 
[u"word234", + u"word23"]) + + self.assertEqual(regex.search(ur"(?fi)a\N{LATIN SMALL LIGATURE FFI}ne", + u" affine ").span(), (2, 8)) + self.assertEqual(regex.search(ur"(?fi)a(?:\N{LATIN SMALL LIGATURE FFI}|x)ne", + u" affine ").span(), (2, 8)) + self.assertEqual(regex.search(ur"(?fi)a(?:\N{LATIN SMALL LIGATURE FFI}|xy)ne", + u" affine ").span(), (2, 8)) + self.assertEqual(regex.search(ur"(?fi)a\L<options>ne", u"affine", + options=[u"\N{LATIN SMALL LIGATURE FFI}"]).span(), (0, 6)) + self.assertEqual(regex.search(ur"(?fi)a\L<options>ne", + u"a\N{LATIN SMALL LIGATURE FFI}ne", options=[u"ffi"]).span(), (0, 4)) + + def test_category(self): + self.assertEqual(regex.match(r"(\s)", " ")[1], ' ') + + def test_not_literal(self): + self.assertEqual(regex.search(r"\s([^a])", " b")[1], 'b') + self.assertEqual(regex.search(r"\s([^a]*)", " bb")[1], 'bb') + + def test_search_coverage(self): + self.assertEqual(regex.search(r"\s(b)", " b")[1], 'b') + self.assertEqual(regex.search(r"a\s", "a ")[0], 'a ') + + def test_re_escape(self): + p = "" + self.assertEqual(regex.escape(p), p) + for i in range(0, 256): + p += chr(i) + self.assertEqual(bool(regex.match(regex.escape(chr(i)), chr(i))), + True) + self.assertEqual(regex.match(regex.escape(chr(i)), chr(i)).span(), + (0, 1)) + + pat = regex.compile(regex.escape(p)) + self.assertEqual(pat.match(p).span(), (0, 256)) + + def test_constants(self): + if regex.I != regex.IGNORECASE: + self.fail() + if regex.L != regex.LOCALE: + self.fail() + if regex.M != regex.MULTILINE: + self.fail() + if regex.S != regex.DOTALL: + self.fail() + if regex.X != regex.VERBOSE: + self.fail() + + def test_flags(self): + for flag in [regex.I, regex.M, regex.X, regex.S, regex.L]: + self.assertEqual(repr(type(regex.compile('^pattern$', flag))), + self.PATTERN_CLASS) + + def test_sre_character_literals(self): + for i in [0, 8, 16, 32, 64, 127, 128, 255]: + self.assertEqual(bool(regex.match(r"\%03o" % i, chr(i))), True) + self.assertEqual(bool(regex.match(r"\%03o0" 
% i, chr(i) + "0")), + True) + self.assertEqual(bool(regex.match(r"\%03o8" % i, chr(i) + "8")), + True) + self.assertEqual(bool(regex.match(r"\x%02x" % i, chr(i))), True) + self.assertEqual(bool(regex.match(r"\x%02x0" % i, chr(i) + "0")), + True) + self.assertEqual(bool(regex.match(r"\x%02xz" % i, chr(i) + "z")), + True) + + self.assertRaisesRegex(regex.error, self.INVALID_GROUP_REF, lambda: + regex.match(r"\911", "")) + + def test_sre_character_class_literals(self): + for i in [0, 8, 16, 32, 64, 127, 128, 255]: + self.assertEqual(bool(regex.match(r"[\%03o]" % i, chr(i))), True) + self.assertEqual(bool(regex.match(r"[\%03o0]" % i, chr(i))), True) + self.assertEqual(bool(regex.match(r"[\%03o8]" % i, chr(i))), True) + self.assertEqual(bool(regex.match(r"[\x%02x]" % i, chr(i))), True) + self.assertEqual(bool(regex.match(r"[\x%02x0]" % i, chr(i))), True) + self.assertEqual(bool(regex.match(r"[\x%02xz]" % i, chr(i))), True) + + self.assertRaisesRegex(regex.error, self.BAD_OCTAL_ESCAPE, lambda: + regex.match(r"[\911]", "")) + + def test_bug_113254(self): + self.assertEqual(regex.match(r'(a)|(b)', 'b').start(1), -1) + self.assertEqual(regex.match(r'(a)|(b)', 'b').end(1), -1) + self.assertEqual(regex.match(r'(a)|(b)', 'b').span(1), (-1, -1)) + + def test_bug_527371(self): + # Bug described in patches 527371/672491. + self.assertEqual(regex.match(r'(a)?a','a').lastindex, None) + self.assertEqual(regex.match(r'(a)(b)?b','ab').lastindex, 1) + self.assertEqual(regex.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, + 'a') + self.assertEqual(regex.match("(?P<a>a(b))", "ab").lastgroup, 'a') + self.assertEqual(regex.match("((a))", "a").lastindex, 1) + + def test_bug_545855(self): + # Bug 545855 -- This pattern failed to cause a compile error as it + # should, instead provoking a TypeError. + self.assertRaisesRegex(regex.error, self.BAD_SET, lambda: + regex.compile('foo[a-')) + + def test_bug_418626(self): + # Bugs 418626 at al. 
-- Testing Greg Chapman's addition of op code + # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of + # pattern '*?' on a long string. + self.assertEqual(regex.match('.*?c', 10000 * 'ab' + 'cd').end(0), + 20001) + self.assertEqual(regex.match('.*?cd', 5000 * 'ab' + 'c' + 5000 * 'ab' + + 'cde').end(0), 20003) + self.assertEqual(regex.match('.*?cd', 20000 * 'abc' + 'de').end(0), + 60001) + # Non-simple '*?' still used to hit the recursion limit, before the + # non-recursive scheme was implemented. + self.assertEqual(regex.search('(a|b)*?c', 10000 * 'ab' + 'cd').end(0), + 20001) + + def test_bug_612074(self): + pat = u"[" + regex.escape(u"\u2039") + u"]" + self.assertEqual(regex.compile(pat) and 1, 1) + + def test_stack_overflow(self): + # Nasty cases that used to overflow the straightforward recursive + # implementation of repeated groups. + self.assertEqual(regex.match('(x)*', 50000 * 'x')[1], 'x') + self.assertEqual(regex.match('(x)*y', 50000 * 'x' + 'y')[1], 'x') + self.assertEqual(regex.match('(x)*?y', 50000 * 'x' + 'y')[1], 'x') + + def test_scanner(self): + def s_ident(scanner, token): return token + def s_operator(scanner, token): return "op%s" % token + def s_float(scanner, token): return float(token) + def s_int(scanner, token): return int(token) + + scanner = regex.Scanner([(r"[a-zA-Z_]\w*", s_ident), (r"\d+\.\d*", + s_float), (r"\d+", s_int), (r"=|\+|-|\*|/", s_operator), (r"\s+", + None), ]) + + self.assertEqual(repr(type(scanner.scanner.scanner("").pattern)), + self.PATTERN_CLASS) + + self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"), (['sum', + 'op=', 3, 'op*', 'foo', 'op+', 312.5, 'op+', 'bar'], '')) + + def test_bug_448951(self): + # Bug 448951 (similar to 429357, but with single char match). + # (Also test greedy matches.) 
+ for op in '', '?', '*': + self.assertEqual(regex.match(r'((.%s):)?z' % op, 'z')[:], ('z', + None, None)) + self.assertEqual(regex.match(r'((.%s):)?z' % op, 'a:z')[:], ('a:z', + 'a:', 'a')) + + def test_bug_725106(self): + # Capturing groups in alternatives in repeats. + self.assertEqual(regex.match('^((a)|b)*', 'abc')[:], ('ab', 'b', 'a')) + self.assertEqual(regex.match('^(([ab])|c)*', 'abc')[:], ('abc', 'c', + 'b')) + self.assertEqual(regex.match('^((d)|[ab])*', 'abc')[:], ('ab', 'b', + None)) + self.assertEqual(regex.match('^((a)c|[ab])*', 'abc')[:], ('ab', 'b', + None)) + self.assertEqual(regex.match('^((a)|b)*?c', 'abc')[:], ('abc', 'b', + 'a')) + self.assertEqual(regex.match('^(([ab])|c)*?d', 'abcd')[:], ('abcd', + 'c', 'b')) + self.assertEqual(regex.match('^((d)|[ab])*?c', 'abc')[:], ('abc', 'b', + None)) + self.assertEqual(regex.match('^((a)c|[ab])*?c', 'abc')[:], ('abc', 'b', + None)) + + def test_bug_725149(self): + # Mark_stack_base restoring before restoring marks. + self.assertEqual(regex.match('(a)(?:(?=(b)*)c)*', 'abb')[:], ('a', 'a', + None)) + self.assertEqual(regex.match('(a)((?!(b)*))*', 'abb')[:], ('a', 'a', + None, None)) + + def test_bug_764548(self): + # Bug 764548, regex.compile() barfs on str/unicode subclasses. 
+ class my_unicode(str): pass + pat = regex.compile(my_unicode("abc")) + self.assertEqual(pat.match("xyz"), None) + + def test_finditer(self): + it = regex.finditer(r":+", "a:b::c:::d") + self.assertEqual([item[0] for item in it], [':', '::', ':::']) + + def test_bug_926075(self): + if regex.compile('bug_926075') is regex.compile(u'bug_926075'): + self.fail() + + def test_bug_931848(self): + pattern = u"[\u002E\u3002\uFF0E\uFF61]" + self.assertEqual(regex.compile(pattern).split("a.b.c"), ['a', 'b', + 'c']) + + def test_bug_581080(self): + it = regex.finditer(r"\s", "a b") + self.assertEqual(it.next().span(), (1, 2)) + self.assertRaises(StopIteration, lambda: it.next()) + + scanner = regex.compile(r"\s").scanner("a b") + self.assertEqual(scanner.search().span(), (1, 2)) + self.assertEqual(scanner.search(), None) + + def test_bug_817234(self): + it = regex.finditer(r".*", "asdf") + self.assertEqual(it.next().span(), (0, 4)) + self.assertEqual(it.next().span(), (4, 4)) + self.assertRaises(StopIteration, lambda: it.next()) + + def test_empty_array(self): + # SF buf 1647541. + import array + for typecode in 'cbBuhHiIlLfd': + a = array.array(typecode) + self.assertEqual(regex.compile("bla").match(a), None) + self.assertEqual(regex.compile("").match(a)[1 : ], ()) + + def test_inline_flags(self): + # Bug #1700. 
+ upper_char = unichr(0x1ea0) # Latin Capital Letter A with Dot Below + lower_char = unichr(0x1ea1) # Latin Small Letter A with Dot Below + + p = regex.compile(upper_char, regex.I | regex.U) + self.assertEqual(bool(p.match(lower_char)), True) + + p = regex.compile(lower_char, regex.I | regex.U) + self.assertEqual(bool(p.match(upper_char)), True) + + p = regex.compile('(?i)' + upper_char, regex.U) + self.assertEqual(bool(p.match(lower_char)), True) + + p = regex.compile('(?i)' + lower_char, regex.U) + self.assertEqual(bool(p.match(upper_char)), True) + + p = regex.compile('(?iu)' + upper_char) + self.assertEqual(bool(p.match(lower_char)), True) + + p = regex.compile('(?iu)' + lower_char) + self.assertEqual(bool(p.match(upper_char)), True) + + self.assertEqual(bool(regex.match(r"(?i)a", "A")), True) + self.assertEqual(bool(regex.match(r"a(?i)", "A")), True) + self.assertEqual(bool(regex.match(r"(?iV1)a", "A")), True) + self.assertEqual(regex.match(r"a(?iV1)", "A"), None) + + def test_dollar_matches_twice(self): + # $ matches the end of string, and just before the terminating \n. + pattern = regex.compile('$') + self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#') + self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#') + self.assertEqual(pattern.sub('#', '\n'), '#\n#') + + pattern = regex.compile('$', regex.MULTILINE) + self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#') + self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#') + self.assertEqual(pattern.sub('#', '\n'), '#\n#') + + def test_ascii_and_unicode_flag(self): + # Unicode patterns. 
+ for flags in (0, regex.UNICODE): + pat = regex.compile(u'\xc0', flags | regex.IGNORECASE) + self.assertEqual(bool(pat.match(u'\xe0')), True) + pat = regex.compile(u'\w', flags) + self.assertEqual(bool(pat.match(u'\xe0')), True) + + pat = regex.compile(u'\xc0', regex.ASCII | regex.IGNORECASE) + self.assertEqual(pat.match(u'\xe0'), None) + pat = regex.compile(u'(?a)\xc0', regex.IGNORECASE) + self.assertEqual(pat.match(u'\xe0'), None) + pat = regex.compile(u'\w', regex.ASCII) + self.assertEqual(pat.match(u'\xe0'), None) + pat = regex.compile(u'(?a)\w') + self.assertEqual(pat.match(u'\xe0'), None) + + # String patterns. + for flags in (0, regex.ASCII): + pat = regex.compile('\xc0', flags | regex.IGNORECASE) + self.assertEqual(pat.match('\xe0'), None) + pat = regex.compile('\w') + self.assertEqual(pat.match('\xe0'), None) + + self.assertRaisesRegex(ValueError, self.MIXED_FLAGS, lambda: + regex.compile('(?au)\w')) + + def test_subscripting_match(self): + m = regex.match(r'(?<a>\w)', 'xy') + if not m: + self.fail("Failed: expected match but returned None") + elif not m or m[0] != m.group(0) or m[1] != m.group(1): + self.fail("Failed") + if not m: + self.fail("Failed: expected match but returned None") + elif m[:] != ('x', 'x'): + self.fail("Failed: expected \"('x', 'x')\" but got %s instead" % + repr(m[:])) + + def test_new_named_groups(self): + m0 = regex.match(r'(?P<a>\w)', 'x') + m1 = regex.match(r'(?<a>\w)', 'x') + if not (m0 and m1 and m0[:] == m1[:]): + self.fail("Failed") + + def test_properties(self): + self.assertEqual(regex.match('(?i)\xC0', '\xE0'), None) + self.assertEqual(regex.match(r'(?i)\xC0', '\xE0'), None) + self.assertEqual(regex.match(r'\w', '\xE0'), None) + self.assertEqual(bool(regex.match(ur'(?u)\w', u'\xE0')), True) + + # Dropped the following test. It's not possible to determine what the + # correct result should be in the general case. 
+# self.assertEqual(bool(regex.match(r'(?L)\w', '\xE0')), +# '\xE0'.isalnum()) + + self.assertEqual(bool(regex.match(r'(?L)\d', '0')), True) + self.assertEqual(bool(regex.match(r'(?L)\s', ' ')), True) + self.assertEqual(bool(regex.match(r'(?L)\w', 'a')), True) + self.assertEqual(regex.match(r'(?L)\d', '?'), None) + self.assertEqual(regex.match(r'(?L)\s', '?'), None) + self.assertEqual(regex.match(r'(?L)\w', '?'), None) + + self.assertEqual(regex.match(r'(?L)\D', '0'), None) + self.assertEqual(regex.match(r'(?L)\S', ' '), None) + self.assertEqual(regex.match(r'(?L)\W', 'a'), None) + self.assertEqual(bool(regex.match(r'(?L)\D', '?')), True) + self.assertEqual(bool(regex.match(r'(?L)\S', '?')), True) + self.assertEqual(bool(regex.match(r'(?L)\W', '?')), True) + + self.assertEqual(bool(regex.match(ur'(?u)\p{Cyrillic}', + u'\N{CYRILLIC CAPITAL LETTER A}')), True) + self.assertEqual(bool(regex.match(ur'(?u)(?iu)\p{Cyrillic}', + u'\N{CYRILLIC CAPITAL LETTER A}')), True) + self.assertEqual(bool(regex.match(ur'(?u)\p{IsCyrillic}', + u'\N{CYRILLIC CAPITAL LETTER A}')), True) + self.assertEqual(bool(regex.match(ur'(?u)\p{Script=Cyrillic}', + u'\N{CYRILLIC CAPITAL LETTER A}')), True) + self.assertEqual(bool(regex.match(ur'(?u)\p{InCyrillic}', + u'\N{CYRILLIC CAPITAL LETTER A}')), True) + self.assertEqual(bool(regex.match(ur'(?u)\p{Block=Cyrillic}', + u'\N{CYRILLIC CAPITAL LETTER A}')), True) + self.assertEqual(bool(regex.match(ur'(?u)[[:Cyrillic:]]', + u'\N{CYRILLIC CAPITAL LETTER A}')), True) + self.assertEqual(bool(regex.match(ur'(?u)[[:IsCyrillic:]]', + u'\N{CYRILLIC CAPITAL LETTER A}')), True) + self.assertEqual(bool(regex.match(ur'(?u)[[:Script=Cyrillic:]]', + u'\N{CYRILLIC CAPITAL LETTER A}')), True) + self.assertEqual(bool(regex.match(ur'(?u)[[:InCyrillic:]]', + u'\N{CYRILLIC CAPITAL LETTER A}')), True) + self.assertEqual(bool(regex.match(ur'(?u)[[:Block=Cyrillic:]]', + u'\N{CYRILLIC CAPITAL LETTER A}')), True) + + self.assertEqual(bool(regex.match(ur'(?u)\P{Cyrillic}', 
+ u'\N{LATIN CAPITAL LETTER A}')), True) + self.assertEqual(bool(regex.match(ur'(?u)\P{IsCyrillic}', + u'\N{LATIN CAPITAL LETTER A}')), True) + self.assertEqual(bool(regex.match(ur'(?u)\P{Script=Cyrillic}', + u'\N{LATIN CAPITAL LETTER A}')), True) + self.assertEqual(bool(regex.match(ur'(?u)\P{InCyrillic}', + u'\N{LATIN CAPITAL LETTER A}')), True) + self.assertEqual(bool(regex.match(ur'(?u)\P{Block=Cyrillic}', + u'\N{LATIN CAPITAL LETTER A}')), True) + self.assertEqual(bool(regex.match(ur'(?u)\p{^Cyrillic}', + u'\N{LATIN CAPITAL LETTER A}')), True) + self.assertEqual(bool(regex.match(ur'(?u)\p{^IsCyrillic}', + u'\N{LATIN CAPITAL LETTER A}')), True) + self.assertEqual(bool(regex.match(ur'(?u)\p{^Script=Cyrillic}', + u'\N{LATIN CAPITAL LETTER A}')), True) + self.assertEqual(bool(regex.match(ur'(?u)\p{^InCyrillic}', + u'\N{LATIN CAPITAL LETTER A}')), True) + self.assertEqual(bool(regex.match(ur'(?u)\p{^Block=Cyrillic}', + u'\N{LATIN CAPITAL LETTER A}')), True) + self.assertEqual(bool(regex.match(ur'(?u)[[:^Cyrillic:]]', + u'\N{LATIN CAPITAL LETTER A}')), True) + self.assertEqual(bool(regex.match(ur'(?u)[[:^IsCyrillic:]]', + u'\N{LATIN CAPITAL LETTER A}')), True) + self.assertEqual(bool(regex.match(ur'(?u)[[:^Script=Cyrillic:]]', + u'\N{LATIN CAPITAL LETTER A}')), True) + self.assertEqual(bool(regex.match(ur'(?u)[[:^InCyrillic:]]', + u'\N{LATIN CAPITAL LETTER A}')), True) + self.assertEqual(bool(regex.match(ur'(?u)[[:^Block=Cyrillic:]]', + u'\N{LATIN CAPITAL LETTER A}')), True) + + self.assertEqual(bool(regex.match(ur'(?u)\d', u'0')), True) + self.assertEqual(bool(regex.match(ur'(?u)\s', u' ')), True) + self.assertEqual(bool(regex.match(ur'(?u)\w', u'A')), True) + self.assertEqual(regex.match(ur"(?u)\d", u"?"), None) + self.assertEqual(regex.match(ur"(?u)\s", u"?"), None) + self.assertEqual(regex.match(ur"(?u)\w", u"?"), None) + self.assertEqual(regex.match(ur"(?u)\D", u"0"), None) + self.assertEqual(regex.match(ur"(?u)\S", u" "), None) + 
self.assertEqual(regex.match(ur"(?u)\W", u"A"), None) + self.assertEqual(bool(regex.match(ur'(?u)\D', u'?')), True) + self.assertEqual(bool(regex.match(ur'(?u)\S', u'?')), True) + self.assertEqual(bool(regex.match(ur'(?u)\W', u'?')), True) + + self.assertEqual(bool(regex.match(ur'(?u)\p{L}', u'A')), True) + self.assertEqual(bool(regex.match(ur'(?u)\p{L}', u'a')), True) + self.assertEqual(bool(regex.match(ur'(?u)\p{Lu}', u'A')), True) + self.assertEqual(bool(regex.match(ur'(?u)\p{Ll}', u'a')), True) + + self.assertEqual(bool(regex.match(ur'(?u)(?i)a', u'a')), True) + self.assertEqual(bool(regex.match(ur'(?u)(?i)a', u'A')), True) + + self.assertEqual(bool(regex.match(ur'(?u)\w', u'0')), True) + self.assertEqual(bool(regex.match(ur'(?u)\w', u'a')), True) + self.assertEqual(bool(regex.match(ur'(?u)\w', u'_')), True) + + self.assertEqual(regex.match(ur"(?u)\X", u"\xE0").span(), (0, 1)) + self.assertEqual(regex.match(ur"(?u)\X", u"a\u0300").span(), (0, 2)) + self.assertEqual(regex.findall(ur"(?u)\X", + u"a\xE0a\u0300e\xE9e\u0301"), [u'a', u'\xe0', u'a\u0300', u'e', + u'\xe9', u'e\u0301']) + self.assertEqual(regex.findall(ur"(?u)\X{3}", + u"a\xE0a\u0300e\xE9e\u0301"), [u'a\xe0a\u0300', u'e\xe9e\u0301']) + self.assertEqual(regex.findall(ur"(?u)\X", u"\r\r\n\u0301A\u0301"), + [u'\r', u'\r\n', u'\u0301', u'A\u0301']) + + self.assertEqual(bool(regex.match(ur'(?u)\p{Ll}', u'a')), True) + + chars_u = u"-09AZaz_\u0393\u03b3" + chars_b = "-09AZaz_" + word_set = set("Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc".split()) + + tests = [ + (ur"(?u)\w", chars_u, u"09AZaz_\u0393\u03b3"), + (ur"(?u)[[:word:]]", chars_u, u"09AZaz_\u0393\u03b3"), + (ur"(?u)\W", chars_u, u"-"), + (ur"(?u)[[:^word:]]", chars_u, u"-"), + (ur"(?u)\d", chars_u, u"09"), + (ur"(?u)[[:digit:]]", chars_u, u"09"), + (ur"(?u)\D", chars_u, u"-AZaz_\u0393\u03b3"), + (ur"(?u)[[:^digit:]]", chars_u, u"-AZaz_\u0393\u03b3"), + (ur"(?u)[[:alpha:]]", chars_u, u"AZaz\u0393\u03b3"), + (ur"(?u)[[:^alpha:]]", chars_u, u"-09_"), + 
(ur"(?u)[[:alnum:]]", chars_u, u"09AZaz\u0393\u03b3"), + (ur"(?u)[[:^alnum:]]", chars_u, u"-_"), + (ur"(?u)[[:xdigit:]]", chars_u, u"09Aa"), + (ur"(?u)[[:^xdigit:]]", chars_u, u"-Zz_\u0393\u03b3"), + (ur"(?u)\p{InBasicLatin}", u"a\xE1", u"a"), + (ur"(?u)\P{InBasicLatin}", u"a\xE1", u"\xE1"), + (ur"(?iu)\p{InBasicLatin}", u"a\xE1", u"a"), + (ur"(?iu)\P{InBasicLatin}", u"a\xE1", u"\xE1"), + + (r"(?L)\w", chars_b, "09AZaz_"), + (r"(?L)[[:word:]]", chars_b, "09AZaz_"), + (r"(?L)\W", chars_b, "-"), + (r"(?L)[[:^word:]]", chars_b, "-"), + (r"(?L)\d", chars_b, "09"), + (r"(?L)[[:digit:]]", chars_b, "09"), + (r"(?L)\D", chars_b, "-AZaz_"), + (r"(?L)[[:^digit:]]", chars_b, "-AZaz_"), + (r"(?L)[[:alpha:]]", chars_b, "AZaz"), + (r"(?L)[[:^alpha:]]", chars_b, "-09_"), + (r"(?L)[[:alnum:]]", chars_b, "09AZaz"), + (r"(?L)[[:^alnum:]]", chars_b, "-_"), + (r"(?L)[[:xdigit:]]", chars_b, "09Aa"), + (r"(?L)[[:^xdigit:]]", chars_b, "-Zz_"), + + (r"\w", chars_b, "09AZaz_"), + (r"[[:word:]]", chars_b, "09AZaz_"), + (r"\W", chars_b, "-"), + (r"[[:^word:]]", chars_b, "-"), + (r"\d", chars_b, "09"), + (r"[[:digit:]]", chars_b, "09"), + (r"\D", chars_b, "-AZaz_"), + (r"[[:^digit:]]", chars_b, "-AZaz_"), + (r"[[:alpha:]]", chars_b, "AZaz"), + (r"[[:^alpha:]]", chars_b, "-09_"), + (r"[[:alnum:]]", chars_b, "09AZaz"), + (r"[[:^alnum:]]", chars_b, "-_"), + (r"[[:xdigit:]]", chars_b, "09Aa"), + (r"[[:^xdigit:]]", chars_b, "-Zz_"), + ] + for pattern, chars, expected in tests: + try: + if chars[ : 0].join(regex.findall(pattern, chars)) != expected: + self.fail("Failed: %s" % pattern) + except Exception, e: + self.fail("Failed: %s raised %s" % (pattern, repr(e))) + + self.assertEqual(bool(regex.match(ur"(?u)\p{NumericValue=0}", u"0")), + True) + self.assertEqual(bool(regex.match(ur"(?u)\p{NumericValue=1/2}", + u"\N{VULGAR FRACTION ONE HALF}")), True) + self.assertEqual(bool(regex.match(ur"(?u)\p{NumericValue=0.5}", + u"\N{VULGAR FRACTION ONE HALF}")), True) + + def test_word_class(self): + 
self.assertEqual(regex.findall(ur"(?u)\w+", + u" \u0939\u093f\u0928\u094d\u0926\u0940,"), + [u'\u0939\u093f\u0928\u094d\u0926\u0940']) + self.assertEqual(regex.findall(ur"(?u)\W+", + u" \u0939\u093f\u0928\u094d\u0926\u0940,"), [u' ', u',']) + self.assertEqual(regex.split(ur"(?uV1)\b", + u" \u0939\u093f\u0928\u094d\u0926\u0940,"), [u' ', + u'\u0939\u093f\u0928\u094d\u0926\u0940', u',']) + self.assertEqual(regex.split(ur"(?uV1)\B", + u" \u0939\u093f\u0928\u094d\u0926\u0940,"), [u'', u' \u0939', + u'\u093f', u'\u0928', u'\u094d', u'\u0926', u'\u0940,', u'']) + + def test_search_anchor(self): + self.assertEqual(regex.findall(r"\G\w{2}", "abcd ef"), ['ab', 'cd']) + + def test_search_reverse(self): + self.assertEqual(regex.findall(r"(?r).", "abc"), ['c', 'b', 'a']) + self.assertEqual(regex.findall(r"(?r).", "abc", overlapped=True), ['c', + 'b', 'a']) + self.assertEqual(regex.findall(r"(?r)..", "abcde"), ['de', 'bc']) + self.assertEqual(regex.findall(r"(?r)..", "abcde", overlapped=True), + ['de', 'cd', 'bc', 'ab']) + self.assertEqual(regex.findall(r"(?r)(.)(-)(.)", "a-b-c", + overlapped=True), [("b", "-", "c"), ("a", "-", "b")]) + + self.assertEqual([m[0] for m in regex.finditer(r"(?r).", "abc")], ['c', + 'b', 'a']) + self.assertEqual([m[0] for m in regex.finditer(r"(?r)..", "abcde", + overlapped=True)], ['de', 'cd', 'bc', 'ab']) + self.assertEqual([m[0] for m in regex.finditer(r"(?r).", "abc")], ['c', + 'b', 'a']) + self.assertEqual([m[0] for m in regex.finditer(r"(?r)..", "abcde", + overlapped=True)], ['de', 'cd', 'bc', 'ab']) + + self.assertEqual(regex.findall(r"^|\w+", "foo bar"), ['', 'foo', + 'bar']) + self.assertEqual(regex.findall(r"(?V1)^|\w+", "foo bar"), ['', 'foo', + 'bar']) + self.assertEqual(regex.findall(r"(?r)^|\w+", "foo bar"), ['bar', 'foo', + '']) + self.assertEqual(regex.findall(r"(?rV1)^|\w+", "foo bar"), ['bar', + 'foo', '']) + + self.assertEqual([m[0] for m in regex.finditer(r"^|\w+", "foo bar")], + ['', 'foo', 'bar']) + self.assertEqual([m[0] for m 
in regex.finditer(r"(?V1)^|\w+", + "foo bar")], ['', 'foo', 'bar']) + self.assertEqual([m[0] for m in regex.finditer(r"(?r)^|\w+", + "foo bar")], ['bar', 'foo', '']) + self.assertEqual([m[0] for m in regex.finditer(r"(?rV1)^|\w+", + "foo bar")], ['bar', 'foo', '']) + + self.assertEqual(regex.findall(r"\G\w{2}", "abcd ef"), ['ab', 'cd']) + self.assertEqual(regex.findall(r".{2}(?<=\G.*)", "abcd"), ['ab', 'cd']) + self.assertEqual(regex.findall(r"(?r)\G\w{2}", "abcd ef"), []) + self.assertEqual(regex.findall(r"(?r)\w{2}\G", "abcd ef"), ['ef']) + + self.assertEqual(regex.findall(r"q*", "qqwe"), ['qq', '', '', '']) + self.assertEqual(regex.findall(r"(?V1)q*", "qqwe"), ['qq', '', '', '']) + self.assertEqual(regex.findall(r"(?r)q*", "qqwe"), ['', '', 'qq', '']) + self.assertEqual(regex.findall(r"(?rV1)q*", "qqwe"), ['', '', 'qq', + '']) + + self.assertEqual(regex.findall(".", "abcd", pos=1, endpos=3), ['b', + 'c']) + self.assertEqual(regex.findall(".", "abcd", pos=1, endpos=-1), ['b', + 'c']) + self.assertEqual([m[0] for m in regex.finditer(".", "abcd", pos=1, + endpos=3)], ['b', 'c']) + self.assertEqual([m[0] for m in regex.finditer(".", "abcd", pos=1, + endpos=-1)], ['b', 'c']) + + self.assertEqual([m[0] for m in regex.finditer("(?r).", "abcd", pos=1, + endpos=3)], ['c', 'b']) + self.assertEqual([m[0] for m in regex.finditer("(?r).", "abcd", pos=1, + endpos=-1)], ['c', 'b']) + self.assertEqual(regex.findall("(?r).", "abcd", pos=1, endpos=3), ['c', + 'b']) + self.assertEqual(regex.findall("(?r).", "abcd", pos=1, endpos=-1), + ['c', 'b']) + + self.assertEqual(regex.findall(r"[ab]", "aB", regex.I), ['a', 'B']) + self.assertEqual(regex.findall(r"(?r)[ab]", "aB", regex.I), ['B', 'a']) + + self.assertEqual(regex.findall(r"(?r).{2}", "abc"), ['bc']) + self.assertEqual(regex.findall(r"(?r).{2}", "abc", overlapped=True), + ['bc', 'ab']) + self.assertEqual(regex.findall(r"(\w+) (\w+)", + "first second third fourth fifth"), [('first', 'second'), ('third', + 'fourth')]) + 
self.assertEqual(regex.findall(r"(?r)(\w+) (\w+)", + "first second third fourth fifth"), [('fourth', 'fifth'), ('second', + 'third')]) + + self.assertEqual([m[0] for m in regex.finditer(r"(?r).{2}", "abc")], + ['bc']) + self.assertEqual([m[0] for m in regex.finditer(r"(?r).{2}", "abc", + overlapped=True)], ['bc', 'ab']) + self.assertEqual([m[0] for m in regex.finditer(r"(\w+) (\w+)", + "first second third fourth fifth")], ['first second', + 'third fourth']) + self.assertEqual([m[0] for m in regex.finditer(r"(?r)(\w+) (\w+)", + "first second third fourth fifth")], ['fourth fifth', + 'second third']) + + self.assertEqual(regex.search("abcdef", "abcdef").span(), (0, 6)) + self.assertEqual(regex.search("(?r)abcdef", "abcdef").span(), (0, 6)) + self.assertEqual(regex.search("(?i)abcdef", "ABCDEF").span(), (0, 6)) + self.assertEqual(regex.search("(?ir)abcdef", "ABCDEF").span(), (0, 6)) + + self.assertEqual(regex.sub(r"(.)", r"\1", "abc"), 'abc') + self.assertEqual(regex.sub(r"(?r)(.)", r"\1", "abc"), 'abc') + + def test_atomic(self): + # Issue 433030. + self.assertEqual(regex.search(r"(?>a*)a", "aa"), None) + + def test_possessive(self): + # Single-character non-possessive. + self.assertEqual(regex.search(r"a?a", "a").span(), (0, 1)) + self.assertEqual(regex.search(r"a*a", "aaa").span(), (0, 3)) + self.assertEqual(regex.search(r"a+a", "aaa").span(), (0, 3)) + self.assertEqual(regex.search(r"a{1,3}a", "aaa").span(), (0, 3)) + + # Multiple-character non-possessive. + self.assertEqual(regex.search(r"(?:ab)?ab", "ab").span(), (0, 2)) + self.assertEqual(regex.search(r"(?:ab)*ab", "ababab").span(), (0, 6)) + self.assertEqual(regex.search(r"(?:ab)+ab", "ababab").span(), (0, 6)) + self.assertEqual(regex.search(r"(?:ab){1,3}ab", "ababab").span(), (0, + 6)) + + # Single-character possessive. 
+ self.assertEqual(regex.search(r"a?+a", "a"), None) + self.assertEqual(regex.search(r"a*+a", "aaa"), None) + self.assertEqual(regex.search(r"a++a", "aaa"), None) + self.assertEqual(regex.search(r"a{1,3}+a", "aaa"), None) + + # Multiple-character possessive. + self.assertEqual(regex.search(r"(?:ab)?+ab", "ab"), None) + self.assertEqual(regex.search(r"(?:ab)*+ab", "ababab"), None) + self.assertEqual(regex.search(r"(?:ab)++ab", "ababab"), None) + self.assertEqual(regex.search(r"(?:ab){1,3}+ab", "ababab"), None) + + def test_zerowidth(self): + # Issue 3262. + self.assertEqual(regex.split(r"\b", "a b"), ['a b']) + self.assertEqual(regex.split(r"(?V1)\b", "a b"), ['', 'a', ' ', 'b', + '']) + + # Issue 1647489. + self.assertEqual(regex.findall(r"^|\w+", "foo bar"), ['', 'foo', + 'bar']) + self.assertEqual([m[0] for m in regex.finditer(r"^|\w+", "foo bar")], + ['', 'foo', 'bar']) + self.assertEqual(regex.findall(r"(?r)^|\w+", "foo bar"), ['bar', 'foo', + '']) + self.assertEqual([m[0] for m in regex.finditer(r"(?r)^|\w+", + "foo bar")], ['bar', 'foo', '']) + self.assertEqual(regex.findall(r"(?V1)^|\w+", "foo bar"), ['', 'foo', + 'bar']) + self.assertEqual([m[0] for m in regex.finditer(r"(?V1)^|\w+", + "foo bar")], ['', 'foo', 'bar']) + self.assertEqual(regex.findall(r"(?rV1)^|\w+", "foo bar"), ['bar', + 'foo', '']) + self.assertEqual([m[0] for m in regex.finditer(r"(?rV1)^|\w+", + "foo bar")], ['bar', 'foo', '']) + + self.assertEqual(regex.split("", "xaxbxc"), ['xaxbxc']) + self.assertEqual([m for m in regex.splititer("", "xaxbxc")], + ['xaxbxc']) + + self.assertEqual(regex.split("(?r)", "xaxbxc"), ['xaxbxc']) + self.assertEqual([m for m in regex.splititer("(?r)", "xaxbxc")], + ['xaxbxc']) + + self.assertEqual(regex.split("(?V1)", "xaxbxc"), ['', 'x', 'a', 'x', + 'b', 'x', 'c', '']) + self.assertEqual([m for m in regex.splititer("(?V1)", "xaxbxc")], ['', + 'x', 'a', 'x', 'b', 'x', 'c', '']) + + self.assertEqual(regex.split("(?rV1)", "xaxbxc"), ['', 'c', 'x', 'b', + 'x', 
'a', 'x', '']) + self.assertEqual([m for m in regex.splititer("(?rV1)", "xaxbxc")], ['', + 'c', 'x', 'b', 'x', 'a', 'x', '']) + + def test_scoped_and_inline_flags(self): + # Issues 433028, 433024, 433027. + self.assertEqual(regex.search(r"(?i)Ab", "ab").span(), (0, 2)) + self.assertEqual(regex.search(r"(?i:A)b", "ab").span(), (0, 2)) + self.assertEqual(regex.search(r"A(?i)b", "ab").span(), (0, 2)) + self.assertEqual(regex.search(r"A(?iV1)b", "ab"), None) + + self.assertRaisesRegex(regex.error, self.CANT_TURN_OFF, lambda: + regex.search(r"(?V0-i)Ab", "ab", flags=regex.I)) + + self.assertEqual(regex.search(r"(?V0)Ab", "ab"), None) + self.assertEqual(regex.search(r"(?V1)Ab", "ab"), None) + self.assertEqual(regex.search(r"(?V1-i)Ab", "ab", flags=regex.I), None) + self.assertEqual(regex.search(r"(?-i:A)b", "ab", flags=regex.I), None) + self.assertEqual(regex.search(r"A(?V1-i)b", "ab", + flags=regex.I).span(), (0, 2)) + + def test_repeated_repeats(self): + # Issue 2537. + self.assertEqual(regex.search(r"(?:a+)+", "aaa").span(), (0, 3)) + self.assertEqual(regex.search(r"(?:(?:ab)+c)+", "abcabc").span(), (0, + 6)) + + def test_lookbehind(self): + self.assertEqual(regex.search(r"123(?<=a\d+)", "a123").span(), (1, 4)) + self.assertEqual(regex.search(r"123(?<=a\d+)", "b123"), None) + self.assertEqual(regex.search(r"123(?<!a\d+)", "a123"), None) + self.assertEqual(regex.search(r"123(?<!a\d+)", "b123").span(), (1, 4)) + + self.assertEqual(bool(regex.match("(a)b(?<=b)(c)", "abc")), True) + self.assertEqual(regex.match("(a)b(?<=c)(c)", "abc"), None) + self.assertEqual(bool(regex.match("(a)b(?=c)(c)", "abc")), True) + self.assertEqual(regex.match("(a)b(?=b)(c)", "abc"), None) + + self.assertEqual(regex.match("(?:(a)|(x))b(?<=(?(2)x|c))c", "abc"), + None) + self.assertEqual(regex.match("(?:(a)|(x))b(?<=(?(2)b|x))c", "abc"), + None) + self.assertEqual(bool(regex.match("(?:(a)|(x))b(?<=(?(2)x|b))c", + "abc")), True) + self.assertEqual(regex.match("(?:(a)|(x))b(?<=(?(1)c|x))c", 
"abc"), + None) + self.assertEqual(bool(regex.match("(?:(a)|(x))b(?<=(?(1)b|x))c", + "abc")), True) + + self.assertEqual(bool(regex.match("(?:(a)|(x))b(?=(?(2)x|c))c", + "abc")), True) + self.assertEqual(regex.match("(?:(a)|(x))b(?=(?(2)c|x))c", "abc"), + None) + self.assertEqual(bool(regex.match("(?:(a)|(x))b(?=(?(2)x|c))c", + "abc")), True) + self.assertEqual(regex.match("(?:(a)|(x))b(?=(?(1)b|x))c", "abc"), + None) + self.assertEqual(bool(regex.match("(?:(a)|(x))b(?=(?(1)c|x))c", + "abc")), True) + + self.assertEqual(regex.match("(a)b(?<=(?(2)x|c))(c)", "abc"), None) + self.assertEqual(regex.match("(a)b(?<=(?(2)b|x))(c)", "abc"), None) + self.assertEqual(regex.match("(a)b(?<=(?(1)c|x))(c)", "abc"), None) + self.assertEqual(bool(regex.match("(a)b(?<=(?(1)b|x))(c)", "abc")), + True) + + self.assertEqual(bool(regex.match("(a)b(?=(?(2)x|c))(c)", "abc")), + True) + self.assertEqual(regex.match("(a)b(?=(?(2)b|x))(c)", "abc"), None) + self.assertEqual(bool(regex.match("(a)b(?=(?(1)c|x))(c)", "abc")), + True) + + self.assertEqual(repr(type(regex.compile(r"(a)\2(b)"))), + self.PATTERN_CLASS) + + def test_unmatched_in_sub(self): + # Issue 1519638. + self.assertEqual(regex.sub(r"(?V0)(x)?(y)?", r"\2-\1", "xy"), 'y-x') + self.assertEqual(regex.sub(r"(?V1)(x)?(y)?", r"\2-\1", "xy"), 'y-x-') + self.assertEqual(regex.sub(r"(?V0)(x)?(y)?", r"\2-\1", "x"), '-x') + self.assertEqual(regex.sub(r"(?V1)(x)?(y)?", r"\2-\1", "x"), '-x-') + self.assertEqual(regex.sub(r"(?V0)(x)?(y)?", r"\2-\1", "y"), 'y-') + self.assertEqual(regex.sub(r"(?V1)(x)?(y)?", r"\2-\1", "y"), 'y--') + + def test_bug_10328 (self): + # Issue 10328. 
+ pat = regex.compile(r'(?mV0)(?P<trailing_ws>[ \t]+\r*$)|(?P<no_final_newline>(?<=[^\n])\Z)') + self.assertEqual(pat.subn(lambda m: '<' + m.lastgroup + '>', + 'foobar '), ('foobar<trailing_ws>', 1)) + self.assertEqual([m.group() for m in pat.finditer('foobar ')], [' ', + '']) + pat = regex.compile(r'(?mV1)(?P<trailing_ws>[ \t]+\r*$)|(?P<no_final_newline>(?<=[^\n])\Z)') + self.assertEqual(pat.subn(lambda m: '<' + m.lastgroup + '>', + 'foobar '), ('foobar<trailing_ws><no_final_newline>', 2)) + self.assertEqual([m.group() for m in pat.finditer('foobar ')], [' ', + '']) + + def test_overlapped(self): + self.assertEqual(regex.findall(r"..", "abcde"), ['ab', 'cd']) + self.assertEqual(regex.findall(r"..", "abcde", overlapped=True), ['ab', + 'bc', 'cd', 'de']) + self.assertEqual(regex.findall(r"(?r)..", "abcde"), ['de', 'bc']) + self.assertEqual(regex.findall(r"(?r)..", "abcde", overlapped=True), + ['de', 'cd', 'bc', 'ab']) + self.assertEqual(regex.findall(r"(.)(-)(.)", "a-b-c", overlapped=True), + [("a", "-", "b"), ("b", "-", "c")]) + + self.assertEqual([m[0] for m in regex.finditer(r"..", "abcde")], ['ab', + 'cd']) + self.assertEqual([m[0] for m in regex.finditer(r"..", "abcde", + overlapped=True)], ['ab', 'bc', 'cd', 'de']) + self.assertEqual([m[0] for m in regex.finditer(r"(?r)..", "abcde")], + ['de', 'bc']) + self.assertEqual([m[0] for m in regex.finditer(r"(?r)..", "abcde", + overlapped=True)], ['de', 'cd', 'bc', 'ab']) + + self.assertEqual([m.groups() for m in regex.finditer(r"(.)(-)(.)", + "a-b-c", overlapped=True)], [("a", "-", "b"), ("b", "-", "c")]) + self.assertEqual([m.groups() for m in regex.finditer(r"(?r)(.)(-)(.)", + "a-b-c", overlapped=True)], [("b", "-", "c"), ("a", "-", "b")]) + + def test_splititer(self): + self.assertEqual(regex.split(r",", "a,b,,c,"), ['a', 'b', '', 'c', '']) + self.assertEqual([m for m in regex.splititer(r",", "a,b,,c,")], ['a', + 'b', '', 'c', '']) + + def test_grapheme(self): + self.assertEqual(regex.match(ur"(?u)\X", 
u"\xE0").span(), (0, 1)) + self.assertEqual(regex.match(ur"(?u)\X", u"a\u0300").span(), (0, 2)) + + self.assertEqual(regex.findall(ur"(?u)\X", + u"a\xE0a\u0300e\xE9e\u0301"), [u'a', u'\xe0', u'a\u0300', u'e', + u'\xe9', u'e\u0301']) + self.assertEqual(regex.findall(ur"(?u)\X{3}", + u"a\xE0a\u0300e\xE9e\u0301"), [u'a\xe0a\u0300', u'e\xe9e\u0301']) + self.assertEqual(regex.findall(ur"(?u)\X", u"\r\r\n\u0301A\u0301"), + [u'\r', u'\r\n', u'\u0301', u'A\u0301']) + + def test_word_boundary(self): + text = u'The quick ("brown") fox can\'t jump 32.3 feet, right?' + self.assertEqual(regex.split(ur'(?V1)\b', text), [u'', u'The', u' ', + u'quick', u' ("', u'brown', u'") ', u'fox', u' ', u'can', u"'", u't', + u' ', u'jump', u' ', u'32', u'.', u'3', u' ', u'feet', u', ', + u'right', u'?']) + self.assertEqual(regex.split(ur'(?V1w)\b', text), [u'', u'The', u' ', + u'quick', u' ', u'(', u'"', u'brown', u'"', u')', u' ', u'fox', u' ', + u"can't", u' ', u'jump', u' ', u'32.3', u' ', u'feet', u',', u' ', + u'right', u'?', u'']) + + text = u"The fox" + self.assertEqual(regex.split(ur'(?V1)\b', text), [u'', u'The', u' ', + u'fox', u'']) + self.assertEqual(regex.split(ur'(?V1w)\b', text), [u'', u'The', u' ', + u' ', u'fox', u'']) + + text = u"can't aujourd'hui l'objectif" + self.assertEqual(regex.split(ur'(?V1)\b', text), [u'', u'can', u"'", + u't', u' ', u'aujourd', u"'", u'hui', u' ', u'l', u"'", u'objectif', + u'']) + self.assertEqual(regex.split(ur'(?V1w)\b', text), [u'', u"can't", u' ', + u"aujourd'hui", u' ', u"l'", u'objectif', u'']) + + def test_line_boundary(self): + self.assertEqual(regex.findall(r".+", "Line 1\nLine 2\n"), ["Line 1", + "Line 2"]) + self.assertEqual(regex.findall(r".+", "Line 1\rLine 2\r"), + ["Line 1\rLine 2\r"]) + self.assertEqual(regex.findall(r".+", "Line 1\r\nLine 2\r\n"), + ["Line 1\r", "Line 2\r"]) + self.assertEqual(regex.findall(r"(?w).+", "Line 1\nLine 2\n"), + ["Line 1", "Line 2"]) + self.assertEqual(regex.findall(r"(?w).+", "Line 1\rLine 2\r"), + 
["Line 1", "Line 2"]) + self.assertEqual(regex.findall(r"(?w).+", "Line 1\r\nLine 2\r\n"), + ["Line 1", "Line 2"]) + + self.assertEqual(regex.search(r"^abc", "abc").start(), 0) + self.assertEqual(regex.search(r"^abc", "\nabc"), None) + self.assertEqual(regex.search(r"^abc", "\rabc"), None) + self.assertEqual(regex.search(r"(?w)^abc", "abc").start(), 0) + self.assertEqual(regex.search(r"(?w)^abc", "\nabc"), None) + self.assertEqual(regex.search(r"(?w)^abc", "\rabc"), None) + + self.assertEqual(regex.search(r"abc$", "abc").start(), 0) + self.assertEqual(regex.search(r"abc$", "abc\n").start(), 0) + self.assertEqual(regex.search(r"abc$", "abc\r"), None) + self.assertEqual(regex.search(r"(?w)abc$", "abc").start(), 0) + self.assertEqual(regex.search(r"(?w)abc$", "abc\n").start(), 0) + self.assertEqual(regex.search(r"(?w)abc$", "abc\r").start(), 0) + + self.assertEqual(regex.search(r"(?m)^abc", "abc").start(), 0) + self.assertEqual(regex.search(r"(?m)^abc", "\nabc").start(), 1) + self.assertEqual(regex.search(r"(?m)^abc", "\rabc"), None) + self.assertEqual(regex.search(r"(?mw)^abc", "abc").start(), 0) + self.assertEqual(regex.search(r"(?mw)^abc", "\nabc").start(), 1) + self.assertEqual(regex.search(r"(?mw)^abc", "\rabc").start(), 1) + + self.assertEqual(regex.search(r"(?m)abc$", "abc").start(), 0) + self.assertEqual(regex.search(r"(?m)abc$", "abc\n").start(), 0) + self.assertEqual(regex.search(r"(?m)abc$", "abc\r"), None) + self.assertEqual(regex.search(r"(?mw)abc$", "abc").start(), 0) + self.assertEqual(regex.search(r"(?mw)abc$", "abc\n").start(), 0) + self.assertEqual(regex.search(r"(?mw)abc$", "abc\r").start(), 0) + + def test_branch_reset(self): + self.assertEqual(regex.match(r"(?:(a)|(b))(c)", "ac").groups(), ('a', + None, 'c')) + self.assertEqual(regex.match(r"(?:(a)|(b))(c)", "bc").groups(), (None, + 'b', 'c')) + self.assertEqual(regex.match(r"(?:(?<a>a)|(?<b>b))(?<c>c)", + "ac").groups(), ('a', None, 'c')) + 
self.assertEqual(regex.match(r"(?:(?<a>a)|(?<b>b))(?<c>c)", + "bc").groups(), (None, 'b', 'c')) + + self.assertEqual(regex.match(r"(?<a>a)(?:(?<b>b)|(?<c>c))(?<d>d)", + "abd").groups(), ('a', 'b', None, 'd')) + self.assertEqual(regex.match(r"(?<a>a)(?:(?<b>b)|(?<c>c))(?<d>d)", + "acd").groups(), ('a', None, 'c', 'd')) + self.assertEqual(regex.match(r"(a)(?:(b)|(c))(d)", "abd").groups(), + ('a', 'b', None, 'd')) + + self.assertEqual(regex.match(r"(a)(?:(b)|(c))(d)", "acd").groups(), + ('a', None, 'c', 'd')) + self.assertEqual(regex.match(r"(a)(?|(b)|(b))(d)", "abd").groups(), + ('a', 'b', 'd')) + self.assertEqual(regex.match(r"(?|(?<a>a)|(?<b>b))(c)", "ac").groups(), + ('a', None, 'c')) + self.assertEqual(regex.match(r"(?|(?<a>a)|(?<b>b))(c)", "bc").groups(), + (None, 'b', 'c')) + self.assertEqual(regex.match(r"(?|(?<a>a)|(?<a>b))(c)", "ac").groups(), + ('a', 'c')) + + self.assertEqual(regex.match(r"(?|(?<a>a)|(?<a>b))(c)", "bc").groups(), + ('b', 'c')) + + self.assertEqual(regex.match(r"(?|(?<a>a)(?<b>b)|(?<b>c)(?<a>d))(e)", + "abe").groups(), ('a', 'b', 'e')) + self.assertEqual(regex.match(r"(?|(?<a>a)(?<b>b)|(?<b>c)(?<a>d))(e)", + "cde").groups(), ('d', 'c', 'e')) + self.assertEqual(regex.match(r"(?|(?<a>a)(?<b>b)|(?<b>c)(d))(e)", + "abe").groups(), ('a', 'b', 'e')) + self.assertEqual(regex.match(r"(?|(?<a>a)(?<b>b)|(?<b>c)(d))(e)", + "cde").groups(), ('d', 'c', 'e')) + self.assertEqual(regex.match(r"(?|(?<a>a)(?<b>b)|(c)(d))(e)", + "abe").groups(), ('a', 'b', 'e')) + self.assertEqual(regex.match(r"(?|(?<a>a)(?<b>b)|(c)(d))(e)", + "cde").groups(), ('c', 'd', 'e')) + + # Hg issue 87. 
+ self.assertEqual(regex.match(r"(?|(?<a>a)(?<b>b)|(c)(?<a>d))(e)", + "abe").groups(), ("a", "b", "e")) + self.assertEqual(regex.match(r"(?|(?<a>a)(?<b>b)|(c)(?<a>d))(e)", + "abe").capturesdict(), {"a": ["a"], "b": ["b"]}) + self.assertEqual(regex.match(r"(?|(?<a>a)(?<b>b)|(c)(?<a>d))(e)", + "cde").groups(), ("d", None, "e")) + self.assertEqual(regex.match(r"(?|(?<a>a)(?<b>b)|(c)(?<a>d))(e)", + "cde").capturesdict(), {"a": ["c", "d"], "b": []}) + + def test_set(self): + self.assertEqual(regex.match(r"[a]", "a").span(), (0, 1)) + self.assertEqual(regex.match(r"(?i)[a]", "A").span(), (0, 1)) + self.assertEqual(regex.match(r"[a-b]", r"a").span(), (0, 1)) + self.assertEqual(regex.match(r"(?i)[a-b]", r"A").span(), (0, 1)) + + self.assertEqual(regex.sub(r"(?V0)([][])", r"-", "a[b]c"), "a-b-c") + + self.assertEqual(regex.findall(ur"[\p{Alpha}]", u"a0"), [u"a"]) + self.assertEqual(regex.findall(ur"(?i)[\p{Alpha}]", u"A0"), [u"A"]) + + self.assertEqual(regex.findall(ur"[a\p{Alpha}]", u"ab0"), [u"a", u"b"]) + self.assertEqual(regex.findall(ur"[a\P{Alpha}]", u"ab0"), [u"a", u"0"]) + self.assertEqual(regex.findall(ur"(?i)[a\p{Alpha}]", u"ab0"), [u"a", + u"b"]) + self.assertEqual(regex.findall(ur"(?i)[a\P{Alpha}]", u"ab0"), [u"a", + u"0"]) + + self.assertEqual(regex.findall(ur"[a-b\p{Alpha}]", u"abC0"), [u"a", + u"b", u"C"]) + self.assertEqual(regex.findall(ur"(?i)[a-b\p{Alpha}]", u"AbC0"), [u"A", + u"b", u"C"]) + + self.assertEqual(regex.findall(ur"[\p{Alpha}]", u"a0"), [u"a"]) + self.assertEqual(regex.findall(ur"[\P{Alpha}]", u"a0"), [u"0"]) + self.assertEqual(regex.findall(ur"[^\p{Alpha}]", u"a0"), [u"0"]) + self.assertEqual(regex.findall(ur"[^\P{Alpha}]", u"a0"), [u"a"]) + + self.assertEqual("".join(regex.findall(r"[^\d-h]", "a^b12c-h")), + 'a^bc') + self.assertEqual("".join(regex.findall(r"[^\dh]", "a^b12c-h")), + 'a^bc-') + self.assertEqual("".join(regex.findall(r"[^h\s\db]", "a^b 12c-h")), + 'a^c-') + self.assertEqual("".join(regex.findall(r"[^b\w]", "a b")), ' ') + 
self.assertEqual("".join(regex.findall(r"[^b\S]", "a b")), ' ') + self.assertEqual("".join(regex.findall(r"[^8\d]", "a 1b2")), 'a b') + + all_chars = u"".join(unichr(c) for c in range(0x100)) + self.assertEqual(len(regex.findall(ur"(?u)\p{ASCII}", all_chars)), 128) + self.assertEqual(len(regex.findall(ur"(?u)\p{Letter}", all_chars)), + 117) + self.assertEqual(len(regex.findall(ur"(?u)\p{Digit}", all_chars)), 10) + + # Set operators + self.assertEqual(len(regex.findall(ur"(?uV1)[\p{ASCII}&&\p{Letter}]", + all_chars)), 52) + self.assertEqual(len(regex.findall(ur"(?uV1)[\p{ASCII}&&\p{Alnum}&&\p{Letter}]", + all_chars)), 52) + self.assertEqual(len(regex.findall(ur"(?uV1)[\p{ASCII}&&\p{Alnum}&&\p{Digit}]", + all_chars)), 10) + self.assertEqual(len(regex.findall(ur"(?uV1)[\p{ASCII}&&\p{Cc}]", + all_chars)), 33) + self.assertEqual(len(regex.findall(ur"(?uV1)[\p{ASCII}&&\p{Graph}]", + all_chars)), 94) + self.assertEqual(len(regex.findall(ur"(?uV1)[\p{ASCII}--\p{Cc}]", + all_chars)), 95) + self.assertEqual(len(regex.findall(ur"(?u)[\p{Letter}\p{Digit}]", + all_chars)), 127) + self.assertEqual(len(regex.findall(ur"(?uV1)[\p{Letter}||\p{Digit}]", + all_chars)), 127) + self.assertEqual(len(regex.findall(ur"(?u)\p{HexDigit}", all_chars)), + 22) + self.assertEqual(len(regex.findall(ur"(?uV1)[\p{HexDigit}~~\p{Digit}]", + all_chars)), 12) + self.assertEqual(len(regex.findall(ur"(?uV1)[\p{Digit}~~\p{HexDigit}]", + all_chars)), 12) + + self.assertEqual(repr(type(regex.compile(r"(?V0)([][-])"))), + self.PATTERN_CLASS) + self.assertEqual(regex.findall(r"(?V1)[[a-z]--[aei]]", "abc"), ["b", + "c"]) + self.assertEqual(regex.findall(r"(?iV1)[[a-z]--[aei]]", "abc"), ["b", + "c"]) + self.assertEqual(regex.findall("(?V1)[\w--a]","abc"), ["b", "c"]) + self.assertEqual(regex.findall("(?iV1)[\w--a]","abc"), ["b", "c"]) + + def test_various(self): + tests = [ + # Test ?P< and ?P= extensions. + ('(?P<foo_123', '', '', regex.error, self.MISSING_GT), # Unterminated group identifier. 
+ ('(?P<1>a)', '', '', regex.error, self.BAD_GROUP_NAME), # Begins with a digit. + ('(?P<!>a)', '', '', regex.error, self.BAD_GROUP_NAME), # Begins with an illegal char. + ('(?P<foo!>a)', '', '', regex.error, self.BAD_GROUP_NAME), # Begins with an illegal char. + + # Same tests, for the ?P= form. + ('(?P<foo_123>a)(?P=foo_123', 'aa', '', regex.error, + self.MISSING_RPAREN), + ('(?P<foo_123>a)(?P=1)', 'aa', '1', repr('a')), + ('(?P<foo_123>a)(?P=0)', 'aa', '', regex.error, + self.BAD_GROUP_NAME), + ('(?P<foo_123>a)(?P=-1)', 'aa', '', regex.error, + self.BAD_GROUP_NAME), + ('(?P<foo_123>a)(?P=!)', 'aa', '', regex.error, + self.BAD_GROUP_NAME), + ('(?P<foo_123>a)(?P=foo_124)', 'aa', '', regex.error, + self.UNKNOWN_GROUP), # Backref to undefined group. + + ('(?P<foo_123>a)', 'a', '1', repr('a')), + ('(?P<foo_123>a)(?P=foo_123)', 'aa', '1', repr('a')), + + # Mal-formed \g in pattern treated as literal for compatibility. + (r'(?<foo_123>a)\g<foo_123', 'aa', '', repr(None)), + (r'(?<foo_123>a)\g<1>', 'aa', '1', repr('a')), + (r'(?<foo_123>a)\g<!>', 'aa', '', repr(None)), + (r'(?<foo_123>a)\g<foo_124>', 'aa', '', regex.error, + self.UNKNOWN_GROUP), # Backref to undefined group. + + ('(?<foo_123>a)', 'a', '1', repr('a')), + (r'(?<foo_123>a)\g<foo_123>', 'aa', '1', repr('a')), + + # Test octal escapes. + ('\\1', 'a', '', regex.error, self.INVALID_GROUP_REF), # Backreference. + ('[\\1]', '\1', '0', "'\\x01'"), # Character. + ('\\09', chr(0) + '9', '0', repr(chr(0) + '9')), + ('\\141', 'a', '0', repr('a')), + ('(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)\\119', 'abcdefghijklk9', + '0,11', repr(('abcdefghijklk9', 'k'))), + + # Test \0 is handled everywhere. + (r'\0', '\0', '0', repr('\0')), + (r'[\0a]', '\0', '0', repr('\0')), + (r'[a\0]', '\0', '0', repr('\0')), + (r'[^a\0]', '\0', '', repr(None)), + + # Test various letter escapes. 
+ (r'\a[\b]\f\n\r\t\v', '\a\b\f\n\r\t\v', '0', + repr('\a\b\f\n\r\t\v')), + (r'[\a][\b][\f][\n][\r][\t][\v]', '\a\b\f\n\r\t\v', '0', + repr('\a\b\f\n\r\t\v')), + (r'\c\e\g\h\i\j\k\o\p\q\y\z', 'ceghijkopqyz', '0', + repr('ceghijkopqyz')), + (r'\xff', '\377', '0', repr(chr(255))), + + # New \x semantics. + (r'\x00ffffffffffffff', '\377', '', repr(None)), + (r'\x00f', '\017', '', repr(None)), + (r'\x00fe', '\376', '', repr(None)), + + (r'\x00ff', '\377', '', repr(None)), + (r'\t\n\v\r\f\a\g', '\t\n\v\r\f\ag', '0', repr('\t\n\v\r\f\ag')), + ('\t\n\v\r\f\a\g', '\t\n\v\r\f\ag', '0', repr('\t\n\v\r\f\ag')), + (r'\t\n\v\r\f\a', '\t\n\v\r\f\a', '0', repr(chr(9) + chr(10) + + chr(11) + chr(13) + chr(12) + chr(7))), + (r'[\t][\n][\v][\r][\f][\b]', '\t\n\v\r\f\b', '0', + repr('\t\n\v\r\f\b')), + + (r"^\w+=(\\[\000-\277]|[^\n\\])*", + "SRC=eval.c g.c blah blah blah \\\\\n\tapes.c", '0', + repr("SRC=eval.c g.c blah blah blah \\\\")), + + # Test that . only matches \n in DOTALL mode. + ('a.b', 'acb', '0', repr('acb')), + ('a.b', 'a\nb', '', repr(None)), + ('a.*b', 'acc\nccb', '', repr(None)), + ('a.{4,5}b', 'acc\nccb', '', repr(None)), + ('a.b', 'a\rb', '0', repr('a\rb')), + # The new behaviour is that the inline flag affects only what follows. + ('a.b(?s)', 'a\nb', '0', repr('a\nb')), + ('a.b(?sV1)', 'a\nb', '', repr(None)), + ('(?s)a.b', 'a\nb', '0', repr('a\nb')), + ('a.*(?s)b', 'acc\nccb', '0', repr('acc\nccb')), + ('a.*(?sV1)b', 'acc\nccb', '', repr(None)), + ('(?s)a.*b', 'acc\nccb', '0', repr('acc\nccb')), + ('(?s)a.{4,5}b', 'acc\nccb', '0', repr('acc\nccb')), + + (')', '', '', regex.error, self.TRAILING_CHARS), # Unmatched right bracket. + ('', '', '0', "''"), # Empty pattern. 
+ ('abc', 'abc', '0', repr('abc')), + ('abc', 'xbc', '', repr(None)), + ('abc', 'axc', '', repr(None)), + ('abc', 'abx', '', repr(None)), + ('abc', 'xabcy', '0', repr('abc')), + ('abc', 'ababc', '0', repr('abc')), + ('ab*c', 'abc', '0', repr('abc')), + ('ab*bc', 'abc', '0', repr('abc')), + + ('ab*bc', 'abbc', '0', repr('abbc')), + ('ab*bc', 'abbbbc', '0', repr('abbbbc')), + ('ab+bc', 'abbc', '0', repr('abbc')), + ('ab+bc', 'abc', '', repr(None)), + ('ab+bc', 'abq', '', repr(None)), + ('ab+bc', 'abbbbc', '0', repr('abbbbc')), + ('ab?bc', 'abbc', '0', repr('abbc')), + ('ab?bc', 'abc', '0', repr('abc')), + ('ab?bc', 'abbbbc', '', repr(None)), + ('ab?c', 'abc', '0', repr('abc')), + + ('^abc$', 'abc', '0', repr('abc')), + ('^abc$', 'abcc', '', repr(None)), + ('^abc', 'abcc', '0', repr('abc')), + ('^abc$', 'aabc', '', repr(None)), + ('abc$', 'aabc', '0', repr('abc')), + ('^', 'abc', '0', repr('')), + ('$', 'abc', '0', repr('')), + ('a.c', 'abc', '0', repr('abc')), + ('a.c', 'axc', '0', repr('axc')), + ('a.*c', 'axyzc', '0', repr('axyzc')), + + ('a.*c', 'axyzd', '', repr(None)), + ('a[bc]d', 'abc', '', repr(None)), + ('a[bc]d', 'abd', '0', repr('abd')), + ('a[b-d]e', 'abd', '', repr(None)), + ('a[b-d]e', 'ace', '0', repr('ace')), + ('a[b-d]', 'aac', '0', repr('ac')), + ('a[-b]', 'a-', '0', repr('a-')), + ('a[\\-b]', 'a-', '0', repr('a-')), + ('a[b-]', 'a-', '0', repr('a-')), + ('a[]b', '-', '', regex.error, self.BAD_SET), + + ('a[', '-', '', regex.error, self.BAD_SET), + ('a\\', '-', '', regex.error, self.BAD_ESCAPE), + ('abc)', '-', '', regex.error, self.TRAILING_CHARS), + ('(abc', '-', '', regex.error, self.MISSING_RPAREN), + ('a]', 'a]', '0', repr('a]')), + ('a[]]b', 'a]b', '0', repr('a]b')), + ('a[]]b', 'a]b', '0', repr('a]b')), + ('a[^bc]d', 'aed', '0', repr('aed')), + ('a[^bc]d', 'abd', '', repr(None)), + ('a[^-b]c', 'adc', '0', repr('adc')), + + ('a[^-b]c', 'a-c', '', repr(None)), + ('a[^]b]c', 'a]c', '', repr(None)), + ('a[^]b]c', 'adc', '0', repr('adc')), + 
('\\ba\\b', 'a-', '0', repr('a')), + ('\\ba\\b', '-a', '0', repr('a')), + ('\\ba\\b', '-a-', '0', repr('a')), + ('\\by\\b', 'xy', '', repr(None)), + ('\\by\\b', 'yz', '', repr(None)), + ('\\by\\b', 'xyz', '', repr(None)), + ('x\\b', 'xyz', '', repr(None)), + + ('x\\B', 'xyz', '0', repr('x')), + ('\\Bz', 'xyz', '0', repr('z')), + ('z\\B', 'xyz', '', repr(None)), + ('\\Bx', 'xyz', '', repr(None)), + ('\\Ba\\B', 'a-', '', repr(None)), + ('\\Ba\\B', '-a', '', repr(None)), + ('\\Ba\\B', '-a-', '', repr(None)), + ('\\By\\B', 'xy', '', repr(None)), + ('\\By\\B', 'yz', '', repr(None)), + ('\\By\\b', 'xy', '0', repr('y')), + + ('\\by\\B', 'yz', '0', repr('y')), + ('\\By\\B', 'xyz', '0', repr('y')), + ('ab|cd', 'abc', '0', repr('ab')), + ('ab|cd', 'abcd', '0', repr('ab')), + ('()ef', 'def', '0,1', repr(('ef', ''))), + ('$b', 'b', '', repr(None)), + ('a\\(b', 'a(b', '', repr(('a(b',))), + ('a\\(*b', 'ab', '0', repr('ab')), + ('a\\(*b', 'a((b', '0', repr('a((b')), + ('a\\\\b', 'a\\b', '0', repr('a\\b')), + + ('((a))', 'abc', '0,1,2', repr(('a', 'a', 'a'))), + ('(a)b(c)', 'abc', '0,1,2', repr(('abc', 'a', 'c'))), + ('a+b+c', 'aabbabc', '0', repr('abc')), + ('(a+|b)*', 'ab', '0,1', repr(('ab', 'b'))), + ('(a+|b)+', 'ab', '0,1', repr(('ab', 'b'))), + ('(a+|b)?', 'ab', '0,1', repr(('a', 'a'))), + (')(', '-', '', regex.error, self.TRAILING_CHARS), + ('[^ab]*', 'cde', '0', repr('cde')), + ('abc', '', '', repr(None)), + ('a*', '', '0', repr('')), + + ('a|b|c|d|e', 'e', '0', repr('e')), + ('(a|b|c|d|e)f', 'ef', '0,1', repr(('ef', 'e'))), + ('abcd*efg', 'abcdefg', '0', repr('abcdefg')), + ('ab*', 'xabyabbbz', '0', repr('ab')), + ('ab*', 'xayabbbz', '0', repr('a')), + ('(ab|cd)e', 'abcde', '0,1', repr(('cde', 'cd'))), + ('[abhgefdc]ij', 'hij', '0', repr('hij')), + ('^(ab|cd)e', 'abcde', '', repr(None)), + ('(abc|)ef', 'abcdef', '0,1', repr(('ef', ''))), + ('(a|b)c*d', 'abcd', '0,1', repr(('bcd', 'b'))), + + ('(ab|ab*)bc', 'abc', '0,1', repr(('abc', 'a'))), + ('a([bc]*)c*', 'abc', '0,1', 
repr(('abc', 'bc'))), + ('a([bc]*)(c*d)', 'abcd', '0,1,2', repr(('abcd', 'bc', 'd'))), + ('a([bc]+)(c*d)', 'abcd', '0,1,2', repr(('abcd', 'bc', 'd'))), + ('a([bc]*)(c+d)', 'abcd', '0,1,2', repr(('abcd', 'b', 'cd'))), + ('a[bcd]*dcdcde', 'adcdcde', '0', repr('adcdcde')), + ('a[bcd]+dcdcde', 'adcdcde', '', repr(None)), + ('(ab|a)b*c', 'abc', '0,1', repr(('abc', 'ab'))), + ('((a)(b)c)(d)', 'abcd', '1,2,3,4', repr(('abc', 'a', 'b', 'd'))), + ('[a-zA-Z_][a-zA-Z0-9_]*', 'alpha', '0', repr('alpha')), + + ('^a(bc+|b[eh])g|.h$', 'abh', '0,1', repr(('bh', None))), + ('(bc+d$|ef*g.|h?i(j|k))', 'effgz', '0,1,2', repr(('effgz', + 'effgz', None))), + ('(bc+d$|ef*g.|h?i(j|k))', 'ij', '0,1,2', repr(('ij', 'ij', + 'j'))), + ('(bc+d$|ef*g.|h?i(j|k))', 'effg', '', repr(None)), + ('(bc+d$|ef*g.|h?i(j|k))', 'bcdd', '', repr(None)), + ('(bc+d$|ef*g.|h?i(j|k))', 'reffgz', '0,1,2', repr(('effgz', + 'effgz', None))), + ('(((((((((a)))))))))', 'a', '0', repr('a')), + ('multiple words of text', 'uh-uh', '', repr(None)), + ('multiple words', 'multiple words, yeah', '0', + repr('multiple words')), + ('(.*)c(.*)', 'abcde', '0,1,2', repr(('abcde', 'ab', 'de'))), + + ('\\((.*), (.*)\\)', '(a, b)', '2,1', repr(('b', 'a'))), + ('[k]', 'ab', '', repr(None)), + ('a[-]?c', 'ac', '0', repr('ac')), + ('(abc)\\1', 'abcabc', '1', repr('abc')), + ('([a-c]*)\\1', 'abcabc', '1', repr('abc')), + ('^(.+)?B', 'AB', '1', repr('A')), + ('(a+).\\1$', 'aaaaa', '0,1', repr(('aaaaa', 'aa'))), + ('^(a+).\\1$', 'aaaa', '', repr(None)), + ('(abc)\\1', 'abcabc', '0,1', repr(('abcabc', 'abc'))), + ('([a-c]+)\\1', 'abcabc', '0,1', repr(('abcabc', 'abc'))), + + ('(a)\\1', 'aa', '0,1', repr(('aa', 'a'))), + ('(a+)\\1', 'aa', '0,1', repr(('aa', 'a'))), + ('(a+)+\\1', 'aa', '0,1', repr(('aa', 'a'))), + ('(a).+\\1', 'aba', '0,1', repr(('aba', 'a'))), + ('(a)ba*\\1', 'aba', '0,1', repr(('aba', 'a'))), + ('(aa|a)a\\1$', 'aaa', '0,1', repr(('aaa', 'a'))), + ('(a|aa)a\\1$', 'aaa', '0,1', repr(('aaa', 'a'))), + ('(a+)a\\1$', 'aaa', 
'0,1', repr(('aaa', 'a'))), + ('([abc]*)\\1', 'abcabc', '0,1', repr(('abcabc', 'abc'))), + ('(a)(b)c|ab', 'ab', '0,1,2', repr(('ab', None, None))), + + ('(a)+x', 'aaax', '0,1', repr(('aaax', 'a'))), + ('([ac])+x', 'aacx', '0,1', repr(('aacx', 'c'))), + ('([^/]*/)*sub1/', 'd:msgs/tdir/sub1/trial/away.cpp', '0,1', + repr(('d:msgs/tdir/sub1/', 'tdir/'))), + ('([^.]*)\\.([^:]*):[T ]+(.*)', 'track1.title:TBlah blah blah', + '0,1,2,3', repr(('track1.title:TBlah blah blah', 'track1', + 'title', 'Blah blah blah'))), + ('([^N]*N)+', 'abNNxyzN', '0,1', repr(('abNNxyzN', 'xyzN'))), + ('([^N]*N)+', 'abNNxyz', '0,1', repr(('abNN', 'N'))), + ('([abc]*)x', 'abcx', '0,1', repr(('abcx', 'abc'))), + ('([abc]*)x', 'abc', '', repr(None)), + ('([xyz]*)x', 'abcx', '0,1', repr(('x', ''))), + ('(a)+b|aac', 'aac', '0,1', repr(('aac', None))), + + # Test symbolic groups. + ('(?P<i d>aaa)a', 'aaaa', '', regex.error, self.BAD_GROUP_NAME), + ('(?P<id>aaa)a', 'aaaa', '0,id', repr(('aaaa', 'aaa'))), + ('(?P<id>aa)(?P=id)', 'aaaa', '0,id', repr(('aaaa', 'aa'))), + ('(?P<id>aa)(?P=xd)', 'aaaa', '', regex.error, self.UNKNOWN_GROUP), + + # Character properties. + (ur"\g", u"g", '0', repr(u'g')), + (ur"\g<1>", u"g", '', regex.error, self.INVALID_GROUP_REF), + (ur"(.)\g<1>", u"gg", '0', repr(u'gg')), + (ur"(.)\g<1>", u"gg", '', repr((u'gg', u'g'))), + (ur"\N", u"N", '0', repr(u'N')), + (ur"\N{LATIN SMALL LETTER A}", u"a", '0', repr(u'a')), + (ur"\p", u"p", '0', repr(u'p')), + (ur"\p{Ll}", u"a", '0', repr(u'a')), + (ur"\P", u"P", '0', repr(u'P')), + (ur"\P{Lu}", u"p", '0', repr(u'p')), + + # All tests from Perl. 
+ ('abc', 'abc', '0', repr('abc')), + ('abc', 'xbc', '', repr(None)), + ('abc', 'axc', '', repr(None)), + ('abc', 'abx', '', repr(None)), + ('abc', 'xabcy', '0', repr('abc')), + ('abc', 'ababc', '0', repr('abc')), + + ('ab*c', 'abc', '0', repr('abc')), + ('ab*bc', 'abc', '0', repr('abc')), + ('ab*bc', 'abbc', '0', repr('abbc')), + ('ab*bc', 'abbbbc', '0', repr('abbbbc')), + ('ab{0,}bc', 'abbbbc', '0', repr('abbbbc')), + ('ab+bc', 'abbc', '0', repr('abbc')), + ('ab+bc', 'abc', '', repr(None)), + ('ab+bc', 'abq', '', repr(None)), + ('ab{1,}bc', 'abq', '', repr(None)), + ('ab+bc', 'abbbbc', '0', repr('abbbbc')), + + ('ab{1,}bc', 'abbbbc', '0', repr('abbbbc')), + ('ab{1,3}bc', 'abbbbc', '0', repr('abbbbc')), + ('ab{3,4}bc', 'abbbbc', '0', repr('abbbbc')), + ('ab{4,5}bc', 'abbbbc', '', repr(None)), + ('ab?bc', 'abbc', '0', repr('abbc')), + ('ab?bc', 'abc', '0', repr('abc')), + ('ab{0,1}bc', 'abc', '0', repr('abc')), + ('ab?bc', 'abbbbc', '', repr(None)), + ('ab?c', 'abc', '0', repr('abc')), + ('ab{0,1}c', 'abc', '0', repr('abc')), + + ('^abc$', 'abc', '0', repr('abc')), + ('^abc$', 'abcc', '', repr(None)), + ('^abc', 'abcc', '0', repr('abc')), + ('^abc$', 'aabc', '', repr(None)), + ('abc$', 'aabc', '0', repr('abc')), + ('^', 'abc', '0', repr('')), + ('$', 'abc', '0', repr('')), + ('a.c', 'abc', '0', repr('abc')), + ('a.c', 'axc', '0', repr('axc')), + ('a.*c', 'axyzc', '0', repr('axyzc')), + + ('a.*c', 'axyzd', '', repr(None)), + ('a[bc]d', 'abc', '', repr(None)), + ('a[bc]d', 'abd', '0', repr('abd')), + ('a[b-d]e', 'abd', '', repr(None)), + ('a[b-d]e', 'ace', '0', repr('ace')), + ('a[b-d]', 'aac', '0', repr('ac')), + ('a[-b]', 'a-', '0', repr('a-')), + ('a[b-]', 'a-', '0', repr('a-')), + ('a[b-a]', '-', '', regex.error, self.BAD_CHAR_RANGE), + ('a[]b', '-', '', regex.error, self.BAD_SET), + + ('a[', '-', '', regex.error, self.BAD_SET), + ('a]', 'a]', '0', repr('a]')), + ('a[]]b', 'a]b', '0', repr('a]b')), + ('a[^bc]d', 'aed', '0', repr('aed')), + ('a[^bc]d', 'abd', '', 
repr(None)), + ('a[^-b]c', 'adc', '0', repr('adc')), + ('a[^-b]c', 'a-c', '', repr(None)), + ('a[^]b]c', 'a]c', '', repr(None)), + ('a[^]b]c', 'adc', '0', repr('adc')), + ('ab|cd', 'abc', '0', repr('ab')), + + ('ab|cd', 'abcd', '0', repr('ab')), + ('()ef', 'def', '0,1', repr(('ef', ''))), + ('*a', '-', '', regex.error, self.NOTHING_TO_REPEAT), + ('(*)b', '-', '', regex.error, self.NOTHING_TO_REPEAT), + ('$b', 'b', '', repr(None)), + ('a\\', '-', '', regex.error, self.BAD_ESCAPE), + ('a\\(b', 'a(b', '', repr(('a(b',))), + ('a\\(*b', 'ab', '0', repr('ab')), + ('a\\(*b', 'a((b', '0', repr('a((b')), + ('a\\\\b', 'a\\b', '0', repr('a\\b')), + + ('abc)', '-', '', regex.error, self.TRAILING_CHARS), + ('(abc', '-', '', regex.error, self.MISSING_RPAREN), + ('((a))', 'abc', '0,1,2', repr(('a', 'a', 'a'))), + ('(a)b(c)', 'abc', '0,1,2', repr(('abc', 'a', 'c'))), + ('a+b+c', 'aabbabc', '0', repr('abc')), + ('a{1,}b{1,}c', 'aabbabc', '0', repr('abc')), + ('a**', '-', '', regex.error, self.MULTIPLE_REPEAT), + ('a.+?c', 'abcabc', '0', repr('abc')), + ('(a+|b)*', 'ab', '0,1', repr(('ab', 'b'))), + ('(a+|b){0,}', 'ab', '0,1', repr(('ab', 'b'))), + + ('(a+|b)+', 'ab', '0,1', repr(('ab', 'b'))), + ('(a+|b){1,}', 'ab', '0,1', repr(('ab', 'b'))), + ('(a+|b)?', 'ab', '0,1', repr(('a', 'a'))), + ('(a+|b){0,1}', 'ab', '0,1', repr(('a', 'a'))), + (')(', '-', '', regex.error, self.TRAILING_CHARS), + ('[^ab]*', 'cde', '0', repr('cde')), + ('abc', '', '', repr(None)), + ('a*', '', '0', repr('')), + ('([abc])*d', 'abbbcd', '0,1', repr(('abbbcd', 'c'))), + ('([abc])*bcd', 'abcd', '0,1', repr(('abcd', 'a'))), + + ('a|b|c|d|e', 'e', '0', repr('e')), + ('(a|b|c|d|e)f', 'ef', '0,1', repr(('ef', 'e'))), + ('abcd*efg', 'abcdefg', '0', repr('abcdefg')), + ('ab*', 'xabyabbbz', '0', repr('ab')), + ('ab*', 'xayabbbz', '0', repr('a')), + ('(ab|cd)e', 'abcde', '0,1', repr(('cde', 'cd'))), + ('[abhgefdc]ij', 'hij', '0', repr('hij')), + ('^(ab|cd)e', 'abcde', '', repr(None)), + ('(abc|)ef', 'abcdef', '0,1', 
repr(('ef', ''))), + ('(a|b)c*d', 'abcd', '0,1', repr(('bcd', 'b'))), + + ('(ab|ab*)bc', 'abc', '0,1', repr(('abc', 'a'))), + ('a([bc]*)c*', 'abc', '0,1', repr(('abc', 'bc'))), + ('a([bc]*)(c*d)', 'abcd', '0,1,2', repr(('abcd', 'bc', 'd'))), + ('a([bc]+)(c*d)', 'abcd', '0,1,2', repr(('abcd', 'bc', 'd'))), + ('a([bc]*)(c+d)', 'abcd', '0,1,2', repr(('abcd', 'b', 'cd'))), + ('a[bcd]*dcdcde', 'adcdcde', '0', repr('adcdcde')), + ('a[bcd]+dcdcde', 'adcdcde', '', repr(None)), + ('(ab|a)b*c', 'abc', '0,1', repr(('abc', 'ab'))), + ('((a)(b)c)(d)', 'abcd', '1,2,3,4', repr(('abc', 'a', 'b', 'd'))), + ('[a-zA-Z_][a-zA-Z0-9_]*', 'alpha', '0', repr('alpha')), + + ('^a(bc+|b[eh])g|.h$', 'abh', '0,1', repr(('bh', None))), + ('(bc+d$|ef*g.|h?i(j|k))', 'effgz', '0,1,2', repr(('effgz', + 'effgz', None))), + ('(bc+d$|ef*g.|h?i(j|k))', 'ij', '0,1,2', repr(('ij', 'ij', + 'j'))), + ('(bc+d$|ef*g.|h?i(j|k))', 'effg', '', repr(None)), + ('(bc+d$|ef*g.|h?i(j|k))', 'bcdd', '', repr(None)), + ('(bc+d$|ef*g.|h?i(j|k))', 'reffgz', '0,1,2', repr(('effgz', + 'effgz', None))), + ('((((((((((a))))))))))', 'a', '10', repr('a')), + ('((((((((((a))))))))))\\10', 'aa', '0', repr('aa')), + + # Python does not have the same rules for \\41 so this is a syntax error + # ('((((((((((a))))))))))\\41', 'aa', '', repr(None)), + # ('((((((((((a))))))))))\\41', 'a!', '0', repr('a!')), + ('((((((((((a))))))))))\\41', '', '', regex.error, + self.INVALID_GROUP_REF), + ('(?i)((((((((((a))))))))))\\41', '', '', regex.error, + self.INVALID_GROUP_REF), + + ('(((((((((a)))))))))', 'a', '0', repr('a')), + ('multiple words of text', 'uh-uh', '', repr(None)), + ('multiple words', 'multiple words, yeah', '0', + repr('multiple words')), + ('(.*)c(.*)', 'abcde', '0,1,2', repr(('abcde', 'ab', 'de'))), + ('\\((.*), (.*)\\)', '(a, b)', '2,1', repr(('b', 'a'))), + ('[k]', 'ab', '', repr(None)), + ('a[-]?c', 'ac', '0', repr('ac')), + ('(abc)\\1', 'abcabc', '1', repr('abc')), + ('([a-c]*)\\1', 'abcabc', '1', repr('abc')), + 
('(?i)abc', 'ABC', '0', repr('ABC')), + + ('(?i)abc', 'XBC', '', repr(None)), + ('(?i)abc', 'AXC', '', repr(None)), + ('(?i)abc', 'ABX', '', repr(None)), + ('(?i)abc', 'XABCY', '0', repr('ABC')), + ('(?i)abc', 'ABABC', '0', repr('ABC')), + ('(?i)ab*c', 'ABC', '0', repr('ABC')), + ('(?i)ab*bc', 'ABC', '0', repr('ABC')), + ('(?i)ab*bc', 'ABBC', '0', repr('ABBC')), + ('(?i)ab*?bc', 'ABBBBC', '0', repr('ABBBBC')), + ('(?i)ab{0,}?bc', 'ABBBBC', '0', repr('ABBBBC')), + + ('(?i)ab+?bc', 'ABBC', '0', repr('ABBC')), + ('(?i)ab+bc', 'ABC', '', repr(None)), + ('(?i)ab+bc', 'ABQ', '', repr(None)), + ('(?i)ab{1,}bc', 'ABQ', '', repr(None)), + ('(?i)ab+bc', 'ABBBBC', '0', repr('ABBBBC')), + ('(?i)ab{1,}?bc', 'ABBBBC', '0', repr('ABBBBC')), + ('(?i)ab{1,3}?bc', 'ABBBBC', '0', repr('ABBBBC')), + ('(?i)ab{3,4}?bc', 'ABBBBC', '0', repr('ABBBBC')), + ('(?i)ab{4,5}?bc', 'ABBBBC', '', repr(None)), + ('(?i)ab??bc', 'ABBC', '0', repr('ABBC')), + + ('(?i)ab??bc', 'ABC', '0', repr('ABC')), + ('(?i)ab{0,1}?bc', 'ABC', '0', repr('ABC')), + ('(?i)ab??bc', 'ABBBBC', '', repr(None)), + ('(?i)ab??c', 'ABC', '0', repr('ABC')), + ('(?i)ab{0,1}?c', 'ABC', '0', repr('ABC')), + ('(?i)^abc$', 'ABC', '0', repr('ABC')), + ('(?i)^abc$', 'ABCC', '', repr(None)), + ('(?i)^abc', 'ABCC', '0', repr('ABC')), + ('(?i)^abc$', 'AABC', '', repr(None)), + ('(?i)abc$', 'AABC', '0', repr('ABC')), + + ('(?i)^', 'ABC', '0', repr('')), + ('(?i)$', 'ABC', '0', repr('')), + ('(?i)a.c', 'ABC', '0', repr('ABC')), + ('(?i)a.c', 'AXC', '0', repr('AXC')), + ('(?i)a.*?c', 'AXYZC', '0', repr('AXYZC')), + ('(?i)a.*c', 'AXYZD', '', repr(None)), + ('(?i)a[bc]d', 'ABC', '', repr(None)), + ('(?i)a[bc]d', 'ABD', '0', repr('ABD')), + ('(?i)a[b-d]e', 'ABD', '', repr(None)), + ('(?i)a[b-d]e', 'ACE', '0', repr('ACE')), + + ('(?i)a[b-d]', 'AAC', '0', repr('AC')), + ('(?i)a[-b]', 'A-', '0', repr('A-')), + ('(?i)a[b-]', 'A-', '0', repr('A-')), + ('(?i)a[b-a]', '-', '', regex.error, self.BAD_CHAR_RANGE), + ('(?i)a[]b', '-', '', regex.error, 
self.BAD_SET), + ('(?i)a[', '-', '', regex.error, self.BAD_SET), + ('(?i)a]', 'A]', '0', repr('A]')), + ('(?i)a[]]b', 'A]B', '0', repr('A]B')), + ('(?i)a[^bc]d', 'AED', '0', repr('AED')), + ('(?i)a[^bc]d', 'ABD', '', repr(None)), + + ('(?i)a[^-b]c', 'ADC', '0', repr('ADC')), + ('(?i)a[^-b]c', 'A-C', '', repr(None)), + ('(?i)a[^]b]c', 'A]C', '', repr(None)), + ('(?i)a[^]b]c', 'ADC', '0', repr('ADC')), + ('(?i)ab|cd', 'ABC', '0', repr('AB')), + ('(?i)ab|cd', 'ABCD', '0', repr('AB')), + ('(?i)()ef', 'DEF', '0,1', repr(('EF', ''))), + ('(?i)*a', '-', '', regex.error, self.NOTHING_TO_REPEAT), + ('(?i)(*)b', '-', '', regex.error, self.NOTHING_TO_REPEAT), + ('(?i)$b', 'B', '', repr(None)), + + ('(?i)a\\', '-', '', regex.error, self.BAD_ESCAPE), + ('(?i)a\\(b', 'A(B', '', repr(('A(B',))), + ('(?i)a\\(*b', 'AB', '0', repr('AB')), + ('(?i)a\\(*b', 'A((B', '0', repr('A((B')), + ('(?i)a\\\\b', 'A\\B', '0', repr('A\\B')), + ('(?i)abc)', '-', '', regex.error, self.TRAILING_CHARS), + ('(?i)(abc', '-', '', regex.error, self.MISSING_RPAREN), + ('(?i)((a))', 'ABC', '0,1,2', repr(('A', 'A', 'A'))), + ('(?i)(a)b(c)', 'ABC', '0,1,2', repr(('ABC', 'A', 'C'))), + ('(?i)a+b+c', 'AABBABC', '0', repr('ABC')), + + ('(?i)a{1,}b{1,}c', 'AABBABC', '0', repr('ABC')), + ('(?i)a**', '-', '', regex.error, self.MULTIPLE_REPEAT), + ('(?i)a.+?c', 'ABCABC', '0', repr('ABC')), + ('(?i)a.*?c', 'ABCABC', '0', repr('ABC')), + ('(?i)a.{0,5}?c', 'ABCABC', '0', repr('ABC')), + ('(?i)(a+|b)*', 'AB', '0,1', repr(('AB', 'B'))), + ('(?i)(a+|b){0,}', 'AB', '0,1', repr(('AB', 'B'))), + ('(?i)(a+|b)+', 'AB', '0,1', repr(('AB', 'B'))), + ('(?i)(a+|b){1,}', 'AB', '0,1', repr(('AB', 'B'))), + ('(?i)(a+|b)?', 'AB', '0,1', repr(('A', 'A'))), + + ('(?i)(a+|b){0,1}', 'AB', '0,1', repr(('A', 'A'))), + ('(?i)(a+|b){0,1}?', 'AB', '0,1', repr(('', None))), + ('(?i))(', '-', '', regex.error, self.TRAILING_CHARS), + ('(?i)[^ab]*', 'CDE', '0', repr('CDE')), + ('(?i)abc', '', '', repr(None)), + ('(?i)a*', '', '0', repr('')), + 
('(?i)([abc])*d', 'ABBBCD', '0,1', repr(('ABBBCD', 'C'))), + ('(?i)([abc])*bcd', 'ABCD', '0,1', repr(('ABCD', 'A'))), + ('(?i)a|b|c|d|e', 'E', '0', repr('E')), + ('(?i)(a|b|c|d|e)f', 'EF', '0,1', repr(('EF', 'E'))), + + ('(?i)abcd*efg', 'ABCDEFG', '0', repr('ABCDEFG')), + ('(?i)ab*', 'XABYABBBZ', '0', repr('AB')), + ('(?i)ab*', 'XAYABBBZ', '0', repr('A')), + ('(?i)(ab|cd)e', 'ABCDE', '0,1', repr(('CDE', 'CD'))), + ('(?i)[abhgefdc]ij', 'HIJ', '0', repr('HIJ')), + ('(?i)^(ab|cd)e', 'ABCDE', '', repr(None)), + ('(?i)(abc|)ef', 'ABCDEF', '0,1', repr(('EF', ''))), + ('(?i)(a|b)c*d', 'ABCD', '0,1', repr(('BCD', 'B'))), + ('(?i)(ab|ab*)bc', 'ABC', '0,1', repr(('ABC', 'A'))), + ('(?i)a([bc]*)c*', 'ABC', '0,1', repr(('ABC', 'BC'))), + + ('(?i)a([bc]*)(c*d)', 'ABCD', '0,1,2', repr(('ABCD', 'BC', 'D'))), + ('(?i)a([bc]+)(c*d)', 'ABCD', '0,1,2', repr(('ABCD', 'BC', 'D'))), + ('(?i)a([bc]*)(c+d)', 'ABCD', '0,1,2', repr(('ABCD', 'B', 'CD'))), + ('(?i)a[bcd]*dcdcde', 'ADCDCDE', '0', repr('ADCDCDE')), + ('(?i)a[bcd]+dcdcde', 'ADCDCDE', '', repr(None)), + ('(?i)(ab|a)b*c', 'ABC', '0,1', repr(('ABC', 'AB'))), + ('(?i)((a)(b)c)(d)', 'ABCD', '1,2,3,4', repr(('ABC', 'A', 'B', + 'D'))), + ('(?i)[a-zA-Z_][a-zA-Z0-9_]*', 'ALPHA', '0', repr('ALPHA')), + ('(?i)^a(bc+|b[eh])g|.h$', 'ABH', '0,1', repr(('BH', None))), + ('(?i)(bc+d$|ef*g.|h?i(j|k))', 'EFFGZ', '0,1,2', repr(('EFFGZ', + 'EFFGZ', None))), + + ('(?i)(bc+d$|ef*g.|h?i(j|k))', 'IJ', '0,1,2', repr(('IJ', 'IJ', + 'J'))), + ('(?i)(bc+d$|ef*g.|h?i(j|k))', 'EFFG', '', repr(None)), + ('(?i)(bc+d$|ef*g.|h?i(j|k))', 'BCDD', '', repr(None)), + ('(?i)(bc+d$|ef*g.|h?i(j|k))', 'REFFGZ', '0,1,2', repr(('EFFGZ', + 'EFFGZ', None))), + ('(?i)((((((((((a))))))))))', 'A', '10', repr('A')), + ('(?i)((((((((((a))))))))))\\10', 'AA', '0', repr('AA')), + #('(?i)((((((((((a))))))))))\\41', 'AA', '', repr(None)), + #('(?i)((((((((((a))))))))))\\41', 'A!', '0', repr('A!')), + ('(?i)(((((((((a)))))))))', 'A', '0', repr('A')), + 
('(?i)(?:(?:(?:(?:(?:(?:(?:(?:(?:(a))))))))))', 'A', '1', + repr('A')), + ('(?i)(?:(?:(?:(?:(?:(?:(?:(?:(?:(a|b|c))))))))))', 'C', '1', + repr('C')), + ('(?i)multiple words of text', 'UH-UH', '', repr(None)), + + ('(?i)multiple words', 'MULTIPLE WORDS, YEAH', '0', + repr('MULTIPLE WORDS')), + ('(?i)(.*)c(.*)', 'ABCDE', '0,1,2', repr(('ABCDE', 'AB', 'DE'))), + ('(?i)\\((.*), (.*)\\)', '(A, B)', '2,1', repr(('B', 'A'))), + ('(?i)[k]', 'AB', '', repr(None)), + # ('(?i)abcd', 'ABCD', SUCCEED, 'found+"-"+\\found+"-"+\\\\found', repr(ABCD-$&-\\ABCD)), + # ('(?i)a(bc)d', 'ABCD', SUCCEED, 'g1+"-"+\\g1+"-"+\\\\g1', repr(BC-$1-\\BC)), + ('(?i)a[-]?c', 'AC', '0', repr('AC')), + ('(?i)(abc)\\1', 'ABCABC', '1', repr('ABC')), + ('(?i)([a-c]*)\\1', 'ABCABC', '1', repr('ABC')), + ('a(?!b).', 'abad', '0', repr('ad')), + ('a(?=d).', 'abad', '0', repr('ad')), + ('a(?=c|d).', 'abad', '0', repr('ad')), + + ('a(?:b|c|d)(.)', 'ace', '1', repr('e')), + ('a(?:b|c|d)*(.)', 'ace', '1', repr('e')), + ('a(?:b|c|d)+?(.)', 'ace', '1', repr('e')), + ('a(?:b|(c|e){1,2}?|d)+?(.)', 'ace', '1,2', repr(('c', 'e'))), + + # Lookbehind: split by : but not if it is escaped by -. + ('(?<!-):(.*?)(?<!-):', 'a:bc-:de:f', '1', repr('bc-:de')), + # Escaping with \ as we know it. + ('(?<!\\\):(.*?)(?<!\\\):', 'a:bc\\:de:f', '1', repr('bc\\:de')), + # Terminating with ' and escaping with ? as in edifact. + ("(?<!\\?)'(.*?)(?<!\\?)'", "a'bc?'de'f", '1', repr("bc?'de")), + + # Comments using the (?#...) syntax. + + ('w(?# comment', 'w', '', regex.error, self.MISSING_RPAREN), + ('w(?# comment 1)xy(?# comment 2)z', 'wxyz', '0', repr('wxyz')), + + # Check odd placement of embedded pattern modifiers. + + # Not an error under PCRE/PRE: + # When the new behaviour is turned on positional inline flags affect + # only what follows. 
+ ('w(?i)', 'W', '0', repr('W')), + ('w(?iV1)', 'W', '0', repr(None)), + ('w(?i)', 'w', '0', repr('w')), + ('w(?iV1)', 'w', '0', repr('w')), + ('(?i)w', 'W', '0', repr('W')), + ('(?iV1)w', 'W', '0', repr('W')), + + # Comments using the x embedded pattern modifier. + ("""(?x)w# comment 1 +x y +# comment 2 +z""", 'wxyz', '0', repr('wxyz')), + + # Using the m embedded pattern modifier. + ('^abc', """jkl +abc +xyz""", '', repr(None)), + ('(?m)^abc', """jkl +abc +xyz""", '0', repr('abc')), + + ('(?m)abc$', """jkl +xyzabc +123""", '0', repr('abc')), + + # Using the s embedded pattern modifier. + ('a.b', 'a\nb', '', repr(None)), + ('(?s)a.b', 'a\nb', '0', repr('a\nb')), + + # Test \w, etc. both inside and outside character classes. + ('\\w+', '--ab_cd0123--', '0', repr('ab_cd0123')), + ('[\\w]+', '--ab_cd0123--', '0', repr('ab_cd0123')), + ('\\D+', '1234abc5678', '0', repr('abc')), + ('[\\D]+', '1234abc5678', '0', repr('abc')), + ('[\\da-fA-F]+', '123abc', '0', repr('123abc')), + # Not an error under PCRE/PRE: + # ('[\\d-x]', '-', '', regex.error, self.BAD_CHAR_RANGE), + (r'([\s]*)([\S]*)([\s]*)', ' testing!1972', '3,2,1', repr(('', + 'testing!1972', ' '))), + (r'(\s*)(\S*)(\s*)', ' testing!1972', '3,2,1', repr(('', + 'testing!1972', ' '))), + + # + # Post-1.5.2 additions. + + # xmllib problem. + (r'(([a-z]+):)?([a-z]+)$', 'smil', '1,2,3', repr((None, None, + 'smil'))), + # Bug 110866: reference to undefined group. + (r'((.)\1+)', '', '', regex.error, self.OPEN_GROUP), + # Bug 111869: search (PRE/PCRE fails on this one, SRE doesn't). + (r'.*d', 'abc\nabd', '0', repr('abd')), + # Bug 112468: various expected syntax errors. + (r'(', '', '', regex.error, self.MISSING_RPAREN), + (r'[\41]', '!', '0', repr('!')), + # Bug 114033: nothing to repeat. + (r'(x?)?', 'x', '0', repr('x')), + # Bug 115040: rescan if flags are modified inside pattern. + # If the new behaviour is turned on then positional inline flags + # affect only what follows. 
+ (r' (?x)foo ', 'foo', '0', repr('foo')), + (r' (?V1x)foo ', 'foo', '0', repr(None)), + (r'(?x) foo ', 'foo', '0', repr('foo')), + (r'(?V1x) foo ', 'foo', '0', repr('foo')), + (r'(?x)foo ', 'foo', '0', repr('foo')), + (r'(?V1x)foo ', 'foo', '0', repr('foo')), + # Bug 115618: negative lookahead. + (r'(?<!abc)(d.f)', 'abcdefdof', '0', repr('dof')), + # Bug 116251: character class bug. + (r'[\w-]+', 'laser_beam', '0', repr('laser_beam')), + # Bug 123769+127259: non-greedy backtracking bug. + (r'.*?\S *:', 'xx:', '0', repr('xx:')), + (r'a[ ]*?\ (\d+).*', 'a 10', '0', repr('a 10')), + (r'a[ ]*?\ (\d+).*', 'a 10', '0', repr('a 10')), + # Bug 127259: \Z shouldn't depend on multiline mode. + (r'(?ms).*?x\s*\Z(.*)','xx\nx\n', '1', repr('')), + # Bug 128899: uppercase literals under the ignorecase flag. + (r'(?i)M+', 'MMM', '0', repr('MMM')), + (r'(?i)m+', 'MMM', '0', repr('MMM')), + (r'(?i)[M]+', 'MMM', '0', repr('MMM')), + (r'(?i)[m]+', 'MMM', '0', repr('MMM')), + # Bug 130748: ^* should be an error (nothing to repeat). + # In 'regex' we won't bother to complain about this. + # (r'^*', '', '', regex.error, self.NOTHING_TO_REPEAT), + # Bug 133283: minimizing repeat problem. + (r'"(?:\\"|[^"])*?"', r'"\""', '0', repr(r'"\""')), + # Bug 477728: minimizing repeat problem. + (r'^.*?$', 'one\ntwo\nthree\n', '', repr(None)), + # Bug 483789: minimizing repeat problem. + (r'a[^>]*?b', 'a>b', '', repr(None)), + # Bug 490573: minimizing repeat problem. + (r'^a*?$', 'foo', '', repr(None)), + # Bug 470582: nested groups problem. + (r'^((a)c)?(ab)$', 'ab', '1,2,3', repr((None, None, 'ab'))), + # Another minimizing repeat problem (capturing groups in assertions). + ('^([ab]*?)(?=(b)?)c', 'abc', '1,2', repr(('ab', None))), + ('^([ab]*?)(?!(b))c', 'abc', '1,2', repr(('ab', None))), + ('^([ab]*?)(?<!(a))c', 'abc', '1,2', repr(('ab', None))), + # Bug 410271: \b broken under locales. 
+ (r'\b.\b', 'a', '0', repr('a')), + (ur'(?u)\b.\b', u'\N{LATIN CAPITAL LETTER A WITH DIAERESIS}', '0', + repr(u'\xc4')), + (ur'(?u)\w', u'\N{LATIN CAPITAL LETTER A WITH DIAERESIS}', '0', + repr(u'\xc4')), + ] + + for t in tests: + excval = None + try: + if len(t) == 4: + pattern, string, groups, expected = t + else: + pattern, string, groups, expected, excval = t + except ValueError: + fields = ", ".join([repr(f) for f in t[ : 3]] + ["..."]) + self.fail("Incorrect number of test fields: (%s)" % fields) + else: + group_list = [] + if groups: + for group in groups.split(","): + try: + group_list.append(int(group)) + except ValueError: + group_list.append(group) + + if excval is not None: + self.assertRaisesRegex(expected, excval, regex.search, + pattern, string) + else: + m = regex.search(pattern, string) + if m: + if group_list: + actual = repr(m.group(*group_list)) + else: + actual = repr(m[:]) + else: + actual = repr(m) + + self.assertEqual(actual, expected) + + def test_replacement(self): + self.assertEqual(regex.sub("test\?", "result\?\.\a\q\m\n", "test?"), + "result\?\.\a\q\m\n") + self.assertEqual(regex.sub(r"test\?", "result\?\.\a\q\m\n", "test?"), + "result\?\.\a\q\m\n") + + self.assertEqual(regex.sub('(.)', r"\1\1", 'x'), 'xx') + self.assertEqual(regex.sub('(.)', regex.escape(r"\1\1"), 'x'), r"\1\1") + self.assertEqual(regex.sub('(.)', r"\\1\\1", 'x'), r"\1\1") + self.assertEqual(regex.sub('(.)', lambda m: r"\1\1", 'x'), r"\1\1") + + def test_common_prefix(self): + # Very long common prefix + all = string.ascii_lowercase + string.digits + string.ascii_uppercase + side = all * 4 + regexp = '(' + side + '|' + side + ')' + self.assertEqual(repr(type(regex.compile(regexp))), self.PATTERN_CLASS) + + def test_captures(self): + self.assertEqual(regex.search(r"(\w)+", "abc").captures(1), ['a', 'b', + 'c']) + self.assertEqual(regex.search(r"(\w{3})+", "abcdef").captures(0, 1), + (['abcdef'], ['abc', 'def'])) + 
self.assertEqual(regex.search(r"^(\d{1,3})(?:\.(\d{1,3})){3}$", + "192.168.0.1").captures(1, 2), (['192', ], ['168', '0', '1'])) + self.assertEqual(regex.match(r"^([0-9A-F]{2}){4} ([a-z]\d){5}$", + "3FB52A0C a2c4g3k9d3").captures(1, 2), (['3F', 'B5', '2A', '0C'], + ['a2', 'c4', 'g3', 'k9', 'd3'])) + self.assertEqual(regex.match("([a-z]W)([a-z]X)+([a-z]Y)", + "aWbXcXdXeXfY").captures(1, 2, 3), (['aW'], ['bX', 'cX', 'dX', 'eX'], + ['fY'])) + + self.assertEqual(regex.search(r".*?(?=(.)+)b", "ab").captures(1), + ['b']) + self.assertEqual(regex.search(r".*?(?>(.){0,2})d", "abcd").captures(1), + ['b', 'c']) + self.assertEqual(regex.search(r"(.)+", "a").captures(1), ['a']) + + def test_guards(self): + m = regex.search(r"(X.*?Y\s*){3}(X\s*)+AB:", + "XY\nX Y\nX Y\nXY\nXX AB:") + self.assertEqual(m.span(0, 1, 2), ((3, 21), (12, 15), (16, 18))) + + m = regex.search(r"(X.*?Y\s*){3,}(X\s*)+AB:", + "XY\nX Y\nX Y\nXY\nXX AB:") + self.assertEqual(m.span(0, 1, 2), ((0, 21), (12, 15), (16, 18))) + + m = regex.search(r'\d{4}(\s*\w)?\W*((?!\d)\w){2}', "9999XX") + self.assertEqual(m.span(0, 1, 2), ((0, 6), (-1, -1), (5, 6))) + + m = regex.search(r'A\s*?.*?(\n+.*?\s*?){0,2}\(X', 'A\n1\nS\n1 (X') + self.assertEqual(m.span(0, 1), ((0, 10), (5, 8))) + + m = regex.search('Derde\s*:', 'aaaaaa:\nDerde:') + self.assertEqual(m.span(), (8, 14)) + m = regex.search('Derde\s*:', 'aaaaa:\nDerde:') + self.assertEqual(m.span(), (7, 13)) + + def test_turkic(self): + # Turkish has dotted and dotless I/i. 
+ pairs = u"I=i;I=\u0131;i=\u0130" + + all_chars = set() + matching = set() + for pair in pairs.split(";"): + ch1, ch2 = pair.split("=") + all_chars.update((ch1, ch2)) + matching.add((ch1, ch1)) + matching.add((ch1, ch2)) + matching.add((ch2, ch1)) + matching.add((ch2, ch2)) + + for ch1 in all_chars: + for ch2 in all_chars: + m = regex.match(ur"(?iu)\A" + ch1 + ur"\Z", ch2) + if m: + if (ch1, ch2) not in matching: + self.fail("%s matching %s" % (repr(ch1), repr(ch2))) + else: + if (ch1, ch2) in matching: + self.fail("%s not matching %s" % (repr(ch1), + repr(ch2))) + + def test_named_lists(self): + options = [u"one", u"two", u"three"] + self.assertEqual(regex.match(ur"333\L<bar>444", u"333one444", + bar=options).group(), u"333one444") + self.assertEqual(regex.match(ur"(?i)333\L<bar>444", u"333TWO444", + bar=options).group(), u"333TWO444") + self.assertEqual(regex.match(ur"333\L<bar>444", u"333four444", + bar=options), None) + + options = ["one", "two", "three"] + self.assertEqual(regex.match(r"333\L<bar>444", "333one444", + bar=options).group(), "333one444") + self.assertEqual(regex.match(r"(?i)333\L<bar>444", "333TWO444", + bar=options).group(), "333TWO444") + self.assertEqual(regex.match(r"333\L<bar>444", "333four444", + bar=options), None) + + self.assertEqual(repr(type(regex.compile(r"3\L<bar>4\L<bar>+5", + bar=["one", "two", "three"]))), self.PATTERN_CLASS) + + self.assertEqual(regex.findall(r"^\L<options>", "solid QWERT", + options=set(['good', 'brilliant', '+s\\ol[i}d'])), []) + self.assertEqual(regex.findall(r"^\L<options>", "+solid QWERT", + options=set(['good', 'brilliant', '+solid'])), ['+solid']) + + options = [u"STRASSE"] + self.assertEqual(regex.match(ur"(?fiu)\L<words>", + u"stra\N{LATIN SMALL LETTER SHARP S}e", words=options).span(), (0, + 6)) + + options = [u"STRASSE", u"stress"] + self.assertEqual(regex.match(ur"(?fiu)\L<words>", + u"stra\N{LATIN SMALL LETTER SHARP S}e", words=options).span(), (0, + 6)) + + options = [u"stra\N{LATIN SMALL LETTER 
SHARP S}e"] + self.assertEqual(regex.match(ur"(?fiu)\L<words>", u"STRASSE", + words=options).span(), (0, 7)) + + options = ["kit"] + self.assertEqual(regex.search(ur"(?iu)\L<words>", u"SKITS", + words=options).span(), (1, 4)) + self.assertEqual(regex.search(ur"(?iu)\L<words>", + u"SK\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}TS", + words=options).span(), (1, 4)) + + self.assertEqual(regex.search(ur"(?fiu)\b(\w+) +\1\b", + u" stra\N{LATIN SMALL LETTER SHARP S}e STRASSE ").span(), (1, 15)) + self.assertEqual(regex.search(ur"(?fiu)\b(\w+) +\1\b", + u" STRASSE stra\N{LATIN SMALL LETTER SHARP S}e ").span(), (1, 15)) + + self.assertEqual(regex.search(r"^\L<options>$", "", options=[]).span(), + (0, 0)) + + def test_fuzzy(self): + # Some tests borrowed from TRE library tests. + self.assertEqual(repr(type(regex.compile('(fou){s,e<=1}'))), + self.PATTERN_CLASS) + self.assertEqual(repr(type(regex.compile('(fuu){s}'))), + self.PATTERN_CLASS) + self.assertEqual(repr(type(regex.compile('(fuu){s,e}'))), + self.PATTERN_CLASS) + self.assertEqual(repr(type(regex.compile('(anaconda){1i+1d<1,s<=1}'))), + self.PATTERN_CLASS) + self.assertEqual(repr(type(regex.compile('(anaconda){1i+1d<1,s<=1,e<=10}'))), + self.PATTERN_CLASS) + self.assertEqual(repr(type(regex.compile('(anaconda){s<=1,e<=1,1i+1d<1}'))), + self.PATTERN_CLASS) + + text = 'molasses anaconda foo bar baz smith anderson ' + self.assertEqual(regex.search('(znacnda){s<=1,e<=3,1i+1d<1}', text), + None) + self.assertEqual(regex.search('(znacnda){s<=1,e<=3,1i+1d<2}', + text).span(0, 1), ((9, 17), (9, 17))) + self.assertEqual(regex.search('(ananda){1i+1d<2}', text), None) + self.assertEqual(regex.search(r"(?:\bznacnda){e<=2}", text)[0], + "anaconda") + self.assertEqual(regex.search(r"(?:\bnacnda){e<=2}", text)[0], + "anaconda") + + text = 'anaconda foo bar baz smith anderson' + self.assertEqual(regex.search('(fuu){i<=3,d<=3,e<=5}', text).span(0, + 1), ((0, 0), (0, 0))) + self.assertEqual(regex.search('(?b)(fuu){i<=3,d<=3,e<=5}', + 
text).span(0, 1), ((9, 10), (9, 10))) + self.assertEqual(regex.search('(fuu){i<=2,d<=2,e<=5}', text).span(0, + 1), ((7, 10), (7, 10))) + self.assertEqual(regex.search('(?e)(fuu){i<=2,d<=2,e<=5}', + text).span(0, 1), ((9, 10), (9, 10))) + self.assertEqual(regex.search('(fuu){i<=3,d<=3,e}', text).span(0, 1), + ((0, 0), (0, 0))) + self.assertEqual(regex.search('(?b)(fuu){i<=3,d<=3,e}', text).span(0, + 1), ((9, 10), (9, 10))) + + self.assertEqual(repr(type(regex.compile('(approximate){s<=3,1i+1d<3}'))), + self.PATTERN_CLASS) + + # No cost limit. + self.assertEqual(regex.search('(foobar){e}', + 'xirefoabralfobarxie').span(0, 1), ((0, 6), (0, 6))) + self.assertEqual(regex.search('(?e)(foobar){e}', + 'xirefoabralfobarxie').span(0, 1), ((0, 3), (0, 3))) + self.assertEqual(regex.search('(?b)(foobar){e}', + 'xirefoabralfobarxie').span(0, 1), ((11, 16), (11, 16))) + + # At most two errors. + self.assertEqual(regex.search('(foobar){e<=2}', + 'xirefoabrzlfd').span(0, 1), ((4, 9), (4, 9))) + self.assertEqual(regex.search('(foobar){e<=2}', 'xirefoabzlfd'), None) + + # At most two inserts or substitutions and max two errors total. + self.assertEqual(regex.search('(foobar){i<=2,s<=2,e<=2}', + 'oobargoobaploowap').span(0, 1), ((5, 11), (5, 11))) + + # Find best whole word match for "foobar". + self.assertEqual(regex.search('\\b(foobar){e}\\b', 'zfoobarz').span(0, + 1), ((0, 8), (0, 8))) + self.assertEqual(regex.search('\\b(foobar){e}\\b', + 'boing zfoobarz goobar woop').span(0, 1), ((0, 6), (0, 6))) + self.assertEqual(regex.search('(?b)\\b(foobar){e}\\b', + 'boing zfoobarz goobar woop').span(0, 1), ((15, 21), (15, 21))) + + # Match whole string, allow only 1 error. 
+ self.assertEqual(regex.search('^(foobar){e<=1}$', 'foobar').span(0, 1), + ((0, 6), (0, 6))) + self.assertEqual(regex.search('^(foobar){e<=1}$', 'xfoobar').span(0, + 1), ((0, 7), (0, 7))) + self.assertEqual(regex.search('^(foobar){e<=1}$', 'foobarx').span(0, + 1), ((0, 7), (0, 7))) + self.assertEqual(regex.search('^(foobar){e<=1}$', 'fooxbar').span(0, + 1), ((0, 7), (0, 7))) + self.assertEqual(regex.search('^(foobar){e<=1}$', 'foxbar').span(0, 1), + ((0, 6), (0, 6))) + self.assertEqual(regex.search('^(foobar){e<=1}$', 'xoobar').span(0, 1), + ((0, 6), (0, 6))) + self.assertEqual(regex.search('^(foobar){e<=1}$', 'foobax').span(0, 1), + ((0, 6), (0, 6))) + self.assertEqual(regex.search('^(foobar){e<=1}$', 'oobar').span(0, 1), + ((0, 5), (0, 5))) + self.assertEqual(regex.search('^(foobar){e<=1}$', 'fobar').span(0, 1), + ((0, 5), (0, 5))) + self.assertEqual(regex.search('^(foobar){e<=1}$', 'fooba').span(0, 1), + ((0, 5), (0, 5))) + self.assertEqual(regex.search('^(foobar){e<=1}$', 'xfoobarx'), None) + self.assertEqual(regex.search('^(foobar){e<=1}$', 'foobarxx'), None) + self.assertEqual(regex.search('^(foobar){e<=1}$', 'xxfoobar'), None) + self.assertEqual(regex.search('^(foobar){e<=1}$', 'xfoxbar'), None) + self.assertEqual(regex.search('^(foobar){e<=1}$', 'foxbarx'), None) + + # At most one insert, two deletes, and three substitutions. + # Additionally, deletes cost two and substitutes one, and total + # cost must be less than 4. + self.assertEqual(regex.search('(foobar){i<=1,d<=2,s<=3,2d+1s<4}', + '3oifaowefbaoraofuiebofasebfaobfaorfeoaro').span(0, 1), ((6, 13), (6, + 13))) + self.assertEqual(regex.search('(?b)(foobar){i<=1,d<=2,s<=3,2d+1s<4}', + '3oifaowefbaoraofuiebofasebfaobfaorfeoaro').span(0, 1), ((34, 39), + (34, 39))) + + # Partially fuzzy matches. 
+ self.assertEqual(regex.search('foo(bar){e<=1}zap', 'foobarzap').span(0, + 1), ((0, 9), (3, 6))) + self.assertEqual(regex.search('foo(bar){e<=1}zap', 'fobarzap'), None) + self.assertEqual(regex.search('foo(bar){e<=1}zap', 'foobrzap').span(0, + 1), ((0, 8), (3, 5))) + + text = ('www.cnn.com 64.236.16.20\nwww.slashdot.org 66.35.250.150\n' + 'For useful information, use www.slashdot.org\nthis is demo data!\n') + self.assertEqual(regex.search(r'(?s)^.*(dot.org){e}.*$', text).span(0, + 1), ((0, 120), (120, 120))) + self.assertEqual(regex.search(r'(?es)^.*(dot.org){e}.*$', text).span(0, + 1), ((0, 120), (93, 100))) + self.assertEqual(regex.search(r'^.*(dot.org){e}.*$', text).span(0, 1), + ((0, 119), (24, 101))) + + # Behaviour is unexpected, but arguably not wrong. It first finds the + # best match, then the best in what follows, etc. + self.assertEqual(regex.findall(r"\b\L<words>{e<=1}\b", + " book cot dog desk ", words="cat dog".split()), ["cot", "dog"]) + self.assertEqual(regex.findall(r"\b\L<words>{e<=1}\b", + " book dog cot desk ", words="cat dog".split()), [" dog", "cot"]) + self.assertEqual(regex.findall(r"(?e)\b\L<words>{e<=1}\b", + " book dog cot desk ", words="cat dog".split()), ["dog", "cot"]) + self.assertEqual(regex.findall(r"(?r)\b\L<words>{e<=1}\b", + " book cot dog desk ", words="cat dog".split()), ["dog ", "cot"]) + self.assertEqual(regex.findall(r"(?er)\b\L<words>{e<=1}\b", + " book cot dog desk ", words="cat dog".split()), ["dog", "cot"]) + self.assertEqual(regex.findall(r"(?r)\b\L<words>{e<=1}\b", + " book dog cot desk ", words="cat dog".split()), ["cot", "dog"]) + self.assertEqual(regex.findall(ur"\b\L<words>{e<=1}\b", + u" book cot dog desk ", words=u"cat dog".split()), [u"cot", u"dog"]) + self.assertEqual(regex.findall(ur"\b\L<words>{e<=1}\b", + u" book dog cot desk ", words=u"cat dog".split()), [u" dog", u"cot"]) + self.assertEqual(regex.findall(ur"(?e)\b\L<words>{e<=1}\b", + u" book dog cot desk ", words=u"cat dog".split()), [u"dog", u"cot"]) + 
self.assertEqual(regex.findall(ur"(?r)\b\L<words>{e<=1}\b", + u" book cot dog desk ", words=u"cat dog".split()), [u"dog ", u"cot"]) + self.assertEqual(regex.findall(ur"(?er)\b\L<words>{e<=1}\b", + u" book cot dog desk ", words=u"cat dog".split()), [u"dog", u"cot"]) + self.assertEqual(regex.findall(ur"(?r)\b\L<words>{e<=1}\b", + u" book dog cot desk ", words=u"cat dog".split()), [u"cot", u"dog"]) + + self.assertEqual(regex.search(r"(\w+) (\1{e<=1})", "foo fou").groups(), + ("foo", "fou")) + self.assertEqual(regex.search(r"(?r)(\2{e<=1}) (\w+)", + "foo fou").groups(), ("foo", "fou")) + self.assertEqual(regex.search(ur"(\w+) (\1{e<=1})", + u"foo fou").groups(), (u"foo", u"fou")) + + self.assertEqual(regex.findall(r"(?:(?:QR)+){e}","abcde"), ["abcde", + ""]) + self.assertEqual(regex.findall(r"(?:Q+){e}","abc"), ["abc", ""]) + + # Hg issue 41. + self.assertEqual(regex.match(r"(?:service detection){0<e<5}", + "servic detection").span(), (0, 16)) + self.assertEqual(regex.match(r"(?:service detection){0<e<5}", + "service detect").span(), (0, 14)) + self.assertEqual(regex.match(r"(?:service detection){0<e<5}", + "service detecti").span(), (0, 15)) + self.assertEqual(regex.match(r"(?:service detection){0<e<5}", + "service detection"), None) + self.assertEqual(regex.match(r"(?:service detection){0<e<5}", + "in service detection").span(), (0, 20)) + + # Hg issue 109. 
+ self.assertEqual(regex.fullmatch(r"(?:cats|cat){e<=1}", + "cat").fuzzy_counts, (0, 0, 1)) + self.assertEqual(regex.fullmatch(r"(?e)(?:cats|cat){e<=1}", + "cat").fuzzy_counts, (0, 0, 0)) + + self.assertEqual(regex.fullmatch(r"(?:cat|cats){e<=1}", + "cats").fuzzy_counts, (0, 1, 0)) + self.assertEqual(regex.fullmatch(r"(?e)(?:cat|cats){e<=1}", + "cats").fuzzy_counts, (0, 0, 0)) + + self.assertEqual(regex.fullmatch(r"(?:cat){e<=1} (?:cat){e<=1}", + "cat cot").fuzzy_counts, (1, 0, 0)) + + def test_recursive(self): + self.assertEqual(regex.search(r"(\w)(?:(?R)|(\w?))\1", "xx")[ : ], + ("xx", "x", "")) + self.assertEqual(regex.search(r"(\w)(?:(?R)|(\w?))\1", "aba")[ : ], + ("aba", "a", "b")) + self.assertEqual(regex.search(r"(\w)(?:(?R)|(\w?))\1", "abba")[ : ], + ("abba", "a", None)) + self.assertEqual(regex.search(r"(\w)(?:(?R)|(\w?))\1", "kayak")[ : ], + ("kayak", "k", None)) + self.assertEqual(regex.search(r"(\w)(?:(?R)|(\w?))\1", "paper")[ : ], + ("pap", "p", "a")) + self.assertEqual(regex.search(r"(\w)(?:(?R)|(\w?))\1", "dontmatchme"), + None) + + self.assertEqual(regex.search(r"(?r)\2(?:(\w?)|(?R))(\w)", "xx")[ : ], + ("xx", "", "x")) + self.assertEqual(regex.search(r"(?r)\2(?:(\w?)|(?R))(\w)", "aba")[ : ], + ("aba", "b", "a")) + self.assertEqual(regex.search(r"(?r)\2(?:(\w?)|(?R))(\w)", "abba")[ : + ], ("abba", None, "a")) + self.assertEqual(regex.search(r"(?r)\2(?:(\w?)|(?R))(\w)", "kayak")[ : + ], ("kayak", None, "k")) + self.assertEqual(regex.search(r"(?r)\2(?:(\w?)|(?R))(\w)", "paper")[ : + ], ("pap", "a", "p")) + self.assertEqual(regex.search(r"(?r)\2(?:(\w?)|(?R))(\w)", + "dontmatchme"), None) + + self.assertEqual(regex.search(r"\(((?>[^()]+)|(?R))*\)", "(ab(cd)ef)")[ + : ], ("(ab(cd)ef)", "ef")) + self.assertEqual(regex.search(r"\(((?>[^()]+)|(?R))*\)", + "(ab(cd)ef)").captures(1), ["ab", "cd", "(cd)", "ef"]) + + self.assertEqual(regex.search(r"(?r)\(((?R)|(?>[^()]+))*\)", + "(ab(cd)ef)")[ : ], ("(ab(cd)ef)", "ab")) + 
self.assertEqual(regex.search(r"(?r)\(((?R)|(?>[^()]+))*\)", + "(ab(cd)ef)").captures(1), ["ef", "cd", "(cd)", "ab"]) + + self.assertEqual(regex.search(r"\(([^()]+|(?R))*\)", + "some text (a(b(c)d)e) more text")[ : ], ("(a(b(c)d)e)", "e")) + + self.assertEqual(regex.search(r"(?r)\(((?R)|[^()]+)*\)", + "some text (a(b(c)d)e) more text")[ : ], ("(a(b(c)d)e)", "a")) + + self.assertEqual(regex.search(r"(foo(\(((?:(?>[^()]+)|(?2))*)\)))", + "foo(bar(baz)+baz(bop))")[ : ], ("foo(bar(baz)+baz(bop))", + "foo(bar(baz)+baz(bop))", "(bar(baz)+baz(bop))", + "bar(baz)+baz(bop)")) + + self.assertEqual(regex.search(r"(?r)(foo(\(((?:(?2)|(?>[^()]+))*)\)))", + "foo(bar(baz)+baz(bop))")[ : ], ("foo(bar(baz)+baz(bop))", + "foo(bar(baz)+baz(bop))", "(bar(baz)+baz(bop))", + "bar(baz)+baz(bop)")) + + rgx = regex.compile(r"""^\s*(<\s*([a-zA-Z:]+)(?:\s*[a-zA-Z:]*\s*=\s*(?:'[^']*'|"[^"]*"))*\s*(/\s*)?>(?:[^<>]*|(?1))*(?(3)|<\s*/\s*\2\s*>))\s*$""") + self.assertEqual(bool(rgx.search('<foo><bar></bar></foo>')), True) + self.assertEqual(bool(rgx.search('<foo><bar></foo></bar>')), False) + self.assertEqual(bool(rgx.search('<foo><bar/></foo>')), True) + self.assertEqual(bool(rgx.search('<foo><bar></foo>')), False) + self.assertEqual(bool(rgx.search('<foo bar=baz/>')), False) + + self.assertEqual(bool(rgx.search('<foo bar="baz">')), False) + self.assertEqual(bool(rgx.search('<foo bar="baz"/>')), True) + self.assertEqual(bool(rgx.search('< fooo / >')), True) + # The next regex should and does match. Perl 5.14 agrees. + #self.assertEqual(bool(rgx.search('<foo/>foo')), False) + self.assertEqual(bool(rgx.search('foo<foo/>')), False) + + self.assertEqual(bool(rgx.search('<foo>foo</foo>')), True) + self.assertEqual(bool(rgx.search('<foo><bar/>foo</foo>')), True) + self.assertEqual(bool(rgx.search('<a><b><c></c></b></a>')), True) + + def test_copy(self): + # PatternObjects are immutable, therefore there's no need to clone them. 
+ r = regex.compile("a") + self.assert_(copy.copy(r) is r) + self.assert_(copy.deepcopy(r) is r) + + # MatchObjects are normally mutable because the target string can be + # detached. However, after the target string has been detached, a + # MatchObject becomes immutable, so there's no need to clone it. + m = r.match("a") + self.assert_(copy.copy(m) is not m) + self.assert_(copy.deepcopy(m) is not m) + + self.assert_(m.string is not None) + m2 = copy.copy(m) + m2.detach_string() + self.assert_(m.string is not None) + self.assert_(m2.string is None) + + # The following behaviour matches that of the re module. + it = regex.finditer(".", "ab") + it2 = copy.copy(it) + self.assertEqual(it.next().group(), "a") + self.assertEqual(it2.next().group(), "b") + + # The following behaviour matches that of the re module. + it = regex.finditer(".", "ab") + it2 = copy.deepcopy(it) + self.assertEqual(it.next().group(), "a") + self.assertEqual(it2.next().group(), "b") + + # The following behaviour is designed to match that of copying 'finditer'. + it = regex.splititer(" ", "a b") + it2 = copy.copy(it) + self.assertEqual(it.next(), "a") + self.assertEqual(it2.next(), "b") + + # The following behaviour is designed to match that of copying 'finditer'. 
+ it = regex.splititer(" ", "a b") + it2 = copy.deepcopy(it) + self.assertEqual(it.next(), "a") + self.assertEqual(it2.next(), "b") + + def test_format(self): + self.assertEqual(regex.subf(r"(\w+) (\w+)", "{0} => {2} {1}", + "foo bar"), "foo bar => bar foo") + self.assertEqual(regex.subf(r"(?<word1>\w+) (?<word2>\w+)", + "{word2} {word1}", "foo bar"), "bar foo") + + self.assertEqual(regex.subfn(r"(\w+) (\w+)", "{0} => {2} {1}", + "foo bar"), ("foo bar => bar foo", 1)) + self.assertEqual(regex.subfn(r"(?<word1>\w+) (?<word2>\w+)", + "{word2} {word1}", "foo bar"), ("bar foo", 1)) + + self.assertEqual(regex.match(r"(\w+) (\w+)", + "foo bar").expandf("{0} => {2} {1}"), "foo bar => bar foo") + + def test_fullmatch(self): + self.assertEqual(bool(regex.fullmatch(r"abc", "abc")), True) + self.assertEqual(bool(regex.fullmatch(r"abc", "abcx")), False) + self.assertEqual(bool(regex.fullmatch(r"abc", "abcx", endpos=3)), True) + + self.assertEqual(bool(regex.fullmatch(r"abc", "xabc", pos=1)), True) + self.assertEqual(bool(regex.fullmatch(r"abc", "xabcy", pos=1)), False) + self.assertEqual(bool(regex.fullmatch(r"abc", "xabcy", pos=1, + endpos=4)), True) + + self.assertEqual(bool(regex.fullmatch(r"(?r)abc", "abc")), True) + self.assertEqual(bool(regex.fullmatch(r"(?r)abc", "abcx")), False) + self.assertEqual(bool(regex.fullmatch(r"(?r)abc", "abcx", endpos=3)), + True) + + self.assertEqual(bool(regex.fullmatch(r"(?r)abc", "xabc", pos=1)), + True) + self.assertEqual(bool(regex.fullmatch(r"(?r)abc", "xabcy", pos=1)), + False) + self.assertEqual(bool(regex.fullmatch(r"(?r)abc", "xabcy", pos=1, + endpos=4)), True) + + def test_hg_bugs(self): + # Hg issue 28. + self.assertEqual(bool(regex.compile("(?>b)", flags=regex.V1)), True) + + # Hg issue 29. + self.assertEqual(bool(regex.compile(r"^((?>\w+)|(?>\s+))*$", + flags=regex.V1)), True) + + # Hg issue 31. 
+ self.assertEqual(regex.findall(r"\((?:(?>[^()]+)|(?R))*\)", + "a(bcd(e)f)g(h)"), ['(bcd(e)f)', '(h)']) + self.assertEqual(regex.findall(r"\((?:(?:[^()]+)|(?R))*\)", + "a(bcd(e)f)g(h)"), ['(bcd(e)f)', '(h)']) + self.assertEqual(regex.findall(r"\((?:(?>[^()]+)|(?R))*\)", + "a(b(cd)e)f)g)h"), ['(b(cd)e)']) + self.assertEqual(regex.findall(r"\((?:(?>[^()]+)|(?R))*\)", + "a(bc(d(e)f)gh"), ['(d(e)f)']) + self.assertEqual(regex.findall(r"(?r)\((?:(?>[^()]+)|(?R))*\)", + "a(bc(d(e)f)gh"), ['(d(e)f)']) + self.assertEqual([m.group() for m in + regex.finditer(r"\((?:[^()]*+|(?0))*\)", "a(b(c(de)fg)h")], + ['(c(de)fg)']) + + # Hg issue 32. + self.assertEqual(regex.search("a(bc)d", "abcd", regex.I | + regex.V1).group(0), "abcd") + + # Hg issue 33. + self.assertEqual(regex.search("([\da-f:]+)$", "E", regex.I | + regex.V1).group(0), "E") + self.assertEqual(regex.search("([\da-f:]+)$", "e", regex.I | + regex.V1).group(0), "e") + + # Hg issue 34. + self.assertEqual(regex.search("^(?=ab(de))(abd)(e)", "abde").groups(), + ('de', 'abd', 'e')) + + # Hg issue 35. + self.assertEqual(bool(regex.match(r"\ ", " ", flags=regex.X)), True) + + # Hg issue 36. + self.assertEqual(regex.search(r"^(a|)\1{2}b", "b").group(0, 1), ('b', + '')) + + # Hg issue 37. + self.assertEqual(regex.search("^(a){0,0}", "abc").group(0, 1), ('', + None)) + + # Hg issue 38. + self.assertEqual(regex.search("(?>.*/)b", "a/b").group(0), "a/b") + + # Hg issue 39. + self.assertEqual(regex.search(r"(?V0)((?i)blah)\s+\1", + "blah BLAH").group(0, 1), ("blah BLAH", "blah")) + self.assertEqual(regex.search(r"(?V1)((?i)blah)\s+\1", "blah BLAH"), + None) + + # Hg issue 40. + self.assertEqual(regex.search(r"(\()?[^()]+(?(1)\)|)", + "(abcd").group(0), "abcd") + + # Hg issue 42. + self.assertEqual(regex.search("(a*)*", "a").span(1), (1, 1)) + self.assertEqual(regex.search("(a*)*", "aa").span(1), (2, 2)) + self.assertEqual(regex.search("(a*)*", "aaa").span(1), (3, 3)) + + # Hg issue 43. 
+ self.assertEqual(regex.search("a(?#xxx)*", "aaa").group(), "aaa") + + # Hg issue 44. + self.assertEqual(regex.search("(?=abc){3}abc", "abcabcabc").span(), (0, + 3)) + + # Hg issue 45. + self.assertEqual(regex.search("^(?:a(?:(?:))+)+", "a").span(), (0, 1)) + self.assertEqual(regex.search("^(?:a(?:(?:))+)+", "aa").span(), (0, 2)) + + # Hg issue 46. + self.assertEqual(regex.search("a(?x: b c )d", "abcd").group(0), "abcd") + + # Hg issue 47. + self.assertEqual(regex.search("a#comment\n*", "aaa", + flags=regex.X).group(0), "aaa") + + # Hg issue 48. + self.assertEqual(regex.search(r"(?V1)(a(?(1)\1)){1}", + "aaaaaaaaaa").span(0, 1), ((0, 1), (0, 1))) + self.assertEqual(regex.search(r"(?V1)(a(?(1)\1)){2}", + "aaaaaaaaaa").span(0, 1), ((0, 3), (1, 3))) + self.assertEqual(regex.search(r"(?V1)(a(?(1)\1)){3}", + "aaaaaaaaaa").span(0, 1), ((0, 6), (3, 6))) + self.assertEqual(regex.search(r"(?V1)(a(?(1)\1)){4}", + "aaaaaaaaaa").span(0, 1), ((0, 10), (6, 10))) + + # Hg issue 49. + self.assertEqual(regex.search("(?V1)(a)(?<=b(?1))", "baz").group(0), + "a") + + # Hg issue 50. + self.assertEqual(regex.findall(ur'(?fi)\L<keywords>', + u'POST, Post, post, po\u017Ft, po\uFB06, and po\uFB05', + keywords=['post','pos']), [u'POST', u'Post', u'post', u'po\u017Ft', + u'po\uFB06', u'po\uFB05']) + self.assertEqual(regex.findall(ur'(?fi)pos|post', + u'POST, Post, post, po\u017Ft, po\uFB06, and po\uFB05'), [u'POS', + u'Pos', u'pos', u'po\u017F', u'po\uFB06', u'po\uFB05']) + self.assertEqual(regex.findall(ur'(?fi)post|pos', + u'POST, Post, post, po\u017Ft, po\uFB06, and po\uFB05'), [u'POST', + u'Post', u'post', u'po\u017Ft', u'po\uFB06', u'po\uFB05']) + self.assertEqual(regex.findall(ur'(?fi)post|another', + u'POST, Post, post, po\u017Ft, po\uFB06, and po\uFB05'), [u'POST', + u'Post', u'post', u'po\u017Ft', u'po\uFB06', u'po\uFB05']) + + # Hg issue 51. + self.assertEqual(regex.search("(?V1)((a)(?1)|(?2))", "a").group(0, 1, + 2), ('a', 'a', None)) + + # Hg issue 52. 
+ self.assertEqual(regex.search(r"(?V1)(\1xx|){6}", "xx").span(0, 1), + ((0, 2), (2, 2))) + + # Hg issue 53. + self.assertEqual(regex.search("(a|)+", "a").group(0, 1), ("a", "")) + + # Hg issue 54. + self.assertEqual(regex.search(r"(a|)*\d", "a" * 80), None) + + # Hg issue 55. + self.assertEqual(regex.search("^(?:a?b?)*$", "ac"), None) + + # Hg issue 58. + self.assertRaisesRegex(regex.error, self.UNDEF_CHAR_NAME, lambda: + regex.compile("\\N{1}")) + + # Hg issue 59. + self.assertEqual(regex.search("\\Z", "a\na\n").span(0), (4, 4)) + + # Hg issue 60. + self.assertEqual(regex.search("(q1|.)*(q2|.)*(x(a|bc)*y){2,}", + "xayxay").group(0), "xayxay") + + # Hg issue 61. + self.assertEqual(regex.search("(?i)[^a]", "A"), None) + + # Hg issue 63. + self.assertEqual(regex.search(u"(?iu)[[:ascii:]]", u"\N{KELVIN SIGN}"), + None) + + # Hg issue 66. + self.assertEqual(regex.search("((a|b(?1)c){3,5})", "baaaaca").group(0, + 1, 2), ('aaaa', 'aaaa', 'a')) + + # Hg issue 71. + self.assertEqual(regex.findall(r"(?<=:\S+ )\w+", ":9 abc :10 def"), + ['abc', 'def']) + self.assertEqual(regex.findall(r"(?<=:\S* )\w+", ":9 abc :10 def"), + ['abc', 'def']) + self.assertEqual(regex.findall(r"(?<=:\S+? )\w+", ":9 abc :10 def"), + ['abc', 'def']) + self.assertEqual(regex.findall(r"(?<=:\S*? )\w+", ":9 abc :10 def"), + ['abc', 'def']) + + # Hg issue 73. + self.assertEqual(regex.search(r"(?:fe)?male", "female").group(), + "female") + self.assertEqual([m.group() for m in + regex.finditer(r"(fe)?male: h(?(1)(er)|(is)) (\w+)", + "female: her dog; male: his cat. asdsasda")], ['female: her dog', + 'male: his cat']) + + # Hg issue 78. + self.assertEqual(regex.search(r'(?<rec>\((?:[^()]++|(?&rec))*\))', + 'aaa(((1+0)+1)+1)bbb').captures('rec'), ['(1+0)', '((1+0)+1)', + '(((1+0)+1)+1)']) + + # Hg issue 80. + self.assertRaisesRegex(regex.error, self.BAD_ESCAPE, lambda: + regex.sub('x', '\\', 'x'), ) + + # Hg issue 82. 
+ fz = "(CAGCCTCCCATTTCAGAATATACATCC){1<e<=2}" + seq = "tcagacgagtgcgttgtaaaacgacggccagtCAGCCTCCCATTCAGAATATACATCCcgacggccagttaaaaacaatgccaaggaggtcatagctgtttcctgccagttaaaaacaatgccaaggaggtcatagctgtttcctgacgcactcgtctgagcgggctggcaagg" + self.assertEqual(regex.search(fz, seq, regex.BESTMATCH)[0], + "tCAGCCTCCCATTCAGAATATACATCC") + + # Hg issue 83. + self.assertEqual(regex.findall(r"c..+/c", "cA/c\ncAb/c"), ['cAb/c']) + + # Hg issue 85. + self.assertEqual(repr(regex.sub(ur"(?u)(\w+)", ur"[\1]", + u'\u0905\u0928\u094d\u200d\u0928 \u0d28\u0d4d\u200d \u0915\u093f\u0928', + regex.WORD)), + repr(u'[\u0905\u0928\u094d\u200d\u0928] [\u0d28\u0d4d\u200d] [\u0915\u093f\u0928]')) + + # Hg issue 88. + self.assertEqual(regex.match(r".*a.*ba.*aa", "ababba"), None) + + # Hg issue 87. + self.assertEqual(regex.match(r'(?<x>a(?<x>b))', "ab").spans("x"), [(1, + 2), (0, 2)]) + + # Hg issue 91. + # Check that the replacement cache works. + self.assertEqual(regex.sub(r'(-)', lambda m: m.expand(r'x'), 'a-b-c'), + 'axbxc') + + # Hg issue 94. + rx = regex.compile(r'\bt(est){i<2}', flags=regex.V1) + self.assertEqual(rx.search("Some text"), None) + self.assertEqual(rx.findall("Some text"), []) + + # Hg issue 95. + self.assertRaisesRegex(regex.error, self.MULTIPLE_REPEAT, lambda: + regex.compile(r'.???')) + + # Hg issue 97. + self.assertEqual(regex.escape(u'foo!?'), u'foo\\!\\?') + self.assertEqual(regex.escape(u'foo!?', special_only=True), u'foo!\\?') + + self.assertEqual(regex.escape('foo!?'), 'foo\\!\\?') + self.assertEqual(regex.escape('foo!?', special_only=True), + 'foo!\\?') + + # Hg issue 100. + self.assertEqual(regex.search('^([^z]*(?:WWWi|W))?$', + 'WWWi').groups(), ('WWWi', )) + self.assertEqual(regex.search('^([^z]*(?:WWWi|w))?$', + 'WWWi').groups(), ('WWWi', )) + self.assertEqual(regex.search('^([^z]*?(?:WWWi|W))?$', + 'WWWi').groups(), ('WWWi', )) + + # Hg issue 101. 
+ pat = regex.compile(r'xxx', flags=regex.FULLCASE | regex.UNICODE) + self.assertEqual([x.group() for x in pat.finditer('yxxx')], ['xxx']) + self.assertEqual(pat.findall('yxxx'), ['xxx']) + + raw = 'yxxx' + self.assertEqual([x.group() for x in pat.finditer(raw)], ['xxx']) + self.assertEqual(pat.findall(raw), ['xxx']) + + pat = regex.compile(r'xxx', flags=regex.FULLCASE | regex.IGNORECASE | + regex.UNICODE) + self.assertEqual([x.group() for x in pat.finditer('yxxx')], ['xxx']) + self.assertEqual(pat.findall('yxxx'), ['xxx']) + + raw = 'yxxx' + self.assertEqual([x.group() for x in pat.finditer(raw)], ['xxx']) + self.assertEqual(pat.findall(raw), ['xxx']) + + # Hg issue 106. + self.assertEqual(regex.sub('(?V0).*', 'x', 'test'), 'x') + self.assertEqual(regex.sub('(?V1).*', 'x', 'test'), 'xx') + + self.assertEqual(regex.sub('(?V0).*?', '|', 'test'), '|t|e|s|t|') + self.assertEqual(regex.sub('(?V1).*?', '|', 'test'), '|||||||||') + + # Hg issue 112. + self.assertEqual(regex.sub(r'^(@)\n(?!.*?@)(.*)', + r'\1\n==========\n\2', '@\n', flags=regex.DOTALL), '@\n==========\n') + + # Hg issue 109. 
+ self.assertEqual(regex.match(r'(?:cats|cat){e<=1}', + 'caz').fuzzy_counts, (1, 0, 0)) + self.assertEqual(regex.match(r'(?e)(?:cats|cat){e<=1}', + 'caz').fuzzy_counts, (1, 0, 0)) + self.assertEqual(regex.match(r'(?b)(?:cats|cat){e<=1}', + 'caz').fuzzy_counts, (1, 0, 0)) + + self.assertEqual(regex.match(r'(?:cat){e<=1}', 'caz').fuzzy_counts, + (1, 0, 0)) + self.assertEqual(regex.match(r'(?e)(?:cat){e<=1}', + 'caz').fuzzy_counts, (1, 0, 0)) + self.assertEqual(regex.match(r'(?b)(?:cat){e<=1}', + 'caz').fuzzy_counts, (1, 0, 0)) + + self.assertEqual(regex.match(r'(?:cats){e<=2}', 'c ats').fuzzy_counts, + (1, 1, 0)) + self.assertEqual(regex.match(r'(?e)(?:cats){e<=2}', + 'c ats').fuzzy_counts, (0, 1, 0)) + self.assertEqual(regex.match(r'(?b)(?:cats){e<=2}', + 'c ats').fuzzy_counts, (0, 1, 0)) + + self.assertEqual(regex.match(r'(?:cats){e<=2}', + 'c a ts').fuzzy_counts, (0, 2, 0)) + self.assertEqual(regex.match(r'(?e)(?:cats){e<=2}', + 'c a ts').fuzzy_counts, (0, 2, 0)) + self.assertEqual(regex.match(r'(?b)(?:cats){e<=2}', + 'c a ts').fuzzy_counts, (0, 2, 0)) + + self.assertEqual(regex.match(r'(?:cats){e<=1}', 'c ats').fuzzy_counts, + (0, 1, 0)) + self.assertEqual(regex.match(r'(?e)(?:cats){e<=1}', + 'c ats').fuzzy_counts, (0, 1, 0)) + self.assertEqual(regex.match(r'(?b)(?:cats){e<=1}', + 'c ats').fuzzy_counts, (0, 1, 0)) + + # Hg issue 115. + self.assertEqual(regex.findall(r'\bof ([a-z]+) of \1\b', + 'To make use of one of these modules'), []) + + # Hg issue 125. + self.assertEqual(regex.sub(r'x', r'\g<0>', 'x'), 'x') + + # Unreported issue: no such builtin as 'ascii' in Python 2. + self.assertEqual(bool(regex.match(r'a', 'a', regex.DEBUG)), True) + + # Hg issue 131. 
+ self.assertEqual(regex.findall(r'(?V1)[[b-e]--cd]', 'abcdef'), ['b', + 'e']) + self.assertEqual(regex.findall(r'(?V1)[b-e--cd]', 'abcdef'), ['b', + 'e']) + self.assertEqual(regex.findall(r'(?V1)[[bcde]--cd]', 'abcdef'), ['b', + 'e']) + self.assertEqual(regex.findall(r'(?V1)[bcde--cd]', 'abcdef'), ['b', + 'e']) + + # Hg issue 132. + self.assertRaisesRegex(regex.error, '^unknown property at position 4$', + lambda: regex.compile(ur'\p{}')) + + # Issue 23692. + self.assertEqual(regex.match('(?:()|(?(1)()|z)){2}(?(2)a|z)', + 'a').group(0, 1, 2), ('a', '', '')) + self.assertEqual(regex.match('(?:()|(?(1)()|z)){0,2}(?(2)a|z)', + 'a').group(0, 1, 2), ('a', '', '')) + + # Hg issue 137: Posix character class :punct: does not seem to be + # supported. + + # Posix compatibility as recommended here: + # http://www.unicode.org/reports/tr18/#Compatibility_Properties + + # Posix in Unicode. + chars = u''.join(unichr(c) for c in range(0x10000)) + + self.assertEqual(repr(u''.join(regex.findall(ur'''(?u)[[:alnum:]]+''', + chars))), repr(u''.join(regex.findall(ur'''(?u)[\p{Alpha}\p{PosixDigit}]+''', + chars)))) + self.assertEqual(repr(u''.join(regex.findall(ur'''(?u)[[:alpha:]]+''', + chars))), repr(u''.join(regex.findall(ur'''(?u)\p{Alpha}+''', + chars)))) + self.assertEqual(repr(u''.join(regex.findall(ur'''(?u)[[:ascii:]]+''', + chars))), repr(u''.join(regex.findall(ur'''(?u)[\p{InBasicLatin}]+''', + chars)))) + self.assertEqual(repr(u''.join(regex.findall(ur'''(?u)[[:blank:]]+''', + chars))), repr(u''.join(regex.findall(ur'''(?u)[\p{gc=Space_Separator}\t]+''', + chars)))) + self.assertEqual(repr(u''.join(regex.findall(ur'''(?u)[[:cntrl:]]+''', + chars))), repr(u''.join(regex.findall(ur'''(?u)\p{gc=Control}+''', chars)))) + self.assertEqual(repr(u''.join(regex.findall(ur'''(?u)[[:digit:]]+''', + chars))), repr(u''.join(regex.findall(ur'''(?u)[0-9]+''', chars)))) + self.assertEqual(repr(u''.join(regex.findall(ur'''(?u)[[:graph:]]+''', + chars))), 
repr(u''.join(regex.findall(ur'''(?u)[^\p{Space}\p{gc=Control}\p{gc=Surrogate}\p{gc=Unassigned}]+''', + chars)))) + self.assertEqual(repr(u''.join(regex.findall(ur'''(?u)[[:lower:]]+''', + chars))), repr(u''.join(regex.findall(ur'''(?u)\p{Lower}+''', + chars)))) + self.assertEqual(repr(u''.join(regex.findall(ur'''(?u)[[:print:]]+''', + chars))), repr(u''.join(regex.findall(ur'''(?uV1)[\p{Graph}\p{Blank}--\p{Cntrl}]+''', chars)))) + self.assertEqual(repr(u''.join(regex.findall(ur'''(?u)[[:punct:]]+''', + chars))), + repr(u''.join(regex.findall(ur'''(?uV1)[\p{gc=Punctuation}\p{gc=Symbol}--\p{Alpha}]+''', + chars)))) + self.assertEqual(repr(u''.join(regex.findall(ur'''(?u)[[:space:]]+''', + chars))), repr(u''.join(regex.findall(ur'''(?u)\p{Whitespace}+''', + chars)))) + self.assertEqual(repr(u''.join(regex.findall(ur'''(?u)[[:upper:]]+''', + chars))), repr(u''.join(regex.findall(ur'''(?u)\p{Upper}+''', + chars)))) + self.assertEqual(repr(u''.join(regex.findall(ur'''(?u)[[:word:]]+''', + chars))), repr(u''.join(regex.findall(ur'''(?u)[\p{Alpha}\p{gc=Mark}\p{Digit}\p{gc=Connector_Punctuation}\p{Join_Control}]+''', + chars)))) + self.assertEqual(repr(u''.join(regex.findall(ur'''(?u)[[:xdigit:]]+''', + chars))), repr(u''.join(regex.findall(ur'''(?u)[0-9A-Fa-f]+''', + chars)))) + + # Posix in ASCII. 
+ chars = ''.join(chr(c) for c in range(0x100)) + + self.assertEqual(repr(''.join(regex.findall(r'''[[:alnum:]]+''', + chars))), repr(''.join(regex.findall(r'''[\p{Alpha}\p{PosixDigit}]+''', + chars)))) + self.assertEqual(repr(''.join(regex.findall(r'''[[:alpha:]]+''', + chars))), repr(''.join(regex.findall(r'''\p{Alpha}+''', chars)))) + self.assertEqual(repr(''.join(regex.findall(r'''[[:ascii:]]+''', + chars))), repr(''.join(regex.findall(r'''[\x00-\x7F]+''', chars)))) + self.assertEqual(repr(''.join(regex.findall(r'''[[:blank:]]+''', + chars))), repr(''.join(regex.findall(r'''[\p{gc=Space_Separator}\t]+''', + chars)))) + self.assertEqual(repr(''.join(regex.findall(r'''[[:cntrl:]]+''', + chars))), repr(''.join(regex.findall(r'''\p{gc=Control}+''', + chars)))) + self.assertEqual(repr(''.join(regex.findall(r'''[[:digit:]]+''', + chars))), repr(''.join(regex.findall(r'''[0-9]+''', chars)))) + self.assertEqual(repr(''.join(regex.findall(r'''[[:graph:]]+''', + chars))), repr(''.join(regex.findall(r'''[^\p{Space}\p{gc=Control}\p{gc=Surrogate}\p{gc=Unassigned}]+''', chars)))) + self.assertEqual(repr(''.join(regex.findall(r'''[[:lower:]]+''', + chars))), repr(''.join(regex.findall(r'''\p{Lower}+''', chars)))) + self.assertEqual(repr(''.join(regex.findall(r'''[[:print:]]+''', + chars))), repr(''.join(regex.findall(r'''(?V1)[\p{Graph}\p{Blank}--\p{Cntrl}]+''', chars)))) + self.assertEqual(repr(''.join(regex.findall(r'''[[:punct:]]+''', + chars))), repr(''.join(regex.findall(r'''(?V1)[\p{gc=Punctuation}\p{gc=Symbol}--\p{Alpha}]+''', + chars)))) + self.assertEqual(repr(''.join(regex.findall(r'''[[:space:]]+''', + chars))), repr(''.join(regex.findall(r'''\p{Whitespace}+''', chars)))) + self.assertEqual(repr(''.join(regex.findall(r'''[[:upper:]]+''', + chars))), repr(''.join(regex.findall(r'''\p{Upper}+''', chars)))) + self.assertEqual(repr(''.join(regex.findall(r'''[[:word:]]+''', + chars))), 
repr(''.join(regex.findall(r'''[\p{Alpha}\p{gc=Mark}\p{Digit}\p{gc=Connector_Punctuation}\p{Join_Control}]+''', chars)))) + self.assertEqual(repr(''.join(regex.findall(r'''[[:xdigit:]]+''', + chars))), repr(''.join(regex.findall(r'''[0-9A-Fa-f]+''', chars)))) + + # Hg issue 138: grapheme anchored search not working properly. + self.assertEqual(repr(regex.search(ur'(?u)\X$', u'ab\u2103').group()), + repr(u'\u2103')) + + # Hg issue 139: Regular expression with multiple wildcards where first + # should match empty string does not always work. + self.assertEqual(regex.search("([^L]*)([^R]*R)", "LtR").groups(), ('', + 'LtR')) + + # Hg issue 140: Replace with REVERSE and groups has unexpected + # behavior. + self.assertEqual(regex.sub(r'(.)', r'x\1y', 'ab'), 'xayxby') + self.assertEqual(regex.sub(r'(?r)(.)', r'x\1y', 'ab'), 'xayxby') + + # Hg issue 141: Crash on a certain partial match. + self.assertEqual(regex.fullmatch('(a)*abc', 'ab', + partial=True).span(), (0, 2)) + self.assertEqual(regex.fullmatch('(a)*abc', 'ab', + partial=True).partial, True) + + # Hg Issue #143: Partial matches have incorrect span if prefix is '.' + # wildcard. + self.assertEqual(regex.search('OXRG', 'OOGOX', partial=True).span(), + (3, 5)) + self.assertEqual(regex.search('.XRG', 'OOGOX', partial=True).span(), + (3, 5)) + self.assertEqual(regex.search('.{1,3}XRG', 'OOGOX', + partial=True).span(), (1, 5)) + + # Hg issue 144: Latest version problem with matching 'R|R'. + self.assertEqual(regex.match('R|R', 'R').span(), (0, 1)) + + # Hg issue 146: Forced-fail (?!) works improperly in conditional. + self.assertEqual(regex.match(r'(.)(?(1)(?!))', 'xy'), None) + + # Groups cleared after failure. + self.assertEqual(regex.findall(r'(y)?(\d)(?(1)\b\B)', 'ax1y2z3b'), + [('', '1'), ('', '2'), ('', '3')]) + self.assertEqual(regex.findall(r'(y)?+(\d)(?(1)\b\B)', 'ax1y2z3b'), + [('', '1'), ('', '2'), ('', '3')]) + + # Hg issue 147: Fuzzy match can return match points beyond buffer end. 
+ self.assertEqual([m.span() for m in + regex.finditer(r'(?i)(?:error){e}', 'regex failure')], [(0, 5), (5, + 10), (10, 13), (13, 13)]) + self.assertEqual([m.span() for m in + regex.finditer(r'(?fi)(?:error){e}', 'regex failure')], [(0, 5), (5, + 10), (10, 13), (13, 13)]) + + # Hg issue 151: Request: \K. + self.assertEqual(regex.search(r'(ab\Kcd)', 'abcd').group(0, 1), ('cd', + 'abcd')) + self.assertEqual(regex.findall(r'\w\w\K\w\w', 'abcdefgh'), ['cd', + 'gh']) + self.assertEqual(regex.findall(r'(\w\w\K\w\w)', 'abcdefgh'), ['abcd', + 'efgh']) + + self.assertEqual(regex.search(r'(?r)(ab\Kcd)', 'abcd').group(0, 1), + ('ab', 'abcd')) + self.assertEqual(regex.findall(r'(?r)\w\w\K\w\w', 'abcdefgh'), ['ef', + 'ab']) + self.assertEqual(regex.findall(r'(?r)(\w\w\K\w\w)', 'abcdefgh'), + ['efgh', 'abcd']) + + # Hg issue 153: Request: (*SKIP). + self.assertEqual(regex.search(r'12(*FAIL)|3', '123')[0], '3') + self.assertEqual(regex.search(r'(?r)12(*FAIL)|3', '123')[0], '3') + + self.assertEqual(regex.search(r'\d+(*PRUNE)\d', '123'), None) + self.assertEqual(regex.search(r'\d+(?=(*PRUNE))\d', '123')[0], '123') + self.assertEqual(regex.search(r'\d+(*PRUNE)bcd|[3d]', '123bcd')[0], + '123bcd') + self.assertEqual(regex.search(r'\d+(*PRUNE)bcd|[3d]', '123zzd')[0], + 'd') + self.assertEqual(regex.search(r'\d+?(*PRUNE)bcd|[3d]', '123bcd')[0], + '3bcd') + self.assertEqual(regex.search(r'\d+?(*PRUNE)bcd|[3d]', '123zzd')[0], + 'd') + self.assertEqual(regex.search(r'\d++(?<=3(*PRUNE))zzd|[4d]$', + '123zzd')[0], '123zzd') + self.assertEqual(regex.search(r'\d++(?<=3(*PRUNE))zzd|[4d]$', + '124zzd')[0], 'd') + self.assertEqual(regex.search(r'\d++(?<=(*PRUNE)3)zzd|[4d]$', + '124zzd')[0], 'd') + self.assertEqual(regex.search(r'\d++(?<=2(*PRUNE)3)zzd|[3d]$', + '124zzd')[0], 'd') + + self.assertEqual(regex.search(r'(?r)\d(*PRUNE)\d+', '123'), None) + self.assertEqual(regex.search(r'(?r)\d(?<=(*PRUNE))\d+', '123')[0], + '123') + self.assertEqual(regex.search(r'(?r)\d+(*PRUNE)bcd|[3d]', + 
'123bcd')[0], '123bcd') + self.assertEqual(regex.search(r'(?r)\d+(*PRUNE)bcd|[3d]', + '123zzd')[0], 'd') + self.assertEqual(regex.search(r'(?r)\d++(?<=3(*PRUNE))zzd|[4d]$', + '123zzd')[0], '123zzd') + self.assertEqual(regex.search(r'(?r)\d++(?<=3(*PRUNE))zzd|[4d]$', + '124zzd')[0], 'd') + self.assertEqual(regex.search(r'(?r)\d++(?<=(*PRUNE)3)zzd|[4d]$', + '124zzd')[0], 'd') + self.assertEqual(regex.search(r'(?r)\d++(?<=2(*PRUNE)3)zzd|[3d]$', + '124zzd')[0], 'd') + + self.assertEqual(regex.search(r'\d+(*SKIP)bcd|[3d]', '123bcd')[0], + '123bcd') + self.assertEqual(regex.search(r'\d+(*SKIP)bcd|[3d]', '123zzd')[0], + 'd') + self.assertEqual(regex.search(r'\d+?(*SKIP)bcd|[3d]', '123bcd')[0], + '3bcd') + self.assertEqual(regex.search(r'\d+?(*SKIP)bcd|[3d]', '123zzd')[0], + 'd') + self.assertEqual(regex.search(r'\d++(?<=3(*SKIP))zzd|[4d]$', + '123zzd')[0], '123zzd') + self.assertEqual(regex.search(r'\d++(?<=3(*SKIP))zzd|[4d]$', + '124zzd')[0], 'd') + self.assertEqual(regex.search(r'\d++(?<=(*SKIP)3)zzd|[4d]$', + '124zzd')[0], 'd') + self.assertEqual(regex.search(r'\d++(?<=2(*SKIP)3)zzd|[3d]$', + '124zzd')[0], 'd') + + self.assertEqual(regex.search(r'(?r)\d+(*SKIP)bcd|[3d]', '123bcd')[0], + '123bcd') + self.assertEqual(regex.search(r'(?r)\d+(*SKIP)bcd|[3d]', '123zzd')[0], + 'd') + self.assertEqual(regex.search(r'(?r)\d++(?<=3(*SKIP))zzd|[4d]$', + '123zzd')[0], '123zzd') + self.assertEqual(regex.search(r'(?r)\d++(?<=3(*SKIP))zzd|[4d]$', + '124zzd')[0], 'd') + self.assertEqual(regex.search(r'(?r)\d++(?<=(*SKIP)3)zzd|[4d]$', + '124zzd')[0], 'd') + self.assertEqual(regex.search(r'(?r)\d++(?<=2(*SKIP)3)zzd|[3d]$', + '124zzd')[0], 'd') + + # Hg issue 152: Request: Request: (?(DEFINE)...). + self.assertEqual(regex.search(r'(?(DEFINE)(?<quant>\d+)(?<item>\w+))(?&quant) (?&item)', + '5 elephants')[0], '5 elephants') + + # Hg issue 150: Have an option for POSIX-compatible longest match of + # alternates. 
+ self.assertEqual(regex.search(r'(?p)\d+(\w(\d*)?|[eE]([+-]\d+))', + '10b12')[0], '10b12') + self.assertEqual(regex.search(r'(?p)\d+(\w(\d*)?|[eE]([+-]\d+))', + '10E+12')[0], '10E+12') + + self.assertEqual(regex.search(r'(?p)(\w|ae|oe|ue|ss)', 'ae')[0], 'ae') + self.assertEqual(regex.search(r'(?p)one(self)?(selfsufficient)?', + 'oneselfsufficient')[0], 'oneselfsufficient') + + # Hg issue 156: regression on atomic grouping + self.assertEqual(regex.match('1(?>2)', '12').span(), (0, 2)) + + # Hg issue 157: regression: segfault on complex lookaround + self.assertEqual(regex.match(r'(?V1w)(?=(?=[^A-Z]*+[A-Z])(?=[^a-z]*+[a-z]))(?=\D*+\d)(?=\p{Alphanumeric}*+\P{Alphanumeric})\A(?s:.){8,255}+\Z', + 'AAaa11!!')[0], 'AAaa11!!') + + # Hg issue 158: Group issue with (?(DEFINE)...) + TEST_REGEX = regex.compile(r'''(?smx) +(?(DEFINE) + (?<subcat> + ^,[^,]+, + ) +) + +# Group 2 is defined on this line +^,([^,]+), + +(?:(?!(?&subcat)[\r\n]+(?&subcat)).)+ +''') + + TEST_DATA = ''' +,Cat 1, +,Brand 1, +some +thing +,Brand 2, +other +things +,Cat 2, +,Brand, +Some +thing +''' + + self.assertEqual([m.span(1, 2) for m in + TEST_REGEX.finditer(TEST_DATA)], [((-1, -1), (2, 7)), ((-1, -1), (54, + 59))]) + + # Hg issue 161: Unexpected fuzzy match results + self.assertEqual(regex.search('(abcdefgh){e}', + '******abcdefghijklmnopqrtuvwxyz', regex.BESTMATCH).span(), (6, 14)) + self.assertEqual(regex.search('(abcdefghi){e}', + '******abcdefghijklmnopqrtuvwxyz', regex.BESTMATCH).span(), (6, 15)) + + # Hg issue 163: allow lookarounds in conditionals. + self.assertEqual(regex.match(r'(?:(?=\d)\d+\b|\w+)', '123abc').span(), + (0, 6)) + self.assertEqual(regex.match(r'(?(?=\d)\d+\b|\w+)', '123abc'), None) + self.assertEqual(regex.search(r'(?(?<=love\s)you|(?<=hate\s)her)', + "I love you").span(), (7, 10)) + self.assertEqual(regex.findall(r'(?(?<=love\s)you|(?<=hate\s)her)', + "I love you but I don't hate her either"), ['you', 'her']) + + # Hg issue #180: bug of POSIX matching. 
+ self.assertEqual(regex.search(r'(?p)a*(.*?)', 'aaabbb').group(0, 1), + ('aaabbb', 'bbb')) + self.assertEqual(regex.search(r'(?p)a*(.*)', 'aaabbb').group(0, 1), + ('aaabbb', 'bbb')) + self.assertEqual(regex.sub(r'(?p)a*(.*?)', r'\1', 'aaabbb'), 'bbb') + self.assertEqual(regex.sub(r'(?p)a*(.*)', r'\1', 'aaabbb'), 'bbb') + + def test_subscripted_captures(self): + self.assertEqual(regex.match(r'(?P<x>.)+', + 'abc').expandf('{0} {0[0]} {0[-1]}'), 'abc abc abc') + self.assertEqual(regex.match(r'(?P<x>.)+', + 'abc').expandf('{1} {1[0]} {1[1]} {1[2]} {1[-1]} {1[-2]} {1[-3]}'), + 'c a b c c b a') + self.assertEqual(regex.match(r'(?P<x>.)+', + 'abc').expandf('{x} {x[0]} {x[1]} {x[2]} {x[-1]} {x[-2]} {x[-3]}'), + 'c a b c c b a') + + self.assertEqual(regex.subf(r'(?P<x>.)+', r'{0} {0[0]} {0[-1]}', + 'abc'), 'abc abc abc') + self.assertEqual(regex.subf(r'(?P<x>.)+', + '{1} {1[0]} {1[1]} {1[2]} {1[-1]} {1[-2]} {1[-3]}', 'abc'), + 'c a b c c b a') + self.assertEqual(regex.subf(r'(?P<x>.)+', + '{x} {x[0]} {x[1]} {x[2]} {x[-1]} {x[-2]} {x[-3]}', 'abc'), + 'c a b c c b a') + +if not hasattr(str, "format"): + # Strings don't have the .format method (below Python 2.6). + del RegexTests.test_format + del RegexTests.test_subscripted_captures + +def test_main(): + run_unittest(RegexTests) + +if __name__ == "__main__": + test_main()