123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192 |
- #! /usr/bin/env python
- r"""Convert old ("regex") regular expressions to new syntax ("re").
- When imported as a module, there are two functions, with their own
- strings:
- convert(s, syntax=None) -- convert a regex regular expression to re syntax
- quote(s) -- return a quoted string literal
- When used as a script, read a Python string literal (or any other
- expression evaluating to a string) from stdin, and write the
- translated expression to stdout as a string literal. Unless stdout is
- a tty, no trailing \n is written to stdout. This is done so that it
- can be used with Emacs C-U M-| (shell-command-on-region with argument
- which filters the region through the shell command).
- No attempt has been made at coding for performance.
- Translation table...
- \( ( (unless RE_NO_BK_PARENS set)
- \) ) (unless RE_NO_BK_PARENS set)
- \| | (unless RE_NO_BK_VBAR set)
- \< \b (not quite the same, but alla...)
- \> \b (not quite the same, but alla...)
- \` \A
- \' \Z
- Not translated...
- .
- ^
- $
- *
- + (unless RE_BK_PLUS_QM set, then to \+)
- ? (unless RE_BK_PLUS_QM set, then to \?)
- \
- \b
- \B
- \w
- \W
- \1 ... \9
- Special cases...
- Non-printable characters are always replaced by their 3-digit
- escape code (except \t, \n, \r, which use mnemonic escapes)
- Newline is turned into | when RE_NEWLINE_OR is set
- XXX To be done...
- [...] (different treatment of backslashed items?)
- [^...] (different treatment of backslashed items?)
- ^ $ * + ? (in some error contexts these are probably treated differently)
- \vDD \DD (in the regex docs but only works when RE_ANSI_HEX set)
- """
- import warnings
- warnings.filterwarnings("ignore", ".* regex .*", DeprecationWarning, __name__,
- append=1)
- import regex
- from regex_syntax import * # RE_*
- __all__ = ["convert","quote"]
- # Default translation table
- mastertable = {
- r'\<': r'\b',
- r'\>': r'\b',
- r'\`': r'\A',
- r'\'': r'\Z',
- r'\(': '(',
- r'\)': ')',
- r'\|': '|',
- '(': r'\(',
- ')': r'\)',
- '|': r'\|',
- '\t': r'\t',
- '\n': r'\n',
- '\r': r'\r',
- }
- def convert(s, syntax=None):
- """Convert a regex regular expression to re syntax.
- The first argument is the regular expression, as a string object,
- just like it would be passed to regex.compile(). (I.e., pass the
- actual string object -- string quotes must already have been
- removed and the standard escape processing has already been done,
- e.g. by eval().)
- The optional second argument is the regex syntax variant to be
- used. This is an integer mask as passed to regex.set_syntax();
- the flag bits are defined in regex_syntax. When not specified, or
- when None is given, the current regex syntax mask (as retrieved by
- regex.get_syntax()) is used -- which is 0 by default.
- The return value is a regular expression, as a string object that
- could be passed to re.compile(). (I.e., no string quotes have
- been added -- use quote() below, or repr().)
- The conversion is not always guaranteed to be correct. More
- syntactical analysis should be performed to detect borderline
- cases and decide what to do with them. For example, 'x*?' is not
- translated correctly.
- """
- table = mastertable.copy()
- if syntax is None:
- syntax = regex.get_syntax()
- if syntax & RE_NO_BK_PARENS:
- del table[r'\('], table[r'\)']
- del table['('], table[')']
- if syntax & RE_NO_BK_VBAR:
- del table[r'\|']
- del table['|']
- if syntax & RE_BK_PLUS_QM:
- table['+'] = r'\+'
- table['?'] = r'\?'
- table[r'\+'] = '+'
- table[r'\?'] = '?'
- if syntax & RE_NEWLINE_OR:
- table['\n'] = '|'
- res = ""
- i = 0
- end = len(s)
- while i < end:
- c = s[i]
- i = i+1
- if c == '\\':
- c = s[i]
- i = i+1
- key = '\\' + c
- key = table.get(key, key)
- res = res + key
- else:
- c = table.get(c, c)
- res = res + c
- return res
- def quote(s, quote=None):
- """Convert a string object to a quoted string literal.
- This is similar to repr() but will return a "raw" string (r'...'
- or r"...") when the string contains backslashes, instead of
- doubling all backslashes. The resulting string does *not* always
- evaluate to the same string as the original; however it will do
- just the right thing when passed into re.compile().
- The optional second argument forces the string quote; it must be
- a single character which is a valid Python string quote.
- """
- if quote is None:
- q = "'"
- altq = "'"
- if q in s and altq not in s:
- q = altq
- else:
- assert quote in ('"', "'")
- q = quote
- res = q
- for c in s:
- if c == q: c = '\\' + c
- elif c < ' ' or c > '~': c = "\\%03o" % ord(c)
- res = res + c
- res = res + q
- if '\\' in res:
- res = 'r' + res
- return res
- def main():
- """Main program -- called when run as a script."""
- import sys
- s = eval(sys.stdin.read())
- sys.stdout.write(quote(convert(s)))
- if sys.stdout.isatty():
- sys.stdout.write("\n")
- if __name__ == '__main__':
- main()
|