sre_parse.py 40 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076
  1. #
  2. # Secret Labs' Regular Expression Engine
  3. #
  4. # convert re-style regular expression to sre pattern
  5. #
  6. # Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
  7. #
  8. # See the sre.py file for information on usage and redistribution.
  9. #
  10. """Internal support module for sre"""
  11. # XXX: show string offset and offending character for all errors
  12. from sre_constants import *
  13. SPECIAL_CHARS = ".\\[{()*+?^$|"
  14. REPEAT_CHARS = "*+?{"
  15. DIGITS = frozenset("0123456789")
  16. OCTDIGITS = frozenset("01234567")
  17. HEXDIGITS = frozenset("0123456789abcdefABCDEF")
  18. ASCIILETTERS = frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
  19. WHITESPACE = frozenset(" \t\n\r\v\f")
  20. _REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT})
  21. _UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY})
  22. ESCAPES = {
  23. r"\a": (LITERAL, ord("\a")),
  24. r"\b": (LITERAL, ord("\b")),
  25. r"\f": (LITERAL, ord("\f")),
  26. r"\n": (LITERAL, ord("\n")),
  27. r"\r": (LITERAL, ord("\r")),
  28. r"\t": (LITERAL, ord("\t")),
  29. r"\v": (LITERAL, ord("\v")),
  30. r"\\": (LITERAL, ord("\\"))
  31. }
  32. CATEGORIES = {
  33. r"\A": (AT, AT_BEGINNING_STRING), # start of string
  34. r"\b": (AT, AT_BOUNDARY),
  35. r"\B": (AT, AT_NON_BOUNDARY),
  36. r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
  37. r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
  38. r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
  39. r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
  40. r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
  41. r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
  42. r"\Z": (AT, AT_END_STRING), # end of string
  43. }
  44. FLAGS = {
  45. # standard flags
  46. "i": SRE_FLAG_IGNORECASE,
  47. "L": SRE_FLAG_LOCALE,
  48. "m": SRE_FLAG_MULTILINE,
  49. "s": SRE_FLAG_DOTALL,
  50. "x": SRE_FLAG_VERBOSE,
  51. # extensions
  52. "a": SRE_FLAG_ASCII,
  53. "t": SRE_FLAG_TEMPLATE,
  54. "u": SRE_FLAG_UNICODE,
  55. }
  56. TYPE_FLAGS = SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE
  57. GLOBAL_FLAGS = SRE_FLAG_DEBUG | SRE_FLAG_TEMPLATE
  58. class Verbose(Exception):
  59. pass
  60. class State:
  61. # keeps track of state for parsing
  62. def __init__(self):
  63. self.flags = 0
  64. self.groupdict = {}
  65. self.groupwidths = [None] # group 0
  66. self.lookbehindgroups = None
  67. self.grouprefpos = {}
  68. @property
  69. def groups(self):
  70. return len(self.groupwidths)
  71. def opengroup(self, name=None):
  72. gid = self.groups
  73. self.groupwidths.append(None)
  74. if self.groups > MAXGROUPS:
  75. raise error("too many groups")
  76. if name is not None:
  77. ogid = self.groupdict.get(name, None)
  78. if ogid is not None:
  79. raise error("redefinition of group name %r as group %d; "
  80. "was group %d" % (name, gid, ogid))
  81. self.groupdict[name] = gid
  82. return gid
  83. def closegroup(self, gid, p):
  84. self.groupwidths[gid] = p.getwidth()
  85. def checkgroup(self, gid):
  86. return gid < self.groups and self.groupwidths[gid] is not None
  87. def checklookbehindgroup(self, gid, source):
  88. if self.lookbehindgroups is not None:
  89. if not self.checkgroup(gid):
  90. raise source.error('cannot refer to an open group')
  91. if gid >= self.lookbehindgroups:
  92. raise source.error('cannot refer to group defined in the same '
  93. 'lookbehind subpattern')
  94. class SubPattern:
  95. # a subpattern, in intermediate form
  96. def __init__(self, state, data=None):
  97. self.state = state
  98. if data is None:
  99. data = []
  100. self.data = data
  101. self.width = None
  102. def dump(self, level=0):
  103. nl = True
  104. seqtypes = (tuple, list)
  105. for op, av in self.data:
  106. print(level*" " + str(op), end='')
  107. if op is IN:
  108. # member sublanguage
  109. print()
  110. for op, a in av:
  111. print((level+1)*" " + str(op), a)
  112. elif op is BRANCH:
  113. print()
  114. for i, a in enumerate(av[1]):
  115. if i:
  116. print(level*" " + "OR")
  117. a.dump(level+1)
  118. elif op is GROUPREF_EXISTS:
  119. condgroup, item_yes, item_no = av
  120. print('', condgroup)
  121. item_yes.dump(level+1)
  122. if item_no:
  123. print(level*" " + "ELSE")
  124. item_no.dump(level+1)
  125. elif isinstance(av, seqtypes):
  126. nl = False
  127. for a in av:
  128. if isinstance(a, SubPattern):
  129. if not nl:
  130. print()
  131. a.dump(level+1)
  132. nl = True
  133. else:
  134. if not nl:
  135. print(' ', end='')
  136. print(a, end='')
  137. nl = False
  138. if not nl:
  139. print()
  140. else:
  141. print('', av)
  142. def __repr__(self):
  143. return repr(self.data)
  144. def __len__(self):
  145. return len(self.data)
  146. def __delitem__(self, index):
  147. del self.data[index]
  148. def __getitem__(self, index):
  149. if isinstance(index, slice):
  150. return SubPattern(self.state, self.data[index])
  151. return self.data[index]
  152. def __setitem__(self, index, code):
  153. self.data[index] = code
  154. def insert(self, index, code):
  155. self.data.insert(index, code)
  156. def append(self, code):
  157. self.data.append(code)
  158. def getwidth(self):
  159. # determine the width (min, max) for this subpattern
  160. if self.width is not None:
  161. return self.width
  162. lo = hi = 0
  163. for op, av in self.data:
  164. if op is BRANCH:
  165. i = MAXREPEAT - 1
  166. j = 0
  167. for av in av[1]:
  168. l, h = av.getwidth()
  169. i = min(i, l)
  170. j = max(j, h)
  171. lo = lo + i
  172. hi = hi + j
  173. elif op is CALL:
  174. i, j = av.getwidth()
  175. lo = lo + i
  176. hi = hi + j
  177. elif op is SUBPATTERN:
  178. i, j = av[-1].getwidth()
  179. lo = lo + i
  180. hi = hi + j
  181. elif op in _REPEATCODES:
  182. i, j = av[2].getwidth()
  183. lo = lo + i * av[0]
  184. hi = hi + j * av[1]
  185. elif op in _UNITCODES:
  186. lo = lo + 1
  187. hi = hi + 1
  188. elif op is GROUPREF:
  189. i, j = self.state.groupwidths[av]
  190. lo = lo + i
  191. hi = hi + j
  192. elif op is GROUPREF_EXISTS:
  193. i, j = av[1].getwidth()
  194. if av[2] is not None:
  195. l, h = av[2].getwidth()
  196. i = min(i, l)
  197. j = max(j, h)
  198. else:
  199. i = 0
  200. lo = lo + i
  201. hi = hi + j
  202. elif op is SUCCESS:
  203. break
  204. self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT)
  205. return self.width
  206. class Tokenizer:
  207. def __init__(self, string):
  208. self.istext = isinstance(string, str)
  209. self.string = string
  210. if not self.istext:
  211. string = str(string, 'latin1')
  212. self.decoded_string = string
  213. self.index = 0
  214. self.next = None
  215. self.__next()
  216. def __next(self):
  217. index = self.index
  218. try:
  219. char = self.decoded_string[index]
  220. except IndexError:
  221. self.next = None
  222. return
  223. if char == "\\":
  224. index += 1
  225. try:
  226. char += self.decoded_string[index]
  227. except IndexError:
  228. raise error("bad escape (end of pattern)",
  229. self.string, len(self.string) - 1) from None
  230. self.index = index + 1
  231. self.next = char
  232. def match(self, char):
  233. if char == self.next:
  234. self.__next()
  235. return True
  236. return False
  237. def get(self):
  238. this = self.next
  239. self.__next()
  240. return this
  241. def getwhile(self, n, charset):
  242. result = ''
  243. for _ in range(n):
  244. c = self.next
  245. if c not in charset:
  246. break
  247. result += c
  248. self.__next()
  249. return result
  250. def getuntil(self, terminator, name):
  251. result = ''
  252. while True:
  253. c = self.next
  254. self.__next()
  255. if c is None:
  256. if not result:
  257. raise self.error("missing " + name)
  258. raise self.error("missing %s, unterminated name" % terminator,
  259. len(result))
  260. if c == terminator:
  261. if not result:
  262. raise self.error("missing " + name, 1)
  263. break
  264. result += c
  265. return result
  266. @property
  267. def pos(self):
  268. return self.index - len(self.next or '')
  269. def tell(self):
  270. return self.index - len(self.next or '')
  271. def seek(self, index):
  272. self.index = index
  273. self.__next()
  274. def error(self, msg, offset=0):
  275. return error(msg, self.string, self.tell() - offset)
  276. def _class_escape(source, escape):
  277. # handle escape code inside character class
  278. code = ESCAPES.get(escape)
  279. if code:
  280. return code
  281. code = CATEGORIES.get(escape)
  282. if code and code[0] is IN:
  283. return code
  284. try:
  285. c = escape[1:2]
  286. if c == "x":
  287. # hexadecimal escape (exactly two digits)
  288. escape += source.getwhile(2, HEXDIGITS)
  289. if len(escape) != 4:
  290. raise source.error("incomplete escape %s" % escape, len(escape))
  291. return LITERAL, int(escape[2:], 16)
  292. elif c == "u" and source.istext:
  293. # unicode escape (exactly four digits)
  294. escape += source.getwhile(4, HEXDIGITS)
  295. if len(escape) != 6:
  296. raise source.error("incomplete escape %s" % escape, len(escape))
  297. return LITERAL, int(escape[2:], 16)
  298. elif c == "U" and source.istext:
  299. # unicode escape (exactly eight digits)
  300. escape += source.getwhile(8, HEXDIGITS)
  301. if len(escape) != 10:
  302. raise source.error("incomplete escape %s" % escape, len(escape))
  303. c = int(escape[2:], 16)
  304. chr(c) # raise ValueError for invalid code
  305. return LITERAL, c
  306. elif c == "N" and source.istext:
  307. import unicodedata
  308. # named unicode escape e.g. \N{EM DASH}
  309. if not source.match('{'):
  310. raise source.error("missing {")
  311. charname = source.getuntil('}', 'character name')
  312. try:
  313. c = ord(unicodedata.lookup(charname))
  314. except (KeyError, TypeError):
  315. raise source.error("undefined character name %r" % charname,
  316. len(charname) + len(r'\N{}'))
  317. return LITERAL, c
  318. elif c in OCTDIGITS:
  319. # octal escape (up to three digits)
  320. escape += source.getwhile(2, OCTDIGITS)
  321. c = int(escape[1:], 8)
  322. if c > 0o377:
  323. raise source.error('octal escape value %s outside of '
  324. 'range 0-0o377' % escape, len(escape))
  325. return LITERAL, c
  326. elif c in DIGITS:
  327. raise ValueError
  328. if len(escape) == 2:
  329. if c in ASCIILETTERS:
  330. raise source.error('bad escape %s' % escape, len(escape))
  331. return LITERAL, ord(escape[1])
  332. except ValueError:
  333. pass
  334. raise source.error("bad escape %s" % escape, len(escape))
  335. def _escape(source, escape, state):
  336. # handle escape code in expression
  337. code = CATEGORIES.get(escape)
  338. if code:
  339. return code
  340. code = ESCAPES.get(escape)
  341. if code:
  342. return code
  343. try:
  344. c = escape[1:2]
  345. if c == "x":
  346. # hexadecimal escape
  347. escape += source.getwhile(2, HEXDIGITS)
  348. if len(escape) != 4:
  349. raise source.error("incomplete escape %s" % escape, len(escape))
  350. return LITERAL, int(escape[2:], 16)
  351. elif c == "u" and source.istext:
  352. # unicode escape (exactly four digits)
  353. escape += source.getwhile(4, HEXDIGITS)
  354. if len(escape) != 6:
  355. raise source.error("incomplete escape %s" % escape, len(escape))
  356. return LITERAL, int(escape[2:], 16)
  357. elif c == "U" and source.istext:
  358. # unicode escape (exactly eight digits)
  359. escape += source.getwhile(8, HEXDIGITS)
  360. if len(escape) != 10:
  361. raise source.error("incomplete escape %s" % escape, len(escape))
  362. c = int(escape[2:], 16)
  363. chr(c) # raise ValueError for invalid code
  364. return LITERAL, c
  365. elif c == "N" and source.istext:
  366. import unicodedata
  367. # named unicode escape e.g. \N{EM DASH}
  368. if not source.match('{'):
  369. raise source.error("missing {")
  370. charname = source.getuntil('}', 'character name')
  371. try:
  372. c = ord(unicodedata.lookup(charname))
  373. except (KeyError, TypeError):
  374. raise source.error("undefined character name %r" % charname,
  375. len(charname) + len(r'\N{}'))
  376. return LITERAL, c
  377. elif c == "0":
  378. # octal escape
  379. escape += source.getwhile(2, OCTDIGITS)
  380. return LITERAL, int(escape[1:], 8)
  381. elif c in DIGITS:
  382. # octal escape *or* decimal group reference (sigh)
  383. if source.next in DIGITS:
  384. escape += source.get()
  385. if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and
  386. source.next in OCTDIGITS):
  387. # got three octal digits; this is an octal escape
  388. escape += source.get()
  389. c = int(escape[1:], 8)
  390. if c > 0o377:
  391. raise source.error('octal escape value %s outside of '
  392. 'range 0-0o377' % escape,
  393. len(escape))
  394. return LITERAL, c
  395. # not an octal escape, so this is a group reference
  396. group = int(escape[1:])
  397. if group < state.groups:
  398. if not state.checkgroup(group):
  399. raise source.error("cannot refer to an open group",
  400. len(escape))
  401. state.checklookbehindgroup(group, source)
  402. return GROUPREF, group
  403. raise source.error("invalid group reference %d" % group, len(escape) - 1)
  404. if len(escape) == 2:
  405. if c in ASCIILETTERS:
  406. raise source.error("bad escape %s" % escape, len(escape))
  407. return LITERAL, ord(escape[1])
  408. except ValueError:
  409. pass
  410. raise source.error("bad escape %s" % escape, len(escape))
  411. def _uniq(items):
  412. return list(dict.fromkeys(items))
  413. def _parse_sub(source, state, verbose, nested):
  414. # parse an alternation: a|b|c
  415. items = []
  416. itemsappend = items.append
  417. sourcematch = source.match
  418. start = source.tell()
  419. while True:
  420. itemsappend(_parse(source, state, verbose, nested + 1,
  421. not nested and not items))
  422. if not sourcematch("|"):
  423. break
  424. if len(items) == 1:
  425. return items[0]
  426. subpattern = SubPattern(state)
  427. # check if all items share a common prefix
  428. while True:
  429. prefix = None
  430. for item in items:
  431. if not item:
  432. break
  433. if prefix is None:
  434. prefix = item[0]
  435. elif item[0] != prefix:
  436. break
  437. else:
  438. # all subitems start with a common "prefix".
  439. # move it out of the branch
  440. for item in items:
  441. del item[0]
  442. subpattern.append(prefix)
  443. continue # check next one
  444. break
  445. # check if the branch can be replaced by a character set
  446. set = []
  447. for item in items:
  448. if len(item) != 1:
  449. break
  450. op, av = item[0]
  451. if op is LITERAL:
  452. set.append((op, av))
  453. elif op is IN and av[0][0] is not NEGATE:
  454. set.extend(av)
  455. else:
  456. break
  457. else:
  458. # we can store this as a character set instead of a
  459. # branch (the compiler may optimize this even more)
  460. subpattern.append((IN, _uniq(set)))
  461. return subpattern
  462. subpattern.append((BRANCH, (None, items)))
  463. return subpattern
  464. def _parse(source, state, verbose, nested, first=False):
  465. # parse a simple pattern
  466. subpattern = SubPattern(state)
  467. # precompute constants into local variables
  468. subpatternappend = subpattern.append
  469. sourceget = source.get
  470. sourcematch = source.match
  471. _len = len
  472. _ord = ord
  473. while True:
  474. this = source.next
  475. if this is None:
  476. break # end of pattern
  477. if this in "|)":
  478. break # end of subpattern
  479. sourceget()
  480. if verbose:
  481. # skip whitespace and comments
  482. if this in WHITESPACE:
  483. continue
  484. if this == "#":
  485. while True:
  486. this = sourceget()
  487. if this is None or this == "\n":
  488. break
  489. continue
  490. if this[0] == "\\":
  491. code = _escape(source, this, state)
  492. subpatternappend(code)
  493. elif this not in SPECIAL_CHARS:
  494. subpatternappend((LITERAL, _ord(this)))
  495. elif this == "[":
  496. here = source.tell() - 1
  497. # character set
  498. set = []
  499. setappend = set.append
  500. ## if sourcematch(":"):
  501. ## pass # handle character classes
  502. if source.next == '[':
  503. import warnings
  504. warnings.warn(
  505. 'Possible nested set at position %d' % source.tell(),
  506. FutureWarning, stacklevel=nested + 6
  507. )
  508. negate = sourcematch("^")
  509. # check remaining characters
  510. while True:
  511. this = sourceget()
  512. if this is None:
  513. raise source.error("unterminated character set",
  514. source.tell() - here)
  515. if this == "]" and set:
  516. break
  517. elif this[0] == "\\":
  518. code1 = _class_escape(source, this)
  519. else:
  520. if set and this in '-&~|' and source.next == this:
  521. import warnings
  522. warnings.warn(
  523. 'Possible set %s at position %d' % (
  524. 'difference' if this == '-' else
  525. 'intersection' if this == '&' else
  526. 'symmetric difference' if this == '~' else
  527. 'union',
  528. source.tell() - 1),
  529. FutureWarning, stacklevel=nested + 6
  530. )
  531. code1 = LITERAL, _ord(this)
  532. if sourcematch("-"):
  533. # potential range
  534. that = sourceget()
  535. if that is None:
  536. raise source.error("unterminated character set",
  537. source.tell() - here)
  538. if that == "]":
  539. if code1[0] is IN:
  540. code1 = code1[1][0]
  541. setappend(code1)
  542. setappend((LITERAL, _ord("-")))
  543. break
  544. if that[0] == "\\":
  545. code2 = _class_escape(source, that)
  546. else:
  547. if that == '-':
  548. import warnings
  549. warnings.warn(
  550. 'Possible set difference at position %d' % (
  551. source.tell() - 2),
  552. FutureWarning, stacklevel=nested + 6
  553. )
  554. code2 = LITERAL, _ord(that)
  555. if code1[0] != LITERAL or code2[0] != LITERAL:
  556. msg = "bad character range %s-%s" % (this, that)
  557. raise source.error(msg, len(this) + 1 + len(that))
  558. lo = code1[1]
  559. hi = code2[1]
  560. if hi < lo:
  561. msg = "bad character range %s-%s" % (this, that)
  562. raise source.error(msg, len(this) + 1 + len(that))
  563. setappend((RANGE, (lo, hi)))
  564. else:
  565. if code1[0] is IN:
  566. code1 = code1[1][0]
  567. setappend(code1)
  568. set = _uniq(set)
  569. # XXX: <fl> should move set optimization to compiler!
  570. if _len(set) == 1 and set[0][0] is LITERAL:
  571. # optimization
  572. if negate:
  573. subpatternappend((NOT_LITERAL, set[0][1]))
  574. else:
  575. subpatternappend(set[0])
  576. else:
  577. if negate:
  578. set.insert(0, (NEGATE, None))
  579. # charmap optimization can't be added here because
  580. # global flags still are not known
  581. subpatternappend((IN, set))
  582. elif this in REPEAT_CHARS:
  583. # repeat previous item
  584. here = source.tell()
  585. if this == "?":
  586. min, max = 0, 1
  587. elif this == "*":
  588. min, max = 0, MAXREPEAT
  589. elif this == "+":
  590. min, max = 1, MAXREPEAT
  591. elif this == "{":
  592. if source.next == "}":
  593. subpatternappend((LITERAL, _ord(this)))
  594. continue
  595. min, max = 0, MAXREPEAT
  596. lo = hi = ""
  597. while source.next in DIGITS:
  598. lo += sourceget()
  599. if sourcematch(","):
  600. while source.next in DIGITS:
  601. hi += sourceget()
  602. else:
  603. hi = lo
  604. if not sourcematch("}"):
  605. subpatternappend((LITERAL, _ord(this)))
  606. source.seek(here)
  607. continue
  608. if lo:
  609. min = int(lo)
  610. if min >= MAXREPEAT:
  611. raise OverflowError("the repetition number is too large")
  612. if hi:
  613. max = int(hi)
  614. if max >= MAXREPEAT:
  615. raise OverflowError("the repetition number is too large")
  616. if max < min:
  617. raise source.error("min repeat greater than max repeat",
  618. source.tell() - here)
  619. else:
  620. raise AssertionError("unsupported quantifier %r" % (char,))
  621. # figure out which item to repeat
  622. if subpattern:
  623. item = subpattern[-1:]
  624. else:
  625. item = None
  626. if not item or item[0][0] is AT:
  627. raise source.error("nothing to repeat",
  628. source.tell() - here + len(this))
  629. if item[0][0] in _REPEATCODES:
  630. raise source.error("multiple repeat",
  631. source.tell() - here + len(this))
  632. if item[0][0] is SUBPATTERN:
  633. group, add_flags, del_flags, p = item[0][1]
  634. if group is None and not add_flags and not del_flags:
  635. item = p
  636. if sourcematch("?"):
  637. subpattern[-1] = (MIN_REPEAT, (min, max, item))
  638. else:
  639. subpattern[-1] = (MAX_REPEAT, (min, max, item))
  640. elif this == ".":
  641. subpatternappend((ANY, None))
  642. elif this == "(":
  643. start = source.tell() - 1
  644. group = True
  645. name = None
  646. add_flags = 0
  647. del_flags = 0
  648. if sourcematch("?"):
  649. # options
  650. char = sourceget()
  651. if char is None:
  652. raise source.error("unexpected end of pattern")
  653. if char == "P":
  654. # python extensions
  655. if sourcematch("<"):
  656. # named group: skip forward to end of name
  657. name = source.getuntil(">", "group name")
  658. if not name.isidentifier():
  659. msg = "bad character in group name %r" % name
  660. raise source.error(msg, len(name) + 1)
  661. elif sourcematch("="):
  662. # named backreference
  663. name = source.getuntil(")", "group name")
  664. if not name.isidentifier():
  665. msg = "bad character in group name %r" % name
  666. raise source.error(msg, len(name) + 1)
  667. gid = state.groupdict.get(name)
  668. if gid is None:
  669. msg = "unknown group name %r" % name
  670. raise source.error(msg, len(name) + 1)
  671. if not state.checkgroup(gid):
  672. raise source.error("cannot refer to an open group",
  673. len(name) + 1)
  674. state.checklookbehindgroup(gid, source)
  675. subpatternappend((GROUPREF, gid))
  676. continue
  677. else:
  678. char = sourceget()
  679. if char is None:
  680. raise source.error("unexpected end of pattern")
  681. raise source.error("unknown extension ?P" + char,
  682. len(char) + 2)
  683. elif char == ":":
  684. # non-capturing group
  685. group = None
  686. elif char == "#":
  687. # comment
  688. while True:
  689. if source.next is None:
  690. raise source.error("missing ), unterminated comment",
  691. source.tell() - start)
  692. if sourceget() == ")":
  693. break
  694. continue
  695. elif char in "=!<":
  696. # lookahead assertions
  697. dir = 1
  698. if char == "<":
  699. char = sourceget()
  700. if char is None:
  701. raise source.error("unexpected end of pattern")
  702. if char not in "=!":
  703. raise source.error("unknown extension ?<" + char,
  704. len(char) + 2)
  705. dir = -1 # lookbehind
  706. lookbehindgroups = state.lookbehindgroups
  707. if lookbehindgroups is None:
  708. state.lookbehindgroups = state.groups
  709. p = _parse_sub(source, state, verbose, nested + 1)
  710. if dir < 0:
  711. if lookbehindgroups is None:
  712. state.lookbehindgroups = None
  713. if not sourcematch(")"):
  714. raise source.error("missing ), unterminated subpattern",
  715. source.tell() - start)
  716. if char == "=":
  717. subpatternappend((ASSERT, (dir, p)))
  718. else:
  719. subpatternappend((ASSERT_NOT, (dir, p)))
  720. continue
  721. elif char == "(":
  722. # conditional backreference group
  723. condname = source.getuntil(")", "group name")
  724. if condname.isidentifier():
  725. condgroup = state.groupdict.get(condname)
  726. if condgroup is None:
  727. msg = "unknown group name %r" % condname
  728. raise source.error(msg, len(condname) + 1)
  729. else:
  730. try:
  731. condgroup = int(condname)
  732. if condgroup < 0:
  733. raise ValueError
  734. except ValueError:
  735. msg = "bad character in group name %r" % condname
  736. raise source.error(msg, len(condname) + 1) from None
  737. if not condgroup:
  738. raise source.error("bad group number",
  739. len(condname) + 1)
  740. if condgroup >= MAXGROUPS:
  741. msg = "invalid group reference %d" % condgroup
  742. raise source.error(msg, len(condname) + 1)
  743. if condgroup not in state.grouprefpos:
  744. state.grouprefpos[condgroup] = (
  745. source.tell() - len(condname) - 1
  746. )
  747. state.checklookbehindgroup(condgroup, source)
  748. item_yes = _parse(source, state, verbose, nested + 1)
  749. if source.match("|"):
  750. item_no = _parse(source, state, verbose, nested + 1)
  751. if source.next == "|":
  752. raise source.error("conditional backref with more than two branches")
  753. else:
  754. item_no = None
  755. if not source.match(")"):
  756. raise source.error("missing ), unterminated subpattern",
  757. source.tell() - start)
  758. subpatternappend((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
  759. continue
  760. elif char in FLAGS or char == "-":
  761. # flags
  762. flags = _parse_flags(source, state, char)
  763. if flags is None: # global flags
  764. if not first or subpattern:
  765. import warnings
  766. warnings.warn(
  767. 'Flags not at the start of the expression %r%s'
  768. ' but at position %d' % (
  769. source.string[:20], # truncate long regexes
  770. ' (truncated)' if len(source.string) > 20 else '',
  771. start,
  772. ),
  773. DeprecationWarning, stacklevel=nested + 6
  774. )
  775. if (state.flags & SRE_FLAG_VERBOSE) and not verbose:
  776. raise Verbose
  777. continue
  778. add_flags, del_flags = flags
  779. group = None
  780. else:
  781. raise source.error("unknown extension ?" + char,
  782. len(char) + 1)
  783. # parse group contents
  784. if group is not None:
  785. try:
  786. group = state.opengroup(name)
  787. except error as err:
  788. raise source.error(err.msg, len(name) + 1) from None
  789. sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and
  790. not (del_flags & SRE_FLAG_VERBOSE))
  791. p = _parse_sub(source, state, sub_verbose, nested + 1)
  792. if not source.match(")"):
  793. raise source.error("missing ), unterminated subpattern",
  794. source.tell() - start)
  795. if group is not None:
  796. state.closegroup(group, p)
  797. subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p)))
  798. elif this == "^":
  799. subpatternappend((AT, AT_BEGINNING))
  800. elif this == "$":
  801. subpatternappend((AT, AT_END))
  802. else:
  803. raise AssertionError("unsupported special character %r" % (char,))
  804. # unpack non-capturing groups
  805. for i in range(len(subpattern))[::-1]:
  806. op, av = subpattern[i]
  807. if op is SUBPATTERN:
  808. group, add_flags, del_flags, p = av
  809. if group is None and not add_flags and not del_flags:
  810. subpattern[i: i+1] = p
  811. return subpattern
  812. def _parse_flags(source, state, char):
  813. sourceget = source.get
  814. add_flags = 0
  815. del_flags = 0
  816. if char != "-":
  817. while True:
  818. flag = FLAGS[char]
  819. if source.istext:
  820. if char == 'L':
  821. msg = "bad inline flags: cannot use 'L' flag with a str pattern"
  822. raise source.error(msg)
  823. else:
  824. if char == 'u':
  825. msg = "bad inline flags: cannot use 'u' flag with a bytes pattern"
  826. raise source.error(msg)
  827. add_flags |= flag
  828. if (flag & TYPE_FLAGS) and (add_flags & TYPE_FLAGS) != flag:
  829. msg = "bad inline flags: flags 'a', 'u' and 'L' are incompatible"
  830. raise source.error(msg)
  831. char = sourceget()
  832. if char is None:
  833. raise source.error("missing -, : or )")
  834. if char in ")-:":
  835. break
  836. if char not in FLAGS:
  837. msg = "unknown flag" if char.isalpha() else "missing -, : or )"
  838. raise source.error(msg, len(char))
  839. if char == ")":
  840. state.flags |= add_flags
  841. return None
  842. if add_flags & GLOBAL_FLAGS:
  843. raise source.error("bad inline flags: cannot turn on global flag", 1)
  844. if char == "-":
  845. char = sourceget()
  846. if char is None:
  847. raise source.error("missing flag")
  848. if char not in FLAGS:
  849. msg = "unknown flag" if char.isalpha() else "missing flag"
  850. raise source.error(msg, len(char))
  851. while True:
  852. flag = FLAGS[char]
  853. if flag & TYPE_FLAGS:
  854. msg = "bad inline flags: cannot turn off flags 'a', 'u' and 'L'"
  855. raise source.error(msg)
  856. del_flags |= flag
  857. char = sourceget()
  858. if char is None:
  859. raise source.error("missing :")
  860. if char == ":":
  861. break
  862. if char not in FLAGS:
  863. msg = "unknown flag" if char.isalpha() else "missing :"
  864. raise source.error(msg, len(char))
  865. assert char == ":"
  866. if del_flags & GLOBAL_FLAGS:
  867. raise source.error("bad inline flags: cannot turn off global flag", 1)
  868. if add_flags & del_flags:
  869. raise source.error("bad inline flags: flag turned on and off", 1)
  870. return add_flags, del_flags
  871. def fix_flags(src, flags):
  872. # Check and fix flags according to the type of pattern (str or bytes)
  873. if isinstance(src, str):
  874. if flags & SRE_FLAG_LOCALE:
  875. raise ValueError("cannot use LOCALE flag with a str pattern")
  876. if not flags & SRE_FLAG_ASCII:
  877. flags |= SRE_FLAG_UNICODE
  878. elif flags & SRE_FLAG_UNICODE:
  879. raise ValueError("ASCII and UNICODE flags are incompatible")
  880. else:
  881. if flags & SRE_FLAG_UNICODE:
  882. raise ValueError("cannot use UNICODE flag with a bytes pattern")
  883. if flags & SRE_FLAG_LOCALE and flags & SRE_FLAG_ASCII:
  884. raise ValueError("ASCII and LOCALE flags are incompatible")
  885. return flags
  886. def parse(str, flags=0, state=None):
  887. # parse 're' pattern into list of (opcode, argument) tuples
  888. source = Tokenizer(str)
  889. if state is None:
  890. state = State()
  891. state.flags = flags
  892. state.str = str
  893. try:
  894. p = _parse_sub(source, state, flags & SRE_FLAG_VERBOSE, 0)
  895. except Verbose:
  896. # the VERBOSE flag was switched on inside the pattern. to be
  897. # on the safe side, we'll parse the whole thing again...
  898. state = State()
  899. state.flags = flags | SRE_FLAG_VERBOSE
  900. state.str = str
  901. source.seek(0)
  902. p = _parse_sub(source, state, True, 0)
  903. p.state.flags = fix_flags(str, p.state.flags)
  904. if source.next is not None:
  905. assert source.next == ")"
  906. raise source.error("unbalanced parenthesis")
  907. for g in p.state.grouprefpos:
  908. if g >= p.state.groups:
  909. msg = "invalid group reference %d" % g
  910. raise error(msg, str, p.state.grouprefpos[g])
  911. if flags & SRE_FLAG_DEBUG:
  912. p.dump()
  913. return p
  914. def parse_template(source, state):
  915. # parse 're' replacement string into list of literals and
  916. # group references
  917. s = Tokenizer(source)
  918. sget = s.get
  919. groups = []
  920. literals = []
  921. literal = []
  922. lappend = literal.append
  923. def addgroup(index, pos):
  924. if index > state.groups:
  925. raise s.error("invalid group reference %d" % index, pos)
  926. if literal:
  927. literals.append(''.join(literal))
  928. del literal[:]
  929. groups.append((len(literals), index))
  930. literals.append(None)
  931. groupindex = state.groupindex
  932. while True:
  933. this = sget()
  934. if this is None:
  935. break # end of replacement string
  936. if this[0] == "\\":
  937. # group
  938. c = this[1]
  939. if c == "g":
  940. name = ""
  941. if not s.match("<"):
  942. raise s.error("missing <")
  943. name = s.getuntil(">", "group name")
  944. if name.isidentifier():
  945. try:
  946. index = groupindex[name]
  947. except KeyError:
  948. raise IndexError("unknown group name %r" % name)
  949. else:
  950. try:
  951. index = int(name)
  952. if index < 0:
  953. raise ValueError
  954. except ValueError:
  955. raise s.error("bad character in group name %r" % name,
  956. len(name) + 1) from None
  957. if index >= MAXGROUPS:
  958. raise s.error("invalid group reference %d" % index,
  959. len(name) + 1)
  960. addgroup(index, len(name) + 1)
  961. elif c == "0":
  962. if s.next in OCTDIGITS:
  963. this += sget()
  964. if s.next in OCTDIGITS:
  965. this += sget()
  966. lappend(chr(int(this[1:], 8) & 0xff))
  967. elif c in DIGITS:
  968. isoctal = False
  969. if s.next in DIGITS:
  970. this += sget()
  971. if (c in OCTDIGITS and this[2] in OCTDIGITS and
  972. s.next in OCTDIGITS):
  973. this += sget()
  974. isoctal = True
  975. c = int(this[1:], 8)
  976. if c > 0o377:
  977. raise s.error('octal escape value %s outside of '
  978. 'range 0-0o377' % this, len(this))
  979. lappend(chr(c))
  980. if not isoctal:
  981. addgroup(int(this[1:]), len(this) - 1)
  982. else:
  983. try:
  984. this = chr(ESCAPES[this][1])
  985. except KeyError:
  986. if c in ASCIILETTERS:
  987. raise s.error('bad escape %s' % this, len(this))
  988. lappend(this)
  989. else:
  990. lappend(this)
  991. if literal:
  992. literals.append(''.join(literal))
  993. if not isinstance(source, str):
  994. # The tokenizer implicitly decodes bytes objects as latin-1, we must
  995. # therefore re-encode the final representation.
  996. literals = [None if s is None else s.encode('latin-1') for s in literals]
  997. return groups, literals
  998. def expand_template(template, match):
  999. g = match.group
  1000. empty = match.string[:0]
  1001. groups, literals = template
  1002. literals = literals[:]
  1003. try:
  1004. for index, group in groups:
  1005. literals[index] = g(group) or empty
  1006. except IndexError:
  1007. raise error("invalid group reference %d" % index)
  1008. return empty.join(literals)