# -*- coding: utf-8 -*-

# Copyright (c) 2013, Mahmoud Hashemi
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above
#   copyright notice, this list of conditions and the following
#   disclaimer in the documentation and/or other materials provided
#   with the distribution.
#
# * The names of the contributors may not be used to endorse or
#   promote products derived from this software without specific
#   prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

"""So much practical programming involves string manipulation, which
Python readily accommodates. Still, there are dozens of basic and
common capabilities missing from the standard library, several of them
provided by ``strutils``.
"""

from __future__ import print_function

import re
import sys
import uuid
import zlib
import string
import unicodedata
import collections

from gzip import GzipFile

try:
    from cStringIO import cStringIO as StringIO
except ImportError:
    from io import BytesIO as StringIO

try:
    from collections.abc import Mapping
except ImportError:
    from collections import Mapping

try:
    unicode, str, bytes, basestring = unicode, str, str, basestring
    from HTMLParser import HTMLParser
    import htmlentitydefs
except NameError:  # basestring not defined in Python 3
    unicode, str, bytes, basestring = str, str, bytes, (str, bytes)
    unichr = chr
    from html.parser import HTMLParser
    from html import entities as htmlentitydefs

try:
    import __builtin__ as builtins
except ImportError:
    import builtins

__all__ = ['camel2under', 'under2camel', 'slugify', 'split_punct_ws',
           'unit_len', 'ordinalize', 'cardinalize', 'pluralize', 'singularize',
           'asciify', 'is_ascii', 'is_uuid', 'html2text', 'strip_ansi',
           'bytes2human', 'find_hashtags', 'a10n', 'gzip_bytes', 'gunzip_bytes',
           'iter_splitlines', 'indent', 'escape_shell_args',
           'args2cmd', 'args2sh', 'parse_int_list', 'format_int_list',
           'complement_int_list', 'int_ranges_from_int_list', 'MultiReplace',
           'multi_replace', 'unwrap_text']


_punct_ws_str = string.punctuation + string.whitespace
_punct_re = re.compile('[' + _punct_ws_str + ']+')
_camel2under_re = re.compile('((?<=[a-z0-9])[A-Z]|(?!^)[A-Z](?=[a-z]))')


def camel2under(camel_string):
    """Converts a camelcased string to underscores. Useful for turning a
    class name into a function name.

    >>> camel2under('BasicParseTest')
    'basic_parse_test'
    """
    return _camel2under_re.sub(r'_\1', camel_string).lower()


def under2camel(under_string):
    """Converts an underscored string to camelcased. Useful for turning a
    function name into a class name.

    >>> under2camel('complex_tokenizer')
    'ComplexTokenizer'
    """
    return ''.join(w.capitalize() or '_' for w in under_string.split('_'))


def slugify(text, delim='_', lower=True, ascii=False):
    """
    A basic function that turns text full of scary characters
    (i.e., punctuation and whitespace), into a relatively safe
    lowercased string separated only by the delimiter specified
    by *delim*, which defaults to ``_``.

    The *ascii* convenience flag will :func:`asciify` the slug if
    you require ascii-only slugs.

    >>> slugify('First post! Hi!!!!~1 ')
    'first_post_hi_1'

    >>> slugify("Kurt Gödel's pretty cool.", ascii=True) == \
    ...     b'kurt_goedel_s_pretty_cool'
    True
    """
    ret = delim.join(split_punct_ws(text)) or delim if text else ''
    if ascii:
        ret = asciify(ret)
    if lower:
        ret = ret.lower()
    return ret


def split_punct_ws(text):
    """While :meth:`str.split` will split on whitespace,
    :func:`split_punct_ws` will split on punctuation and
    whitespace. This is used internally by :func:`slugify`, above.

    >>> split_punct_ws('First post! Hi!!!!~1 ')
    ['First', 'post', 'Hi', '1']
    """
    return [w for w in _punct_re.split(text) if w]


def unit_len(sized_iterable, unit_noun='item'):  # TODO: len_units()/unitize()?
    """Returns a plain-English description of an iterable's
    :func:`len()`, conditionally pluralized with :func:`cardinalize`,
    detailed below.

    >>> print(unit_len(range(10), 'number'))
    10 numbers
    >>> print(unit_len('aeiou', 'vowel'))
    5 vowels
    >>> print(unit_len([], 'worry'))
    No worries
    """
    count = len(sized_iterable)
    units = cardinalize(unit_noun, count)
    if count:
        return u'%s %s' % (count, units)
    return u'No %s' % (units,)


_ORDINAL_MAP = {'1': 'st',
                '2': 'nd',
                '3': 'rd'}  # 'th' is the default


def ordinalize(number, ext_only=False):
    """Turns *number* into its ordinal form, i.e., 1st, 2nd,
    3rd, 4th, etc. If the last character isn't a digit, it returns the
    string value unchanged.

    Args:
        number (int or str): Number to be ordinalized.
        ext_only (bool): Whether to return only the suffix. Default ``False``.

    >>> print(ordinalize(1))
    1st
    >>> print(ordinalize(3694839230))
    3694839230th
    >>> print(ordinalize('hi'))
    hi
    >>> print(ordinalize(1515))
    1515th
    """
    numstr, ext = unicode(number), ''
    if numstr and numstr[-1] in string.digits:
        try:
            # first check for teens
            if numstr[-2] == '1':
                ext = 'th'
            else:
                # all other cases
                ext = _ORDINAL_MAP.get(numstr[-1], 'th')
        except IndexError:
            # single digit numbers (will reach here based on [-2] above)
            ext = _ORDINAL_MAP.get(numstr[-1], 'th')
    if ext_only:
        return ext
    else:
        return numstr + ext


def cardinalize(unit_noun, count):
    """Conditionally pluralizes a singular word *unit_noun* if
    *count* is not one, preserving case when possible.

    >>> vowels = 'aeiou'
    >>> print(len(vowels), cardinalize('vowel', len(vowels)))
    5 vowels
    >>> print(3, cardinalize('Wish', 3))
    3 Wishes
    """
    if count == 1:
        return unit_noun
    return pluralize(unit_noun)


def singularize(word):
    """Semi-intelligently converts an English plural *word* to its
    singular form, preserving case pattern.

    >>> singularize('chances')
    'chance'
    >>> singularize('Activities')
    'Activity'
    >>> singularize('Glasses')
    'Glass'
    >>> singularize('FEET')
    'FOOT'
    """
    orig_word, word = word, word.strip().lower()
    if not word or word in _IRR_S2P:
        return orig_word

    irr_singular = _IRR_P2S.get(word)
    if irr_singular:
        singular = irr_singular
    elif not word.endswith('s'):
        return orig_word
    elif len(word) == 2:
        singular = word[:-1]  # or just return word?
    elif word.endswith('ies') and word[-4:-3] not in 'aeiou':
        singular = word[:-3] + 'y'
    elif word.endswith('es') and word[-3] == 's':
        singular = word[:-2]
    else:
        singular = word[:-1]
    return _match_case(orig_word, singular)


def pluralize(word):
    """Semi-intelligently converts an English *word* from singular form to
    plural, preserving case pattern.

    >>> pluralize('friend')
    'friends'
    >>> pluralize('enemy')
    'enemies'
    >>> pluralize('Sheep')
    'Sheep'
    """
    orig_word, word = word, word.strip().lower()
    if not word or word in _IRR_P2S:
        return orig_word
    irr_plural = _IRR_S2P.get(word)
    if irr_plural:
        plural = irr_plural
    elif word.endswith('y') and word[-2:-1] not in 'aeiou':
        plural = word[:-1] + 'ies'
    elif word[-1] == 's' or word.endswith('ch') or word.endswith('sh'):
        plural = word if word.endswith('es') else word + 'es'
    else:
        plural = word + 's'
    return _match_case(orig_word, plural)


def _match_case(master, disciple):
    if not master.strip():
        return disciple
    if master.lower() == master:
        return disciple.lower()
    elif master.upper() == master:
        return disciple.upper()
    elif master.title() == master:
        return disciple.title()
    return disciple


# Singular to plural map of irregular pluralizations
_IRR_S2P = {'addendum': 'addenda', 'alga': 'algae', 'alumna': 'alumnae',
            'alumnus': 'alumni', 'analysis': 'analyses', 'antenna': 'antennae',
            'appendix': 'appendices', 'axis': 'axes', 'bacillus': 'bacilli',
            'bacterium': 'bacteria', 'basis': 'bases', 'beau': 'beaux',
            'bison': 'bison', 'bureau': 'bureaus', 'cactus': 'cacti',
            'calf': 'calves', 'child': 'children', 'corps': 'corps',
            'corpus': 'corpora', 'crisis': 'crises', 'criterion': 'criteria',
            'curriculum': 'curricula', 'datum': 'data', 'deer': 'deer',
            'diagnosis': 'diagnoses', 'die': 'dice', 'dwarf': 'dwarves',
            'echo': 'echoes', 'elf': 'elves', 'ellipsis': 'ellipses',
            'embargo': 'embargoes', 'emphasis': 'emphases', 'erratum': 'errata',
            'fireman': 'firemen', 'fish': 'fish', 'focus': 'foci',
            'foot': 'feet', 'formula': 'formulas',
            'fungus': 'fungi', 'genus': 'genera', 'goose': 'geese',
            'half': 'halves', 'hero': 'heroes', 'hippopotamus': 'hippopotami',
            'hoof': 'hooves', 'hypothesis': 'hypotheses', 'index': 'indices',
            'knife': 'knives', 'leaf': 'leaves', 'life': 'lives',
            'loaf': 'loaves', 'louse': 'lice', 'man': 'men',
            'matrix': 'matrices', 'means': 'means', 'medium': 'media',
            'memorandum': 'memoranda', 'millennium': 'millennia', 'moose': 'moose',
            'mosquito': 'mosquitoes', 'mouse': 'mice', 'nebula': 'nebulae',
            'neurosis': 'neuroses', 'nucleus': 'nuclei', 'oasis': 'oases',
            'octopus': 'octopi', 'offspring': 'offspring', 'ovum': 'ova',
            'ox': 'oxen', 'paralysis': 'paralyses', 'parenthesis': 'parentheses',
            'person': 'people', 'phenomenon': 'phenomena', 'potato': 'potatoes',
            'radius': 'radii', 'scarf': 'scarves', 'scissors': 'scissors',
            'self': 'selves', 'sense': 'senses', 'series': 'series', 'sheep':
            'sheep', 'shelf': 'shelves', 'species': 'species', 'stimulus':
            'stimuli', 'stratum': 'strata', 'syllabus': 'syllabi', 'symposium':
            'symposia', 'synopsis': 'synopses', 'synthesis': 'syntheses',
            'tableau': 'tableaux', 'that': 'those', 'thesis': 'theses',
            'thief': 'thieves', 'this': 'these', 'tomato': 'tomatoes', 'tooth':
            'teeth', 'torpedo': 'torpedoes', 'vertebra': 'vertebrae', 'veto':
            'vetoes', 'vita': 'vitae', 'watch': 'watches', 'wife': 'wives',
            'wolf': 'wolves', 'woman': 'women'}


# Reverse index of the above
_IRR_P2S = dict([(v, k) for k, v in _IRR_S2P.items()])

HASHTAG_RE = re.compile(r"(?:^|\s)[＃#]{1}(\w+)", re.UNICODE)


def find_hashtags(string):
    """Finds and returns all hashtags in a string, with the hashmark
    removed. Supports full-width hashmarks for Asian languages and
    does not false-positive on URL anchors.

    >>> find_hashtags('#atag http://asite/#ananchor')
    ['atag']

    ``find_hashtags`` also works with unicode hashtags.
    """
    # the following works, doctest just struggles with it
    # >>> find_hashtags(u"can't get enough of that dignity chicken #肯德基 woo")
    # [u'\u80af\u5fb7\u57fa']
    return HASHTAG_RE.findall(string)


def a10n(string):
    """That thing where "internationalization" becomes "i18n", what's it
    called? Abbreviation? Oh wait, no: ``a10n``. (It's actually a form
    of `numeronym`_.)

    >>> a10n('abbreviation')
    'a10n'
    >>> a10n('internationalization')
    'i18n'
    >>> a10n('')
    ''

    .. _numeronym: http://en.wikipedia.org/wiki/Numeronym
    """
    if len(string) < 3:
        return string
    return '%s%s%s' % (string[0], len(string[1:-1]), string[-1])


# Based on https://en.wikipedia.org/wiki/ANSI_escape_code#Escape_sequences
ANSI_SEQUENCES = re.compile(r'''
    \x1B            # Sequence starts with ESC, i.e. hex 0x1B
    (?:
        [@-Z\\-_]   # Second byte:
                    #   all 0x40–0x5F range but CSI char, i.e. ASCII @A–Z\]^_
    |               # Or
        \[          # CSI sequences, starting with [
        [0-?]*      # Parameter bytes:
                    #   range 0x30–0x3F, ASCII 0–9:;<=>?
        [ -/]*      # Intermediate bytes:
                    #   range 0x20–0x2F, ASCII space and !"#$%&'()*+,-./
        [@-~]       # Final byte
                    #   range 0x40–0x7E, ASCII @A–Z[\]^_`a–z{|}~
    )
''', re.VERBOSE)


def strip_ansi(text):
    """Strips ANSI escape codes from *text*. Useful for the occasional
    time when a log or redirected output accidentally captures console
    color codes and the like.

    >>> strip_ansi('\x1b[0m\x1b[1;36mart\x1b[46;34m')
    'art'

    Supports unicode, str, bytes and bytearray content as input. Returns the
    same type as the input.

    There's a lot of ANSI art available for testing on `sixteencolors.net`_.
    This function does not interpret or render ANSI art, but you can do so with
    `ansi2img`_ or `escapes.js`_.

    .. _sixteencolors.net: http://sixteencolors.net
    .. _ansi2img: http://www.bedroomlan.org/projects/ansi2img
    .. _escapes.js: https://github.com/atdt/escapes.js
    """
    # TODO: move to cliutils.py

    # Transform any ASCII-like content to unicode to allow regex to
    # match, and save input type for later.
    target_type = None
    # Unicode type aliased to str is code-smell for Boltons in Python 3 env.
    is_py3 = (unicode == builtins.str)
    if is_py3 and isinstance(text, (bytes, bytearray)):
        target_type = type(text)
        text = text.decode('utf-8')

    cleaned = ANSI_SEQUENCES.sub('', text)

    # Transform the result back to the same bytes/bytearray type provided
    # by the user.
    if target_type and target_type != type(cleaned):
        cleaned = target_type(cleaned, 'utf-8')

    return cleaned
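
# An informal example of the type-preserving behavior described above (the
# input bytes here are just illustrative, and this is not run as a doctest):
#
# >>> strip_ansi(b'\x1b[1;32mgreen\x1b[0m') == b'green'
# True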


def asciify(text, ignore=False):
    """Converts a unicode or bytestring, *text*, into a bytestring with
    just ascii characters. Performs basic deaccenting for all you
    Europhiles out there.

    Also, a gentle reminder that this is a **utility**, primarily meant
    for slugification. Whenever possible, make your application work
    **with** unicode, not against it.

    Args:
        text (str or unicode): The string to be asciified.
        ignore (bool): Configures final encoding to ignore remaining
            unasciified unicode instead of replacing it.

    >>> asciify('Beyoncé') == b'Beyonce'
    True
    """
    try:
        try:
            return text.encode('ascii')
        except UnicodeDecodeError:
            # this usually means you passed in a non-unicode string
            text = text.decode('utf-8')
            return text.encode('ascii')
    except UnicodeEncodeError:
        mode = 'replace'
        if ignore:
            mode = 'ignore'
        transd = unicodedata.normalize('NFKD', text.translate(DEACCENT_MAP))
        ret = transd.encode('ascii', mode)
        return ret


def is_ascii(text):
    """Check if a unicode or bytestring, *text*, is composed of ascii
    characters only. Raises :exc:`ValueError` if argument is not text.

    Args:
        text (str or unicode): The string to be checked.

    >>> is_ascii('Beyoncé')
    False
    >>> is_ascii('Beyonce')
    True
    """
    if isinstance(text, unicode):
        try:
            text.encode('ascii')
        except UnicodeEncodeError:
            return False
    elif isinstance(text, bytes):
        try:
            text.decode('ascii')
        except UnicodeDecodeError:
            return False
    else:
        raise ValueError('expected text or bytes, not %r' % type(text))
    return True


class DeaccenterDict(dict):
    "A small caching dictionary for deaccenting."
    def __missing__(self, key):
        ch = self.get(key)
        if ch is not None:
            return ch
        try:
            de = unicodedata.decomposition(unichr(key))
            p1, _, p2 = de.rpartition(' ')
            if int(p2, 16) == 0x308:
                ch = self.get(key)
            else:
                ch = int(p1, 16)
        except (IndexError, ValueError):
            ch = self.get(key, key)
        self[key] = ch
        return ch

    try:
        from collections import defaultdict
    except ImportError:
        # no defaultdict means that __missing__ isn't supported in
        # this version of python, so we define __getitem__
        def __getitem__(self, key):
            try:
                return super(DeaccenterDict, self).__getitem__(key)
            except KeyError:
                return self.__missing__(key)
    else:
        del defaultdict


# http://chmullig.com/2009/12/python-unicode-ascii-ifier/
# For something more complete, investigate the unidecode
# or isounidecode packages, which are capable of performing
# crude transliteration.
_BASE_DEACCENT_MAP = {
    0xc6: u"AE",   # Æ LATIN CAPITAL LETTER AE
    0xd0: u"D",    # Ð LATIN CAPITAL LETTER ETH
    0xd8: u"OE",   # Ø LATIN CAPITAL LETTER O WITH STROKE
    0xde: u"Th",   # Þ LATIN CAPITAL LETTER THORN
    0xc4: u'Ae',   # Ä LATIN CAPITAL LETTER A WITH DIAERESIS
    0xd6: u'Oe',   # Ö LATIN CAPITAL LETTER O WITH DIAERESIS
    0xdc: u'Ue',   # Ü LATIN CAPITAL LETTER U WITH DIAERESIS
    0xc0: u"A",    # À LATIN CAPITAL LETTER A WITH GRAVE
    0xc1: u"A",    # Á LATIN CAPITAL LETTER A WITH ACUTE
    0xc3: u"A",    # Ã LATIN CAPITAL LETTER A WITH TILDE
    0xc7: u"C",    # Ç LATIN CAPITAL LETTER C WITH CEDILLA
    0xc8: u"E",    # È LATIN CAPITAL LETTER E WITH GRAVE
    0xc9: u"E",    # É LATIN CAPITAL LETTER E WITH ACUTE
    0xca: u"E",    # Ê LATIN CAPITAL LETTER E WITH CIRCUMFLEX
    0xcc: u"I",    # Ì LATIN CAPITAL LETTER I WITH GRAVE
    0xcd: u"I",    # Í LATIN CAPITAL LETTER I WITH ACUTE
    0xd2: u"O",    # Ò LATIN CAPITAL LETTER O WITH GRAVE
    0xd3: u"O",    # Ó LATIN CAPITAL LETTER O WITH ACUTE
    0xd5: u"O",    # Õ LATIN CAPITAL LETTER O WITH TILDE
    0xd9: u"U",    # Ù LATIN CAPITAL LETTER U WITH GRAVE
    0xda: u"U",    # Ú LATIN CAPITAL LETTER U WITH ACUTE
    0xdf: u"ss",   # ß LATIN SMALL LETTER SHARP S
    0xe6: u"ae",   # æ LATIN SMALL LETTER AE
    0xf0: u"d",    # ð LATIN SMALL LETTER ETH
    0xf8: u"oe",   # ø LATIN SMALL LETTER O WITH STROKE
    0xfe: u"th",   # þ LATIN SMALL LETTER THORN
    0xe4: u'ae',   # ä LATIN SMALL LETTER A WITH DIAERESIS
    0xf6: u'oe',   # ö LATIN SMALL LETTER O WITH DIAERESIS
    0xfc: u'ue',   # ü LATIN SMALL LETTER U WITH DIAERESIS
    0xe0: u"a",    # à LATIN SMALL LETTER A WITH GRAVE
    0xe1: u"a",    # á LATIN SMALL LETTER A WITH ACUTE
    0xe3: u"a",    # ã LATIN SMALL LETTER A WITH TILDE
    0xe7: u"c",    # ç LATIN SMALL LETTER C WITH CEDILLA
    0xe8: u"e",    # è LATIN SMALL LETTER E WITH GRAVE
    0xe9: u"e",    # é LATIN SMALL LETTER E WITH ACUTE
    0xea: u"e",    # ê LATIN SMALL LETTER E WITH CIRCUMFLEX
    0xec: u"i",    # ì LATIN SMALL LETTER I WITH GRAVE
    0xed: u"i",    # í LATIN SMALL LETTER I WITH ACUTE
    0xf2: u"o",    # ò LATIN SMALL LETTER O WITH GRAVE
    0xf3: u"o",    # ó LATIN SMALL LETTER O WITH ACUTE
    0xf5: u"o",    # õ LATIN SMALL LETTER O WITH TILDE
    0xf9: u"u",    # ù LATIN SMALL LETTER U WITH GRAVE
    0xfa: u"u",    # ú LATIN SMALL LETTER U WITH ACUTE
    0x2018: u"'",  # ‘ LEFT SINGLE QUOTATION MARK
    0x2019: u"'",  # ’ RIGHT SINGLE QUOTATION MARK
    0x201c: u'"',  # “ LEFT DOUBLE QUOTATION MARK
    0x201d: u'"',  # ” RIGHT DOUBLE QUOTATION MARK
    }

DEACCENT_MAP = DeaccenterDict(_BASE_DEACCENT_MAP)
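
# An informal example of how the map above gets used (via str.translate,
# which is what asciify does under the hood); the input word here is just
# illustrative and this is not run as a doctest:
#
# >>> u'Märchenbücher'.translate(DEACCENT_MAP) == u'Maerchenbuecher'
# True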


_SIZE_SYMBOLS = ('B', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y')
_SIZE_BOUNDS = [(1024 ** i, sym) for i, sym in enumerate(_SIZE_SYMBOLS)]
_SIZE_RANGES = list(zip(_SIZE_BOUNDS, _SIZE_BOUNDS[1:]))


def bytes2human(nbytes, ndigits=0):
    """Turns an integer value of *nbytes* into a human readable format. Set
    *ndigits* to control how many digits after the decimal point
    should be shown (default ``0``).

    >>> bytes2human(128991)
    '126K'
    >>> bytes2human(100001221)
    '95M'
    >>> bytes2human(0, 2)
    '0.00B'
    """
    abs_bytes = abs(nbytes)
    for (size, symbol), (next_size, next_symbol) in _SIZE_RANGES:
        if abs_bytes <= next_size:
            break
    hnbytes = float(nbytes) / size
    return '{hnbytes:.{ndigits}f}{symbol}'.format(hnbytes=hnbytes,
                                                  ndigits=ndigits,
                                                  symbol=symbol)
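
# A quick illustration of *ndigits* beyond the doctests above (informal,
# not run as a doctest): 128991 / 1024 is about 125.97, so
#
# >>> bytes2human(128991, ndigits=2)
# '125.97K'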


class HTMLTextExtractor(HTMLParser):
    def __init__(self):
        self.reset()
        self.strict = False
        self.convert_charrefs = True
        self.result = []

    def handle_data(self, d):
        self.result.append(d)

    def handle_charref(self, number):
        if number[0] == u'x' or number[0] == u'X':
            codepoint = int(number[1:], 16)
        else:
            codepoint = int(number)
        self.result.append(unichr(codepoint))

    def handle_entityref(self, name):
        try:
            codepoint = htmlentitydefs.name2codepoint[name]
        except KeyError:
            self.result.append(u'&' + name + u';')
        else:
            self.result.append(unichr(codepoint))

    def get_text(self):
        return u''.join(self.result)


def html2text(html):
    """Strips tags from HTML text, returning markup-free text. Also, does
    a best effort replacement of entities like "&nbsp;"

    >>> r = html2text(u'<a href="#">Test &amp;<em>(\u0394&#x03b7;&#956;&#x03CE;)</em></a>')
    >>> r == u'Test &(\u0394\u03b7\u03bc\u03ce)'
    True
    """
    # based on answers to http://stackoverflow.com/questions/753052/
    s = HTMLTextExtractor()
    s.feed(html)
    return s.get_text()


_EMPTY_GZIP_BYTES = b'\x1f\x8b\x08\x089\xf3\xb9U\x00\x03empty\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00'
_NON_EMPTY_GZIP_BYTES = b'\x1f\x8b\x08\x08\xbc\xf7\xb9U\x00\x03not_empty\x00K\xaa,I-N\xcc\xc8\xafT\xe4\x02\x00\xf3nb\xbf\x0b\x00\x00\x00'


def gunzip_bytes(bytestring):
    """The :mod:`gzip` module is great if you have a file or file-like
    object, but what if you just have bytes? StringIO is one
    possibility, but it's often faster, easier, and simpler to just
    use this one-liner. Use this tried-and-true utility function to
    decompress gzip from bytes.

    >>> gunzip_bytes(_EMPTY_GZIP_BYTES) == b''
    True
    >>> gunzip_bytes(_NON_EMPTY_GZIP_BYTES).rstrip() == b'bytesahoy!'
    True
    """
    return zlib.decompress(bytestring, 16 + zlib.MAX_WBITS)


def gzip_bytes(bytestring, level=6):
    """Turn some bytes into some compressed bytes.

    >>> len(gzip_bytes(b'a' * 10000))
    46

    Args:
        bytestring (bytes): Bytes to be compressed
        level (int): An integer, 1-9, controlling the
            speed/compression. 1 is fastest, least compressed, 9 is
            slowest, but most compressed.

    Note that all levels of gzip are pretty fast these days, though
    it's not really a competitor in compression, at any level.
    """
    out = StringIO()
    f = GzipFile(fileobj=out, mode='wb', compresslevel=level)
    f.write(bytestring)
    f.close()
    return out.getvalue()
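
# The two functions above are inverses of each other, so an informal sanity
# check (the payload is just illustrative, and this is not run as a doctest):
#
# >>> payload = b'round-trip me'
# >>> gunzip_bytes(gzip_bytes(payload)) == payload
# True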


_line_ending_re = re.compile(r'(\r\n|\n|\x0b|\f|\r|\x85|\x2028|\x2029)',
                             re.UNICODE)


def iter_splitlines(text):
    r"""Like :meth:`str.splitlines`, but returns an iterator of lines
    instead of a list. Also similar to :meth:`file.next`, as that also
    lazily reads and yields lines from a file.

    This function works with a variety of line endings, but as always,
    be careful when mixing line endings within a file.

    >>> list(iter_splitlines('\nhi\nbye\n'))
    ['', 'hi', 'bye', '']
    >>> list(iter_splitlines('\r\nhi\rbye\r\n'))
    ['', 'hi', 'bye', '']
    >>> list(iter_splitlines(''))
    []
    """
    prev_end, len_text = 0, len(text)
    for match in _line_ending_re.finditer(text):
        start, end = match.start(1), match.end(1)
        if prev_end <= start:
            yield text[prev_end:start]
        if end == len_text:
            yield ''
        prev_end = end
    tail = text[prev_end:]
    if tail:
        yield tail
    return


def indent(text, margin, newline='\n', key=bool):
    """The missing counterpart to the built-in :func:`textwrap.dedent`.

    Args:
        text (str): The text to indent.
        margin (str): The string to prepend to each line.
        newline (str): The newline used to rejoin the lines (default: ``\\n``)
        key (callable): Called on each line to determine whether to
            indent it. Default: :class:`bool`, to ensure that empty lines do
            not get whitespace added.
    """
    indented_lines = [(margin + line if key(line) else line)
                      for line in iter_splitlines(text)]
    return newline.join(indented_lines)
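
# For example (informally, since the docstring above carries no doctest;
# the sample text is just illustrative):
#
# >>> indent('the quick brown fox\n\njumped over the lazy dog', '    ')
# '    the quick brown fox\n\n    jumped over the lazy dog'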


def is_uuid(obj, version=4):
    """Check whether the argument is either a valid UUID object or string.

    Args:
        obj (object): The test target. Strings and UUID objects supported.
        version (int): The target UUID version, set to 0 to skip version check.

    >>> is_uuid('e682ccca-5a4c-4ef2-9711-73f9ad1e15ea')
    True
    >>> is_uuid('0221f0d9-d4b9-11e5-a478-10ddb1c2feb9')
    False
    >>> is_uuid('0221f0d9-d4b9-11e5-a478-10ddb1c2feb9', version=1)
    True
    """
    if not isinstance(obj, uuid.UUID):
        try:
            obj = uuid.UUID(obj)
        except (TypeError, ValueError, AttributeError):
            return False
    if version and obj.version != int(version):
        return False
    return True


def escape_shell_args(args, sep=' ', style=None):
    """Returns an escaped version of each string in *args*, according to
    *style*.

    Args:
        args (list): A list of arguments to escape and join together
        sep (str): The separator used to join the escaped arguments.
        style (str): The style of escaping to use. Can be one of
            ``cmd`` or ``sh``, geared toward Windows and Linux/BSD/etc.,
            respectively. If *style* is ``None``, then it is picked
            according to the system platform.

    See :func:`args2cmd` and :func:`args2sh` for details and example
    output for each style.
    """
    if not style:
        style = 'cmd' if sys.platform == 'win32' else 'sh'

    if style == 'sh':
        return args2sh(args, sep=sep)
    elif style == 'cmd':
        return args2cmd(args, sep=sep)

    raise ValueError("style expected one of 'cmd' or 'sh', not %r" % style)
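
# For instance, forcing the sh style (the command here is just illustrative,
# and this is not run as a doctest):
#
# >>> escape_shell_args(['rm', '-rf', 'my file.txt'], style='sh')
# "rm -rf 'my file.txt'"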


_find_sh_unsafe = re.compile(r'[^a-zA-Z0-9_@%+=:,./-]').search


def args2sh(args, sep=' '):
    """Return a shell-escaped string version of *args*, separated by
    *sep*, based on the rules of sh, bash, and other shells in the
    Linux/BSD/MacOS ecosystem.

    >>> print(args2sh(['aa', '[bb]', "cc'cc", 'dd"dd']))
    aa '[bb]' 'cc'"'"'cc' 'dd"dd'

    As you can see, arguments with no special characters are not
    escaped, arguments with special characters are quoted with single
    quotes, and single quotes themselves are quoted with double
    quotes. Double quotes are handled like any other special
    character.

    Based on code from the :mod:`pipes`/:mod:`shlex` modules. Also
    note that :mod:`shlex` and :mod:`argparse` have functions to split
    and parse strings escaped in this manner.
    """
    ret_list = []

    for arg in args:
        if not arg:
            ret_list.append("''")
            continue
        if _find_sh_unsafe(arg) is None:
            ret_list.append(arg)
            continue
        # use single quotes, and put single quotes into double quotes
        # the string $'b is then quoted as '$'"'"'b'
        ret_list.append("'" + arg.replace("'", "'\"'\"'") + "'")

    return sep.join(ret_list)


def args2cmd(args, sep=' '):
    r"""Return a shell-escaped string version of *args*, separated by
    *sep*, using the same rules as the Microsoft C runtime.

    >>> print(args2cmd(['aa', '[bb]', "cc'cc", 'dd"dd']))
    aa [bb] cc'cc dd\"dd

    As you can see, escaping is through backslashing and not quoting,
    and double quotes are the only special character. See the comment
    in the code for more details. Based on internal code from the
    :mod:`subprocess` module.
    """
    # technique description from subprocess below
    """
    1) Arguments are delimited by white space, which is either a
       space or a tab.

    2) A string surrounded by double quotation marks is
       interpreted as a single argument, regardless of white space
       contained within. A quoted string can be embedded in an
       argument.

    3) A double quotation mark preceded by a backslash is
       interpreted as a literal double quotation mark.

    4) Backslashes are interpreted literally, unless they
       immediately precede a double quotation mark.

    5) If backslashes immediately precede a double quotation mark,
       every pair of backslashes is interpreted as a literal
       backslash. If the number of backslashes is odd, the last
       backslash escapes the next double quotation mark as
       described in rule 3.

    See http://msdn.microsoft.com/en-us/library/17w5ykft.aspx
    or search http://msdn.microsoft.com for
    "Parsing C++ Command-Line Arguments"
    """
    result = []
    needquote = False
    for arg in args:
        bs_buf = []

        # Add the separator to set this argument apart from the others
        if result:
            result.append(sep)

        needquote = (" " in arg) or ("\t" in arg) or not arg
        if needquote:
            result.append('"')

        for c in arg:
            if c == '\\':
                # Don't know if we need to double yet.
                bs_buf.append(c)
            elif c == '"':
                # Double backslashes.
                result.append('\\' * len(bs_buf) * 2)
                bs_buf = []
                result.append('\\"')
            else:
                # Normal char
                if bs_buf:
                    result.extend(bs_buf)
                    bs_buf = []
                result.append(c)

        # Add remaining backslashes, if any.
        if bs_buf:
            result.extend(bs_buf)

        if needquote:
            result.extend(bs_buf)
            result.append('"')

    return ''.join(result)
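
# Another informal example of the cmd-style rules above (illustrative only,
# not run as a doctest): whitespace triggers quoting, and a double quote is
# backslash-escaped.
#
# >>> print(args2cmd(['a b', 'c"d']))
# "a b" c\"d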


def parse_int_list(range_string, delim=',', range_delim='-'):
    """Returns a sorted list of positive integers based on
    *range_string*. Reverse of :func:`format_int_list`.

    Args:
        range_string (str): String of comma separated positive
            integers or ranges (e.g. '1,2,4-6,8'). Typical of a custom
            page range string used in printer dialogs.
        delim (char): Defaults to ','. Separates integers and
            contiguous ranges of integers.
        range_delim (char): Defaults to '-'. Indicates a contiguous
            range of integers.

    >>> parse_int_list('1,3,5-8,10-11,15')
    [1, 3, 5, 6, 7, 8, 10, 11, 15]
    """
    output = []

    for x in range_string.strip().split(delim):

        # Range
        if range_delim in x:
            range_limits = list(map(int, x.split(range_delim)))
            output += list(range(min(range_limits), max(range_limits)+1))

        # Empty String
        elif not x:
            continue

        # Integer
        else:
            output.append(int(x))

    return sorted(output)


def format_int_list(int_list, delim=',', range_delim='-', delim_space=False):
    """Returns a sorted range string from a list of positive integers
    (*int_list*). Contiguous ranges of integers are collapsed to min
    and max values. Reverse of :func:`parse_int_list`.

    Args:
        int_list (list): List of positive integers to be converted
            into a range string (e.g. [1,2,4,5,6,8]).
        delim (char): Defaults to ','. Separates integers and
            contiguous ranges of integers.
        range_delim (char): Defaults to '-'. Indicates a contiguous
            range of integers.
        delim_space (bool): Defaults to ``False``. If ``True``, adds a
            space after all *delim* characters.

    >>> format_int_list([1,3,5,6,7,8,10,11,15])
    '1,3,5-8,10-11,15'
    """
    output = []
    contig_range = collections.deque()

    for x in sorted(int_list):

        # Handle current (and first) value.
        if len(contig_range) < 1:
            contig_range.append(x)

        # Handle current value, given multiple previous values are contiguous.
        elif len(contig_range) > 1:
            delta = x - contig_range[-1]

            # Current value is contiguous.
            if delta == 1:
                contig_range.append(x)

            # Current value is non-contiguous.
            elif delta > 1:
                range_substr = '{0:d}{1}{2:d}'.format(min(contig_range),
                                                      range_delim,
                                                      max(contig_range))
                output.append(range_substr)
                contig_range.clear()
                contig_range.append(x)

            # Current value repeated.
            else:
                continue

        # Handle current value, given no previous contiguous integers
        else:
            delta = x - contig_range[0]

            # Current value is contiguous.
            if delta == 1:
                contig_range.append(x)

            # Current value is non-contiguous.
            elif delta > 1:
                output.append('{0:d}'.format(contig_range.popleft()))
                contig_range.append(x)

            # Current value repeated.
            else:
                continue

    # Handle the last value.
    else:

        # Last value is non-contiguous.
        if len(contig_range) == 1:
            output.append('{0:d}'.format(contig_range.popleft()))
            contig_range.clear()

        # Last value is part of contiguous range.
        elif len(contig_range) > 1:
            range_substr = '{0:d}{1}{2:d}'.format(min(contig_range),
                                                  range_delim,
                                                  max(contig_range))
            output.append(range_substr)
            contig_range.clear()

    if delim_space:
        output_str = (delim+' ').join(output)
    else:
        output_str = delim.join(output)

    return output_str
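
# The *delim_space* flag, which has no doctest above, simply pads the output
# (informal example, not run as a doctest):
#
# >>> format_int_list([1, 3, 5, 6, 7, 8, 10, 11, 15], delim_space=True)
# '1, 3, 5-8, 10-11, 15'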


def complement_int_list(
        range_string, range_start=0, range_end=None,
        delim=',', range_delim='-'):
    """Returns a range string that is the complement of the one provided as
    the *range_string* parameter.

    These range strings are of the kind produced by :func:`format_int_list`,
    and parseable by :func:`parse_int_list`.

    Args:
        range_string (str): String of comma separated positive integers or
            ranges (e.g. '1,2,4-6,8'). Typical of a custom page range string
            used in printer dialogs.
        range_start (int): A positive integer from which to start the resulting
            range. Value is inclusive. Defaults to ``0``.
        range_end (int): A positive integer from which the produced range is
            stopped. Value is exclusive. Defaults to the maximum value found in
            the provided ``range_string``.
        delim (char): Defaults to ','. Separates integers and contiguous ranges
            of integers.
        range_delim (char): Defaults to '-'. Indicates a contiguous range of
            integers.

    >>> complement_int_list('1,3,5-8,10-11,15')
    '0,2,4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_start=0)
    '0,2,4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_start=1)
    '2,4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_start=2)
    '2,4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_start=3)
    '4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_end=15)
    '0,2,4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_end=14)
    '0,2,4,9,12-13'

    >>> complement_int_list('1,3,5-8,10-11,15', range_end=13)
    '0,2,4,9,12'

    >>> complement_int_list('1,3,5-8,10-11,15', range_end=20)
    '0,2,4,9,12-14,16-19'

    >>> complement_int_list('1,3,5-8,10-11,15', range_end=0)
    ''

    >>> complement_int_list('1,3,5-8,10-11,15', range_start=-1)
    '0,2,4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_end=-1)
    ''

    >>> complement_int_list('1,3,5-8', range_start=1, range_end=1)
    ''

    >>> complement_int_list('1,3,5-8', range_start=2, range_end=2)
    ''

    >>> complement_int_list('1,3,5-8', range_start=2, range_end=3)
    '2'

    >>> complement_int_list('1,3,5-8', range_start=-10, range_end=-5)
    ''

    >>> complement_int_list('1,3,5-8', range_start=20, range_end=10)
    ''

    >>> complement_int_list('')
    ''
    """
    int_list = set(parse_int_list(range_string, delim, range_delim))
    if range_end is None:
        if int_list:
            range_end = max(int_list) + 1
        else:
            range_end = range_start

    complement_values = set(
        range(range_end)) - int_list - set(range(range_start))
    return format_int_list(complement_values, delim, range_delim)


def int_ranges_from_int_list(range_string, delim=',', range_delim='-'):
    """Transform a string of ranges (*range_string*) into a tuple of tuples.

    Args:
        range_string (str): String of comma separated positive integers or
            ranges (e.g. '1,2,4-6,8'). Typical of a custom page range string
            used in printer dialogs.
        delim (char): Defaults to ','. Separates integers and contiguous ranges
            of integers.
        range_delim (char): Defaults to '-'. Indicates a contiguous range of
            integers.

    >>> int_ranges_from_int_list('1,3,5-8,10-11,15')
    ((1, 1), (3, 3), (5, 8), (10, 11), (15, 15))

    >>> int_ranges_from_int_list('1')
    ((1, 1),)

    >>> int_ranges_from_int_list('')
    ()
    """
    int_tuples = []

    # Normalize the range string to our internal format for processing.
    range_string = format_int_list(
        parse_int_list(range_string, delim, range_delim))
    if range_string:
        for bounds in range_string.split(','):
            if '-' in bounds:
                start, end = bounds.split('-')
            else:
                start, end = bounds, bounds
            int_tuples.append((int(start), int(end)))

    return tuple(int_tuples)


class MultiReplace(object):
    """
    MultiReplace is a tool for doing multiple find/replace actions in one pass.

    Given a mapping of values to be replaced, it allows all of the matching
    values to be replaced in a single pass, which can save a lot of performance
    on very large strings. In addition to simple replace, it also allows for
    replacing based on regular expressions.

    Keyword Arguments:

    :type regex: bool
    :param regex: Treat search keys as regular expressions [Default: False]
    :type flags: int
    :param flags: flags to pass to the regex engine during compile

    Dictionary Usage::

        from boltons import strutils
        s = strutils.MultiReplace({
            'foo': 'zoo',
            'cat': 'hat',
            'bat': 'kraken'
        })
        new = s.sub('The foo bar cat ate a bat')
        new == 'The zoo bar hat ate a kraken'

    Iterable Usage::

        from boltons import strutils
        s = strutils.MultiReplace([
            ('foo', 'zoo'),
            ('cat', 'hat'),
            ('bat', 'kraken')
        ])
        new = s.sub('The foo bar cat ate a bat')
        new == 'The zoo bar hat ate a kraken'

    The constructor can be passed a dictionary or other mapping as well as
    an iterable of tuples. If given an iterable, the substitution will be run
    in the order the replacement values are specified in the iterable. This is
    also true if it is given an OrderedDict. If given a dictionary then the
    order will be non-deterministic::

        >>> 'foo bar baz'.replace('foo', 'baz').replace('baz', 'bar')
        'bar bar bar'
        >>> m = MultiReplace({'foo': 'baz', 'baz': 'bar'})
        >>> m.sub('foo bar baz')
        'baz bar bar'

    This is because the order of replacement can matter if you're inserting
    something that might be replaced by a later substitution. Pay attention,
    and if you need to rely on order, consider using a list of tuples instead
    of a dictionary.
    """

    def __init__(self, sub_map, **kwargs):
        """Compile any regular expressions that have been passed."""
        options = {
            'regex': False,
            'flags': 0,
        }
        options.update(kwargs)
        self.group_map = {}
        regex_values = []

        if isinstance(sub_map, Mapping):
            sub_map = sub_map.items()

        for idx, vals in enumerate(sub_map):
            group_name = 'group{0}'.format(idx)
            if isinstance(vals[0], basestring):
                # If we're not treating input strings like a regex, escape it
                if not options['regex']:
                    exp = re.escape(vals[0])
                else:
                    exp = vals[0]
            else:
                exp = vals[0].pattern
            regex_values.append('(?P<{}>{})'.format(group_name, exp))
            self.group_map[group_name] = vals[1]

        self.combined_pattern = re.compile(
            '|'.join(regex_values),
            flags=options['flags']
        )

    def _get_value(self, match):
        """Given a match object find replacement value."""
        group_dict = match.groupdict()
        key = [x for x in group_dict if group_dict[x]][0]
        return self.group_map[key]

    def sub(self, text):
        """
        Run substitutions on the input text.

        Given an input string, run all substitutions given in the
        constructor.
        """
        return self.combined_pattern.sub(self._get_value, text)
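
# The regex=True mode has no example above; informally, it looks like this
# (the patterns and text are just illustrative, and this is not run as a
# doctest):
#
# >>> m = MultiReplace({r'ca+t': 'cat', r'ba+t': 'bat'}, regex=True)
# >>> m.sub('caaat and baaaaat')
# 'cat and bat'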


def multi_replace(text, sub_map, **kwargs):
    """
    Shortcut function to invoke MultiReplace in a single call.

    Example Usage::

        from boltons.strutils import multi_replace
        new = multi_replace(
            'The foo bar cat ate a bat',
            {'foo': 'zoo', 'cat': 'hat', 'bat': 'kraken'}
        )
        new == 'The zoo bar hat ate a kraken'
    """
    m = MultiReplace(sub_map, **kwargs)
    return m.sub(text)


def unwrap_text(text, ending='\n\n'):
    r"""
    Unwrap text, the natural complement to :func:`textwrap.wrap`.

    >>> text = "Short \n lines \nwrapped\nsmall.\n\nAnother\nparagraph."
    >>> unwrap_text(text)
    'Short lines wrapped small.\n\nAnother paragraph.'

    Args:
        text: A string to unwrap.
        ending (str): The string to join all unwrapped paragraphs
            by. Pass ``None`` to get the list. Defaults to '\n\n' for
            compatibility with Markdown and RST.
    """
    all_grafs = []
    cur_graf = []
    for line in text.splitlines():
        line = line.strip()
        if line:
            cur_graf.append(line)
        else:
            all_grafs.append(' '.join(cur_graf))
            cur_graf = []
    if cur_graf:
        all_grafs.append(' '.join(cur_graf))
    if ending is None:
        return all_grafs
    return ending.join(all_grafs)