12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311 |
- # -*- coding: utf-8 -*-
- # Copyright (c) 2013, Mahmoud Hashemi
- #
- # Redistribution and use in source and binary forms, with or without
- # modification, are permitted provided that the following conditions are
- # met:
- #
- # * Redistributions of source code must retain the above copyright
- # notice, this list of conditions and the following disclaimer.
- #
- # * Redistributions in binary form must reproduce the above
- # copyright notice, this list of conditions and the following
- # disclaimer in the documentation and/or other materials provided
- # with the distribution.
- #
- # * The names of the contributors may not be used to endorse or
- # promote products derived from this software without specific
- # prior written permission.
- #
- # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- """So much practical programming involves string manipulation, which
- Python readily accommodates. Still, there are dozens of basic and
- common capabilities missing from the standard library, several of them
- provided by ``strutils``.
- """
- from __future__ import print_function
- import re
- import sys
- import uuid
- import zlib
- import string
- import unicodedata
- import collections
- from gzip import GzipFile
- try:
- from cStringIO import cStringIO as StringIO
- except ImportError:
- from io import BytesIO as StringIO
- try:
- from collections.abc import Mapping
- except ImportError:
- from collections import Mapping
- try:
- unicode, str, bytes, basestring = unicode, str, str, basestring
- from HTMLParser import HTMLParser
- import htmlentitydefs
- except NameError: # basestring not defined in Python 3
- unicode, str, bytes, basestring = str, bytes, bytes, (str, bytes)
- unichr = chr
- from html.parser import HTMLParser
- from html import entities as htmlentitydefs
- try:
- import __builtin__ as builtins
- except ImportError:
- import builtins
# Public names exported by a star-import of this module.
__all__ = ['camel2under', 'under2camel', 'slugify', 'split_punct_ws',
           'unit_len', 'ordinalize', 'cardinalize', 'pluralize', 'singularize',
           'asciify', 'is_ascii', 'is_uuid', 'html2text', 'strip_ansi',
           'bytes2human', 'find_hashtags', 'a10n', 'gzip_bytes', 'gunzip_bytes',
           'iter_splitlines', 'indent', 'escape_shell_args',
           'args2cmd', 'args2sh', 'parse_int_list', 'format_int_list',
           'int_list_complement', 'int_list_to_int_tuples', 'MultiReplace',
           'multi_replace', 'unwrap_text']
# Every ASCII punctuation and whitespace character; a run of any of these is
# treated as a single word separator by split_punct_ws().
_punct_ws_str = string.punctuation + string.whitespace
# The characters must be escaped before being placed in a character class:
# string.punctuation contains the sequence ``\]``, which unescaped is read
# as "literal ]" and silently leaves the backslash out of the set.
_punct_re = re.compile('[' + re.escape(_punct_ws_str) + ']+')
# Captures each capital letter that begins a new word: one that follows a
# lowercase letter or digit, or a non-initial one followed by a lowercase
# letter (so acronym runs such as "HTTP" stay together).
_camel2under_re = re.compile('((?<=[a-z0-9])[A-Z]|(?!^)[A-Z](?=[a-z]))')


def camel2under(camel_string):
    """Convert a CamelCase string to lowercase with underscores.

    Handy for deriving a function name from a class name.

    >>> camel2under('BasicParseTest')
    'basic_parse_test'
    """
    # Prefix every word-starting capital with "_" and then fold the case.
    underscored = _camel2under_re.sub(r'_\1', camel_string)
    return underscored.lower()
def under2camel(under_string):
    """Convert an underscore-separated string to CamelCase.

    Handy for deriving a class name from a function name. Empty segments
    (produced by leading, trailing or doubled underscores) are kept as a
    literal ``_``.

    >>> under2camel('complex_tokenizer')
    'ComplexTokenizer'
    """
    pieces = []
    for segment in under_string.split('_'):
        pieces.append(segment.capitalize() if segment else '_')
    return ''.join(pieces)
def slugify(text, delim='_', lower=True, ascii=False):
    """Turn *text* into a slug: words joined by *delim* with punctuation
    and whitespace removed.

    Args:
        text (str): The text to slugify.
        delim (str): Separator placed between words. Defaults to ``_``.
        lower (bool): Lowercase the result. Defaults to ``True``.
        ascii (bool): Pass the slug through :func:`asciify`, which
            yields a bytestring. Defaults to ``False``.

    Empty *text* gives an empty slug; text consisting only of
    separators gives *delim* itself.

    >>> slugify('First post! Hi!!!!~1    ')
    'first_post_hi_1'

    >>> slugify("Kurt Gödel's pretty cool.", ascii=True) == \
        b'kurt_goedel_s_pretty_cool'
    True
    """
    if not text:
        slug = ''
    else:
        # Fall back to the bare delimiter when no word characters survive.
        slug = delim.join(split_punct_ws(text)) or delim
    if ascii:
        slug = asciify(slug)
    return slug.lower() if lower else slug
def split_punct_ws(text):
    """Split *text* on runs of punctuation and whitespace.

    :meth:`str.split` only splits on whitespace; this also treats
    punctuation as a separator. Empty fragments are dropped. It is used
    internally by :func:`slugify`.

    >>> split_punct_ws('First post! Hi!!!!~1    ')
    ['First', 'post', 'Hi', '1']
    """
    # filter(None, ...) discards the empty strings that re.split produces
    # at the edges of the text.
    return list(filter(None, _punct_re.split(text)))
def unit_len(sized_iterable, unit_noun='item'):  # TODO: len_units()/unitize()?
    """Describe the :func:`len` of *sized_iterable* in plain English,
    pluralizing *unit_noun* through :func:`cardinalize` as needed.

    >>> print(unit_len(range(10), 'number'))
    10 numbers
    >>> print(unit_len('aeiou', 'vowel'))
    5 vowels
    >>> print(unit_len([], 'worry'))
    No worries
    """
    total = len(sized_iterable)
    noun = cardinalize(unit_noun, total)
    if not total:
        return u'No %s' % (noun,)
    return u'%s %s' % (total, noun)
# Suffix by final digit; every other digit (and all the teens) takes 'th'.
_ORDINAL_MAP = {'1': 'st',
                '2': 'nd',
                '3': 'rd'}  # 'th' is the default


def ordinalize(number, ext_only=False):
    """Turn *number* into its ordinal form, i.e., 1st, 2nd, 3rd, 4th,
    etc. If the last character isn't a digit, the string value is
    returned unchanged.

    Args:
        number (int or str): Number to be ordinalized.
        ext_only (bool): Whether to return only the suffix. Default ``False``.

    Returns:
        The text form of *number* followed by its suffix, or only the
        suffix (possibly empty) when *ext_only* is true.

    >>> print(ordinalize(1))
    1st
    >>> print(ordinalize(3694839230))
    3694839230th
    >>> print(ordinalize('hi'))
    hi
    >>> print(ordinalize(1515))
    1515th
    """
    # u'%s' formatting coerces to text on both Python 2 and Python 3.
    numstr, ext = u'%s' % (number,), ''
    if numstr and numstr[-1] in string.digits:
        # The slice is '' for one-character input, so no IndexError
        # handling is needed to tell single digits from the teens.
        if numstr[-2:-1] == '1':
            ext = 'th'
        else:
            ext = _ORDINAL_MAP.get(numstr[-1], 'th')
    if ext_only:
        return ext
    return numstr + ext
def cardinalize(unit_noun, count):
    """Return *unit_noun* as-is when *count* equals one, otherwise its
    plural as produced by :func:`pluralize` (case is preserved when
    possible).

    >>> vowels = 'aeiou'
    >>> print(len(vowels), cardinalize('vowel', len(vowels)))
    5 vowels
    >>> print(3, cardinalize('Wish', 3))
    3 Wishes
    """
    return unit_noun if count == 1 else pluralize(unit_noun)
def singularize(word):
    """Semi-intelligently convert an English plural *word* to its
    singular form, preserving the case pattern.

    Words that are empty, already a known irregular singular, or that
    do not end in "s" are returned unchanged.

    >>> singularize('chances')
    'chance'
    >>> singularize('Activities')
    'Activity'
    >>> singularize('Glasses')
    'Glass'
    >>> singularize('FEET')
    'FOOT'
    """
    original = word
    lowered = word.strip().lower()
    if not lowered or lowered in _IRR_S2P:
        return original

    irregular = _IRR_P2S.get(lowered)
    if irregular:
        return _match_case(original, irregular)
    if not lowered.endswith('s'):
        return original

    if len(lowered) == 2:
        stem = lowered[:-1]  # or just return word?
    elif lowered.endswith('ies') and lowered[-4:-3] not in 'aeiou':
        # consonant + "ies" -> consonant + "y"
        stem = lowered[:-3] + 'y'
    elif lowered.endswith('es') and lowered[-3] == 's':
        # "...ses" -> "...s"
        stem = lowered[:-2]
    else:
        stem = lowered[:-1]
    return _match_case(original, stem)
def pluralize(word):
    """Semi-intelligently convert an English *word* from singular to
    plural form, preserving the case pattern.

    Words that are empty or already a known irregular plural are
    returned unchanged.

    >>> pluralize('friend')
    'friends'
    >>> pluralize('enemy')
    'enemies'
    >>> pluralize('Sheep')
    'Sheep'
    """
    original = word
    lowered = word.strip().lower()
    if not lowered or lowered in _IRR_P2S:
        return original

    irregular = _IRR_S2P.get(lowered)
    if irregular:
        return _match_case(original, irregular)

    if lowered.endswith('y') and lowered[-2:-1] not in 'aeiou':
        # consonant + "y" -> consonant + "ies"
        result = lowered[:-1] + 'ies'
    elif (lowered[-1] == 's' or lowered.endswith('ch')
          or lowered.endswith('sh')):
        # sibilant endings take "es", unless it is already there
        result = lowered if lowered.endswith('es') else lowered + 'es'
    else:
        result = lowered + 's'
    return _match_case(original, result)
def _match_case(master, disciple):
    """Return *disciple* recased to follow the case pattern of *master*.

    The patterns are tried in the order lower, upper, title; the first
    one that *master* already satisfies is applied to *disciple*. A
    blank *master*, or one matching no pattern, leaves *disciple* as is.
    """
    if not master.strip():
        return disciple
    for recase in ('lower', 'upper', 'title'):
        if getattr(master, recase)() == master:
            return getattr(disciple, recase)()
    return disciple
# Singular to plural map of irregular pluralizations.
# NOTE: each singular may appear only once; a repeated key in a dict
# literal silently discards the earlier value.
_IRR_S2P = {'addendum': 'addenda', 'alga': 'algae', 'alumna': 'alumnae',
            'alumnus': 'alumni', 'analysis': 'analyses', 'antenna': 'antennae',
            'appendix': 'appendices', 'axis': 'axes', 'bacillus': 'bacilli',
            'bacterium': 'bacteria', 'basis': 'bases', 'beau': 'beaux',
            'bison': 'bison', 'bureau': 'bureaus', 'cactus': 'cacti',
            'calf': 'calves', 'child': 'children', 'corps': 'corps',
            'corpus': 'corpora', 'crisis': 'crises', 'criterion': 'criteria',
            'curriculum': 'curricula', 'datum': 'data', 'deer': 'deer',
            'diagnosis': 'diagnoses', 'die': 'dice', 'dwarf': 'dwarves',
            'echo': 'echoes', 'elf': 'elves', 'ellipsis': 'ellipses',
            'embargo': 'embargoes', 'emphasis': 'emphases', 'erratum': 'errata',
            'fireman': 'firemen', 'fish': 'fish', 'focus': 'foci',
            'foot': 'feet', 'formula': 'formulas',
            'fungus': 'fungi', 'genus': 'genera', 'goose': 'geese',
            'half': 'halves', 'hero': 'heroes', 'hippopotamus': 'hippopotami',
            'hoof': 'hooves', 'hypothesis': 'hypotheses', 'index': 'indices',
            'knife': 'knives', 'leaf': 'leaves', 'life': 'lives',
            'loaf': 'loaves', 'louse': 'lice', 'man': 'men',
            'matrix': 'matrices', 'means': 'means', 'medium': 'media',
            'memorandum': 'memoranda', 'millennium': 'millennia',
            'moose': 'moose',
            'mosquito': 'mosquitoes', 'mouse': 'mice', 'nebula': 'nebulae',
            'neurosis': 'neuroses', 'nucleus': 'nuclei', 'oasis': 'oases',
            'octopus': 'octopi', 'offspring': 'offspring', 'ovum': 'ova',
            'ox': 'oxen', 'paralysis': 'paralyses',
            'parenthesis': 'parentheses',
            'person': 'people', 'phenomenon': 'phenomena',
            'potato': 'potatoes',
            'radius': 'radii', 'scarf': 'scarves', 'scissors': 'scissors',
            'self': 'selves', 'sense': 'senses', 'series': 'series',
            'sheep': 'sheep', 'shelf': 'shelves', 'species': 'species',
            'stimulus': 'stimuli', 'stratum': 'strata', 'syllabus': 'syllabi',
            'symposium': 'symposia', 'synopsis': 'synopses',
            'synthesis': 'syntheses',
            'tableau': 'tableaux', 'that': 'those', 'thesis': 'theses',
            'thief': 'thieves', 'this': 'these', 'tomato': 'tomatoes',
            'tooth': 'teeth', 'torpedo': 'torpedoes',
            'vertebra': 'vertebrae', 'veto': 'vetoes',
            'vita': 'vitae', 'watch': 'watches', 'wife': 'wives',
            'wolf': 'wolves', 'woman': 'women'}


# Reverse index of the above
_IRR_P2S = dict((plural, singular) for singular, plural in _IRR_S2P.items())
# "formula" has two accepted plurals. _IRR_S2P can hold only one of them
# ("formulas"), so the alternative is registered here to be recognized as a
# plural as well.
_IRR_P2S['formulae'] = 'formula'
# A hashtag is a hashmark at the start of the text or right after
# whitespace, followed by word characters. Both the ASCII "#" and the
# full-width U+FF03 hashmark are accepted. The pattern is a unicode literal
# (rather than a raw string with a \u escape) so that it compiles the same
# way on Python 2 and Python 3.
HASHTAG_RE = re.compile(u"(?:^|\\s)[#\uff03]{1}(\\w+)", re.UNICODE)


def find_hashtags(string):
    """Finds and returns all hashtags in a string, with the hashmark
    removed. Supports full-width hashmarks for Asian languages and
    does not false-positive on URL anchors.

    Args:
        string (str): The text to scan.

    Returns:
        list: The tag names, in order of appearance, without hashmarks.

    >>> find_hashtags('#atag http://asite/#ananchor')
    ['atag']

    ``find_hashtags`` also works with unicode hashtags.
    """
    # the following works, doctest just struggles with it
    # >>> find_hashtags(u"can't get enough of that dignity chicken #肯德基 woo")
    # [u'\u80af\u5fb7\u57fa']
    return HASHTAG_RE.findall(string)
def a10n(string):
    """Abbreviate *string* as a `numeronym`_: first letter, count of
    the letters in between, last letter. This is how
    "internationalization" becomes "i18n", and "abbreviation" becomes
    ``a10n``. Strings shorter than three characters are returned
    unchanged.

    >>> a10n('abbreviation')
    'a10n'
    >>> a10n('internationalization')
    'i18n'
    >>> a10n('')
    ''

    .. _numeronym: http://en.wikipedia.org/wiki/Numeronym
    """
    inner_count = len(string) - 2
    if inner_count < 1:
        return string
    return '%s%s%s' % (string[0], inner_count, string[-1])
# Based on https://en.wikipedia.org/wiki/ANSI_escape_code#Escape_sequences
ANSI_SEQUENCES = re.compile(r'''
    \x1B            # Sequence starts with ESC, i.e. hex 0x1B
    (?:
        [@-Z\\-_]   # Second byte:
                    #   all 0x40–0x5F range but CSI char, i.e ASCII @A–Z\]^_
    |               # Or
        \[          # CSI sequences, starting with [
        [0-?]*      # Parameter bytes:
                    #   range 0x30–0x3F, ASCII 0–9:;<=>?
        [ -/]*      # Intermediate bytes:
                    #   range 0x20–0x2F, ASCII space and !"#$%&'()*+,-./
        [@-~]       # Final byte
                    #   range 0x40–0x7E, ASCII @A–Z[\]^_`a–z{|}~
    )
''', re.VERBOSE)


def strip_ansi(text):
    """Remove ANSI escape sequences from *text*.

    Useful when a log or redirected output has accidentally captured
    console color codes and the like.

    >>> strip_ansi('\x1b[0m\x1b[1;36mart\x1b[46;34m')
    'art'

    Accepts unicode, str, bytes and bytearray content and returns the
    same type that was passed in.

    There's a lot of ANSI art available for testing on `sixteencolors.net`_.
    This function does not interpret or render ANSI art, but you can do so with
    `ansi2img`_ or `escapes.js`_.

    .. _sixteencolors.net: http://sixteencolors.net
    .. _ansi2img: http://www.bedroomlan.org/projects/ansi2img
    .. _escapes.js: https://github.com/atdt/escapes.js
    """
    # TODO: move to cliutils.py

    # On Python 3 the text pattern cannot be applied to binary input, so
    # binary input is decoded first and its type remembered for the way back.
    original_type = None
    running_py3 = sys.version_info[0] >= 3
    if running_py3 and isinstance(text, (bytes, bytearray)):
        original_type = type(text)
        text = text.decode('utf-8')

    cleaned = ANSI_SEQUENCES.sub('', text)

    # Re-encode into the binary type the caller supplied.
    if original_type is not None:
        cleaned = original_type(cleaned, 'utf-8')
    return cleaned
def asciify(text, ignore=False):
    """Converts a unicode or bytestring, *text*, into a bytestring with
    just ascii characters. Performs basic deaccenting for all you
    Europhiles out there.

    Also, a gentle reminder that this is a **utility**, primarily meant
    for slugification. Whenever possible, make your application work
    **with** unicode, not against it.

    Args:
        text (str or unicode): The string to be asciified. Bytestrings
            are decoded as UTF-8 when they are not plain ascii.
        ignore (bool): Configures final encoding to ignore remaining
            unasciified unicode instead of replacing it.

    Returns:
        bytes: An ascii-only bytestring.

    >>> asciify('Beyoncé') == b'Beyonce'
    True
    """
    try:
        try:
            return text.encode('ascii')
        except (UnicodeDecodeError, AttributeError):
            # UnicodeDecodeError: a non-ascii bytestring on Python 2, where
            # encode() decodes implicitly. AttributeError: binary input on
            # Python 3, where bytes and bytearray have no encode() at all.
            text = text.decode('utf-8')
            return text.encode('ascii')
    except UnicodeEncodeError:
        mode = 'ignore' if ignore else 'replace'
        # Apply the explicit replacements first, then let NFKD split any
        # remaining accented characters into base + combining marks.
        transd = unicodedata.normalize('NFKD', text.translate(DEACCENT_MAP))
        return transd.encode('ascii', mode)
def is_ascii(text):
    """Report whether a unicode or bytestring, *text*, consists of
    ascii characters only.

    Args:
        text (str or unicode): The string to be checked.

    Raises:
        ValueError: If *text* is neither text nor bytes.

    >>> is_ascii('Beyoncé')
    False
    >>> is_ascii('Beyonce')
    True
    """
    # Pick the codec round-trip that matches the input type, along with the
    # error it raises on non-ascii content.
    if isinstance(text, unicode):
        probe, failure = text.encode, UnicodeEncodeError
    elif isinstance(text, bytes):
        probe, failure = text.decode, UnicodeDecodeError
    else:
        raise ValueError('expected text or bytes, not %r' % type(text))

    try:
        probe('ascii')
    except failure:
        return False
    return True
class DeaccenterDict(dict):
    "A small caching dictionary for deaccenting."

    def __missing__(self, key):
        """Compute, cache and return the replacement for code point *key*.

        When the character's Unicode decomposition is a base code point
        followed by one more code point, the base code point is used.
        Otherwise the character maps to itself.
        """
        ch = self.get(key)
        if ch is not None:
            return ch
        try:
            de = unicodedata.decomposition(unichr(key))
            p1, _, _ = de.rpartition(' ')
            # Fall back to the base character for every accent, including
            # the diaeresis (U+0308). Diaeresis letters without an explicit
            # entry used to be cached as None, which str.translate treats
            # as "delete this character".
            ch = int(p1, 16)
        except (IndexError, ValueError):
            # No usable decomposition: keep the character as it is.
            ch = self.get(key, key)
        self[key] = ch
        return ch

    try:
        from collections import defaultdict
    except ImportError:
        # no defaultdict means that __missing__ isn't supported in
        # this version of python, so we define __getitem__
        def __getitem__(self, key):
            try:
                return super(DeaccenterDict, self).__getitem__(key)
            except KeyError:
                return self.__missing__(key)
    else:
        del defaultdict
# http://chmullig.com/2009/12/python-unicode-ascii-ifier/
# For something more complete, investigate the unidecode
# or isounidecode packages, which are capable of performing
# crude transliteration.

# Explicit replacements, keyed by code point. A replacement may be longer
# than one character (e.g. "AE", "ss"). This table seeds DEACCENT_MAP below,
# which asciify() passes to text.translate().
_BASE_DEACCENT_MAP = {
    0xc6: u"AE",  # Æ LATIN CAPITAL LETTER AE
    0xd0: u"D",  # Ð LATIN CAPITAL LETTER ETH
    0xd8: u"OE",  # Ø LATIN CAPITAL LETTER O WITH STROKE
    0xde: u"Th",  # Þ LATIN CAPITAL LETTER THORN
    0xc4: u'Ae',  # Ä LATIN CAPITAL LETTER A WITH DIAERESIS
    0xd6: u'Oe',  # Ö LATIN CAPITAL LETTER O WITH DIAERESIS
    0xdc: u'Ue',  # Ü LATIN CAPITAL LETTER U WITH DIAERESIS
    0xc0: u"A",  # À LATIN CAPITAL LETTER A WITH GRAVE
    0xc1: u"A",  # Á LATIN CAPITAL LETTER A WITH ACUTE
    0xc3: u"A",  # Ã LATIN CAPITAL LETTER A WITH TILDE
    0xc7: u"C",  # Ç LATIN CAPITAL LETTER C WITH CEDILLA
    0xc8: u"E",  # È LATIN CAPITAL LETTER E WITH GRAVE
    0xc9: u"E",  # É LATIN CAPITAL LETTER E WITH ACUTE
    0xca: u"E",  # Ê LATIN CAPITAL LETTER E WITH CIRCUMFLEX
    0xcc: u"I",  # Ì LATIN CAPITAL LETTER I WITH GRAVE
    0xcd: u"I",  # Í LATIN CAPITAL LETTER I WITH ACUTE
    0xd2: u"O",  # Ò LATIN CAPITAL LETTER O WITH GRAVE
    0xd3: u"O",  # Ó LATIN CAPITAL LETTER O WITH ACUTE
    0xd5: u"O",  # Õ LATIN CAPITAL LETTER O WITH TILDE
    0xd9: u"U",  # Ù LATIN CAPITAL LETTER U WITH GRAVE
    0xda: u"U",  # Ú LATIN CAPITAL LETTER U WITH ACUTE
    0xdf: u"ss",  # ß LATIN SMALL LETTER SHARP S
    0xe6: u"ae",  # æ LATIN SMALL LETTER AE
    0xf0: u"d",  # ð LATIN SMALL LETTER ETH
    0xf8: u"oe",  # ø LATIN SMALL LETTER O WITH STROKE
    0xfe: u"th",  # þ LATIN SMALL LETTER THORN
    0xe4: u'ae',  # ä LATIN SMALL LETTER A WITH DIAERESIS
    0xf6: u'oe',  # ö LATIN SMALL LETTER O WITH DIAERESIS
    0xfc: u'ue',  # ü LATIN SMALL LETTER U WITH DIAERESIS
    0xe0: u"a",  # à LATIN SMALL LETTER A WITH GRAVE
    0xe1: u"a",  # á LATIN SMALL LETTER A WITH ACUTE
    0xe3: u"a",  # ã LATIN SMALL LETTER A WITH TILDE
    0xe7: u"c",  # ç LATIN SMALL LETTER C WITH CEDILLA
    0xe8: u"e",  # è LATIN SMALL LETTER E WITH GRAVE
    0xe9: u"e",  # é LATIN SMALL LETTER E WITH ACUTE
    0xea: u"e",  # ê LATIN SMALL LETTER E WITH CIRCUMFLEX
    0xec: u"i",  # ì LATIN SMALL LETTER I WITH GRAVE
    0xed: u"i",  # í LATIN SMALL LETTER I WITH ACUTE
    0xf2: u"o",  # ò LATIN SMALL LETTER O WITH GRAVE
    0xf3: u"o",  # ó LATIN SMALL LETTER O WITH ACUTE
    0xf5: u"o",  # õ LATIN SMALL LETTER O WITH TILDE
    0xf9: u"u",  # ù LATIN SMALL LETTER U WITH GRAVE
    0xfa: u"u",  # ú LATIN SMALL LETTER U WITH ACUTE
    0x2018: u"'",  # ‘ LEFT SINGLE QUOTATION MARK
    0x2019: u"'",  # ’ RIGHT SINGLE QUOTATION MARK
    0x201c: u'"',  # “ LEFT DOUBLE QUOTATION MARK
    0x201d: u'"',  # ” RIGHT DOUBLE QUOTATION MARK
    }


# Shared translation table: starts from the explicit entries above and
# caches further code points as they are looked up.
DEACCENT_MAP = DeaccenterDict(_BASE_DEACCENT_MAP)
# Unit symbols in ascending order; each unit is 1024 times the previous one.
_SIZE_SYMBOLS = ('B', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y')
# (smallest value expressed in the unit, symbol) pairs, ascending.
_SIZE_BOUNDS = [(1024 ** i, sym) for i, sym in enumerate(_SIZE_SYMBOLS)]
# Adjacent pairs of bounds. No longer used by bytes2human(); retained so
# existing references to this module-level name keep working.
_SIZE_RANGES = list(zip(_SIZE_BOUNDS, _SIZE_BOUNDS[1:]))


def bytes2human(nbytes, ndigits=0):
    """Turns an integer value of *nbytes* into a human readable format. Set
    *ndigits* to control how many digits after the decimal point
    should be shown (default ``0``).

    Args:
        nbytes (int): The byte count; may be negative.
        ndigits (int): Number of digits after the decimal point.

    Returns:
        str: The scaled value followed by its unit symbol.

    >>> bytes2human(128991)
    '126K'
    >>> bytes2human(100001221)
    '95M'
    >>> bytes2human(0, 2)
    '0.00B'
    >>> bytes2human(1024)
    '1K'
    """
    abs_bytes = abs(nbytes)
    # Values below 1 (i.e. zero) stay in the base unit.
    size, symbol = _SIZE_BOUNDS[0]
    # Scan from the largest unit downwards and take the first one the value
    # reaches. This puts exact multiples in the larger unit (1024 -> 1K) and
    # makes the largest symbol selectable.
    for bound, bound_symbol in reversed(_SIZE_BOUNDS):
        if abs_bytes >= bound:
            size, symbol = bound, bound_symbol
            break
    hnbytes = float(nbytes) / size
    return '{hnbytes:.{ndigits}f}{symbol}'.format(hnbytes=hnbytes,
                                                  ndigits=ndigits,
                                                  symbol=symbol)
class HTMLTextExtractor(HTMLParser):
    """HTML parser that collects the text content of a document and
    discards the markup. Feed it HTML, then read :meth:`get_text`.
    """

    def __init__(self):
        # Run the base initializer so every attribute the parser relies on
        # exists; explicit-class call style because the Python 2 base is not
        # compatible with super().
        HTMLParser.__init__(self)
        # Force lenient parsing and automatic character-reference
        # conversion regardless of the interpreter's defaults.
        self.strict = False
        self.convert_charrefs = True
        self.result = []

    def handle_data(self, d):
        self.result.append(d)

    def handle_charref(self, number):
        # Numeric reference: hexadecimal when prefixed with x/X, else decimal.
        if number[0] == u'x' or number[0] == u'X':
            codepoint = int(number[1:], 16)
        else:
            codepoint = int(number)
        self.result.append(unichr(codepoint))

    def handle_entityref(self, name):
        try:
            codepoint = htmlentitydefs.name2codepoint[name]
        except KeyError:
            # Unknown entity: keep it verbatim.
            self.result.append(u'&' + name + u';')
        else:
            self.result.append(unichr(codepoint))

    def get_text(self):
        """Return all text collected so far as a single string."""
        return u''.join(self.result)


def html2text(html):
    """Strips tags from HTML text, returning markup-free text. Also, does
    a best effort replacement of entities like "&nbsp;"

    Args:
        html (str): The HTML markup to convert.

    Returns:
        The text content of *html*.

    >>> r = html2text(u'<a href="#">Test &amp;<em>(\u0394&#x03b7;&#956;&#x03CE;)</em></a>')
    >>> r == u'Test &(\u0394\u03b7\u03bc\u03ce)'
    True
    """
    # based on answers to http://stackoverflow.com/questions/753052/
    s = HTMLTextExtractor()
    s.feed(html)
    # The parser may hold back trailing text (for example input ending in a
    # bare "&") while waiting for more data; close() flushes it.
    s.close()
    return s.get_text()
# Small gzip payloads used by the doctests of gunzip_bytes().
_EMPTY_GZIP_BYTES = b'\x1f\x8b\x08\x089\xf3\xb9U\x00\x03empty\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00'
_NON_EMPTY_GZIP_BYTES = b'\x1f\x8b\x08\x08\xbc\xf7\xb9U\x00\x03not_empty\x00K\xaa,I-N\xcc\xc8\xafT\xe4\x02\x00\xf3nb\xbf\x0b\x00\x00\x00'


def gunzip_bytes(bytestring):
    """Decompress gzip-formatted *bytestring* and return the raw bytes.

    The :mod:`gzip` module wants a file or file-like object; when all
    you have is bytes, this goes straight through :mod:`zlib` instead
    of wrapping them in a StringIO.

    >>> gunzip_bytes(_EMPTY_GZIP_BYTES) == b''
    True
    >>> gunzip_bytes(_NON_EMPTY_GZIP_BYTES).rstrip() == b'bytesahoy!'
    True
    """
    # Adding 16 to the window size makes zlib expect a gzip header/trailer.
    gzip_wbits = zlib.MAX_WBITS + 16
    return zlib.decompress(bytestring, gzip_wbits)
def gzip_bytes(bytestring, level=6):
    """Compress *bytestring* into gzip format and return the result.

    >>> len(gzip_bytes(b'a' * 10000))
    46

    Args:
        bytestring (bytes): Bytes to be compressed
        level (int): An integer, 1-9, controlling the
          speed/compression. 1 is fastest, least compressed, 9 is
          slowest, but most compressed.

    Note that all levels of gzip are pretty fast these days, though
    it's not really a competitor in compression, at any level.
    """
    buf = StringIO()
    gz = GzipFile(fileobj=buf, mode='wb', compresslevel=level)
    gz.write(bytestring)
    # Closing writes the gzip trailer into the buffer.
    gz.close()
    return buf.getvalue()
# Line terminators recognized by iter_splitlines(). The pattern is a unicode
# literal holding the actual characters: a regex escape such as ``\x2028``
# means "\x20 followed by 28" (space, "2", "8"), not U+2028, and would split
# ordinary text like "page 28".
_line_ending_re = re.compile(u'(\r\n|\n|\x0b|\f|\r|\x85|\u2028|\u2029)',
                             re.UNICODE)


def iter_splitlines(text):
    r"""Like :meth:`str.splitlines`, but returns an iterator of lines
    instead of a list. Also similar to :meth:`file.next`, as that also
    lazily reads and yields lines from a file.

    This function works with a variety of line endings, but as always,
    be careful when mixing line endings within a file.

    Args:
        text (str): The text to split.

    Yields:
        Each line without its line ending. Text that ends with a line
        ending yields a final empty string.

    >>> list(iter_splitlines('\nhi\nbye\n'))
    ['', 'hi', 'bye', '']
    >>> list(iter_splitlines('\r\nhi\rbye\r\n'))
    ['', 'hi', 'bye', '']
    >>> list(iter_splitlines(''))
    []
    """
    prev_end, len_text = 0, len(text)
    for match in _line_ending_re.finditer(text):
        start, end = match.start(1), match.end(1)
        if prev_end <= start:
            yield text[prev_end:start]
        if end == len_text:
            # A terminator at the very end is followed by an empty line.
            yield ''
        prev_end = end
    tail = text[prev_end:]
    if tail:
        yield tail
    return
def indent(text, margin, newline='\n', key=bool):
    """Prefix lines of *text* with *margin*; the missing counterpart to
    the built-in :func:`textwrap.dedent`.

    Args:
        text (str): The text to indent.
        margin (str): The string to prepend to each line.
        newline (str): The newline used to rejoin the lines (default: ``\\n``)
        key (callable): Called on each line to determine whether to
          indent it. Default: :class:`bool`, to ensure that empty lines do
          not get whitespace added.
    """
    out_lines = []
    for line in iter_splitlines(text):
        out_lines.append(margin + line if key(line) else line)
    return newline.join(out_lines)
def is_uuid(obj, version=4):
    """Report whether *obj* is a valid UUID object or UUID string.

    Args:
        obj (object): The test target. Strings and UUID objects supported.
        version (int): The target UUID version, set to 0 to skip version check.

    >>> is_uuid('e682ccca-5a4c-4ef2-9711-73f9ad1e15ea')
    True
    >>> is_uuid('0221f0d9-d4b9-11e5-a478-10ddb1c2feb9')
    False
    >>> is_uuid('0221f0d9-d4b9-11e5-a478-10ddb1c2feb9', version=1)
    True
    """
    if isinstance(obj, uuid.UUID):
        candidate = obj
    else:
        try:
            candidate = uuid.UUID(obj)
        except (TypeError, ValueError, AttributeError):
            # Not something the UUID constructor can parse.
            return False
    version_mismatch = version and candidate.version != int(version)
    return not version_mismatch
def escape_shell_args(args, sep=' ', style=None):
    """Escape each string in *args* according to *style* and join the
    results with *sep*.

    Args:
        args (list): A list of arguments to escape and join together
        sep (str): The separator used to join the escaped arguments.
        style (str): The style of escaping to use. Can be one of
          ``cmd`` or ``sh``, geared toward Windows and Linux/BSD/etc.,
          respectively. If *style* is ``None``, then it is picked
          according to the system platform.

    Raises:
        ValueError: If *style* is neither ``cmd`` nor ``sh``.

    See :func:`args2cmd` and :func:`args2sh` for details and example
    output for each style.
    """
    chosen = style or ('cmd' if sys.platform == 'win32' else 'sh')
    if chosen not in ('sh', 'cmd'):
        raise ValueError("style expected one of 'cmd' or 'sh', not %r"
                         % chosen)
    escaper = args2sh if chosen == 'sh' else args2cmd
    return escaper(args, sep=sep)
# Finds the first character that is not known to be safe unquoted in sh.
_find_sh_unsafe = re.compile(r'[^a-zA-Z0-9_@%+=:,./-]').search


def args2sh(args, sep=' '):
    """Return a shell-escaped string version of *args*, separated by
    *sep*, based on the rules of sh, bash, and other shells in the
    Linux/BSD/MacOS ecosystem.

    Args:
        args (list): The argument strings to escape.
        sep (str): The separator placed between the escaped arguments.

    >>> print(args2sh(['aa', '[bb]', "cc'cc", 'dd"dd']))
    aa '[bb]' 'cc'"'"'cc' 'dd"dd'

    As you can see, arguments with no special characters are not
    escaped, arguments with special characters are quoted with single
    quotes, and single quotes themselves are quoted with double
    quotes. Double quotes are handled like any other special
    character.

    Based on code from the :mod:`pipes`/:mod:`shlex` modules. Also
    note that :mod:`shlex` and :mod:`argparse` have functions to split
    and parse strings escaped in this manner.
    """
    ret_list = []

    for arg in args:
        if not arg:
            # An empty argument must still be visible to the shell.
            ret_list.append("''")
            continue
        if _find_sh_unsafe(arg) is None:
            ret_list.append(arg)
            continue
        # use single quotes, and put single quotes into double quotes
        # the string $'b is then quoted as '$'"'"'b'
        ret_list.append("'" + arg.replace("'", "'\"'\"'") + "'")

    # Join with the caller's separator rather than a hard-coded space.
    return sep.join(ret_list)
def args2cmd(args, sep=' '):
    r"""Return a shell-escaped string version of *args*, separated by
    *sep*, using the same rules as the Microsoft C runtime.

    Args:
        args (list): The argument strings to escape.
        sep (str): The separator placed between the escaped arguments.

    >>> print(args2cmd(['aa', '[bb]', "cc'cc", 'dd"dd']))
    aa [bb] cc'cc dd\"dd

    As you can see, escaping is through backslashing and not quoting,
    and double quotes are the only special character. See the comment
    in the code for more details. Based on internal code from the
    :mod:`subprocess` module.

    """
    # technique description from subprocess below
    #
    # 1) Arguments are delimited by white space, which is either a
    #    space or a tab.
    #
    # 2) A string surrounded by double quotation marks is
    #    interpreted as a single argument, regardless of white space
    #    contained within.  A quoted string can be embedded in an
    #    argument.
    #
    # 3) A double quotation mark preceded by a backslash is
    #    interpreted as a literal double quotation mark.
    #
    # 4) Backslashes are interpreted literally, unless they
    #    immediately precede a double quotation mark.
    #
    # 5) If backslashes immediately precede a double quotation mark,
    #    every pair of backslashes is interpreted as a literal
    #    backslash.  If the number of backslashes is odd, the last
    #    backslash escapes the next double quotation mark as
    #    described in rule 3.
    #
    # See http://msdn.microsoft.com/en-us/library/17w5ykft.aspx
    # or search http://msdn.microsoft.com for
    # "Parsing C++ Command-Line Arguments"
    escaped_args = []
    for arg in args:
        pieces = []
        bs_buf = []

        # Whitespace or an empty argument forces surrounding quotes.
        needquote = (" " in arg) or ("\t" in arg) or not arg
        if needquote:
            pieces.append('"')

        for c in arg:
            if c == '\\':
                # Don't know if we need to double yet.
                bs_buf.append(c)
            elif c == '"':
                # Double backslashes.
                pieces.append('\\' * len(bs_buf) * 2)
                bs_buf = []
                pieces.append('\\"')
            else:
                # Normal char
                if bs_buf:
                    pieces.extend(bs_buf)
                    bs_buf = []
                pieces.append(c)

        # Add remaining backslashes, if any.
        if bs_buf:
            pieces.extend(bs_buf)

        if needquote:
            # Trailing backslashes are doubled so that they do not escape
            # the closing quote.
            pieces.extend(bs_buf)
            pieces.append('"')

        escaped_args.append(''.join(pieces))

    # Join with the caller's separator rather than a hard-coded space.
    return sep.join(escaped_args)
def parse_int_list(range_string, delim=',', range_delim='-'):
    """Expand *range_string* into a sorted list of integers. Inverse
    of :func:`format_int_list`.

    Args:
        range_string (str): Integers and/or ranges joined by *delim*
            (e.g. '1,2,4-6,8'), like the page-range strings accepted
            by printer dialogs.
        delim (char): Separator between entries. Defaults to ','.
        range_delim (char): Separator between the two ends of a
            contiguous range. Defaults to '-'.

    Empty entries are skipped; duplicates are kept.

    >>> parse_int_list('1,3,5-8,10-11,15')
    [1, 3, 5, 6, 7, 8, 10, 11, 15]
    """
    numbers = []

    for token in range_string.strip().split(delim):
        if range_delim in token:
            # A range: the endpoints may be given in either order.
            bounds = [int(part) for part in token.split(range_delim)]
            numbers.extend(range(min(bounds), max(bounds) + 1))
        elif token:
            # A lone integer (empty tokens fall through untouched).
            numbers.append(int(token))

    numbers.sort()
    return numbers
def format_int_list(int_list, delim=',', range_delim='-', delim_space=False):
    """Collapse *int_list* into a sorted range string in which every
    run of consecutive integers is written as ``min<range_delim>max``.
    Inverse of :func:`parse_int_list`.

    Args:
        int_list (list): Integers to render (e.g. [1,2,4,5,6,8]).
            Order does not matter and repeated values are ignored.
        delim (char): Separator between entries. Defaults to ','.
        range_delim (char): Separator between the two ends of a
            contiguous range. Defaults to '-'.
        delim_space (bool): When ``True``, a space follows every
            *delim*. Defaults to ``False``.

    >>> format_int_list([1,3,5,6,7,8,10,11,15])
    '1,3,5-8,10-11,15'
    """
    pieces = []
    have_run = False
    run_start = run_end = None

    def render(first, last):
        # A run of one value is written bare, a longer one as a range.
        if first == last:
            return '{0:d}'.format(first)
        return '{0:d}{1}{2:d}'.format(first, range_delim, last)

    for value in sorted(int_list):
        if not have_run:
            # The very first value opens the first run.
            run_start = run_end = value
            have_run = True
            continue

        gap = value - run_end
        if gap == 1:
            # Extends the current run.
            run_end = value
        elif gap > 1:
            # Breaks the current run: emit it and start a new one.
            pieces.append(render(run_start, run_end))
            run_start = run_end = value
        # Any other gap is a repeated value and is dropped.

    # Emit whatever run is still open once the input is exhausted.
    if have_run:
        pieces.append(render(run_start, run_end))

    joiner = delim + ' ' if delim_space else delim
    return joiner.join(pieces)
def complement_int_list(
        range_string, range_start=0, range_end=None,
        delim=',', range_delim='-'):
    """ Returns the range string listing every integer that is *not*
    in *range_string*, within the bounds given.

    Both the input and the output are range strings of the kind
    produced by :func:`format_int_list` and parseable by
    :func:`parse_int_list`.

    Args:
        range_string (str): String of comma separated positive integers or
            ranges (e.g. '1,2,4-6,8'). Typical of a custom page range string
            used in printer dialogs.
        range_start (int): First integer considered for the result
            (inclusive). Defaults to ``0``.
        range_end (int): Integer at which the result stops (exclusive).
            Defaults to one past the largest value in *range_string*, or
            to *range_start* when *range_string* holds no values.
        delim (char): Defaults to ','. Separates integers and contiguous ranges
            of integers.
        range_delim (char): Defaults to '-'. Indicates a contiguous range of
            integers.

    >>> complement_int_list('1,3,5-8,10-11,15')
    '0,2,4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_start=0)
    '0,2,4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_start=1)
    '2,4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_start=2)
    '2,4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_start=3)
    '4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_end=15)
    '0,2,4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_end=14)
    '0,2,4,9,12-13'

    >>> complement_int_list('1,3,5-8,10-11,15', range_end=13)
    '0,2,4,9,12'

    >>> complement_int_list('1,3,5-8,10-11,15', range_end=20)
    '0,2,4,9,12-14,16-19'

    >>> complement_int_list('1,3,5-8,10-11,15', range_end=0)
    ''

    >>> complement_int_list('1,3,5-8,10-11,15', range_start=-1)
    '0,2,4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_end=-1)
    ''

    >>> complement_int_list('1,3,5-8', range_start=1, range_end=1)
    ''

    >>> complement_int_list('1,3,5-8', range_start=2, range_end=2)
    ''

    >>> complement_int_list('1,3,5-8', range_start=2, range_end=3)
    '2'

    >>> complement_int_list('1,3,5-8', range_start=-10, range_end=-5)
    ''

    >>> complement_int_list('1,3,5-8', range_start=20, range_end=10)
    ''

    >>> complement_int_list('')
    ''
    """
    present = set(parse_int_list(range_string, delim, range_delim))

    if range_end is None:
        # Stop just past the largest listed value; with nothing listed
        # the window collapses to empty.
        range_end = max(present) + 1 if present else range_start

    # Candidates are 0..range_end-1, minus the listed values and
    # minus everything below range_start.
    missing = set(range(range_end)).difference(present, range(range_start))

    return format_int_list(missing, delim, range_delim)
def int_ranges_from_int_list(range_string, delim=',', range_delim='-'):
    """ Convert a string of ranges (*range_string*) into a tuple of
    ``(start, end)`` tuples, one per contiguous run. A lone integer
    ``n`` becomes ``(n, n)``.

    Args:
        range_string (str): String of comma separated positive integers or
            ranges (e.g. '1,2,4-6,8'). Typical of a custom page range string
            used in printer dialogs.
        delim (char): Defaults to ','. Separates integers and contiguous ranges
            of integers.
        range_delim (char): Defaults to '-'. Indicates a contiguous range of
            integers.

    >>> int_ranges_from_int_list('1,3,5-8,10-11,15')
    ((1, 1), (3, 3), (5, 8), (10, 11), (15, 15))

    >>> int_ranges_from_int_list('1')
    ((1, 1),)

    >>> int_ranges_from_int_list('')
    ()
    """
    # Round-trip through the parser and formatter so the string is
    # sorted, merged and uses the default ',' and '-' delimiters.
    normalized = format_int_list(
        parse_int_list(range_string, delim, range_delim))

    if not normalized:
        return ()

    pairs = []
    for chunk in normalized.split(','):
        if '-' in chunk:
            low, high = chunk.split('-')
        else:
            low = high = chunk
        pairs.append((int(low), int(high)))

    return tuple(pairs)
class MultiReplace(object):
    """
    MultiReplace is a tool for doing multiple find/replace actions in one pass.

    Given a mapping of values to be replaced it allows for all of the matching
    values to be replaced in a single pass which can save a lot of performance
    on very large strings. In addition to simple replace, it also allows for
    replacing based on regular expressions.

    Keyword Arguments:

    :type regex: bool
    :param regex: Treat search keys as regular expressions [Default: False]
    :type flags: int
    :param flags: flags to pass to the regex engine during compile

    Dictionary Usage::

        from boltons import stringutils
        s = stringutils.MultiReplace({
            'foo': 'zoo',
            'cat': 'hat',
            'bat': 'kraken'
        })
        new = s.sub('The foo bar cat ate a bat')
        new == 'The zoo bar hat ate a kraken'

    Iterable Usage::

        from boltons import stringutils
        s = stringutils.MultiReplace([
            ('foo', 'zoo'),
            ('cat', 'hat'),
            ('bat', 'kraken')
        ])
        new = s.sub('The foo bar cat ate a bat')
        new == 'The zoo bar hat ate a kraken'


    The constructor can be passed a dictionary or other mapping as well as
    an iterable of tuples. If given an iterable, the substitution will be run
    in the order the replacement values are specified in the iterable. This is
    also true if it is given an OrderedDict. If given a dictionary then the
    order follows the dictionary's iteration order, which is only
    deterministic on Python versions whose dicts preserve insertion order::

        >>> 'foo bar baz'.replace('foo', 'baz').replace('baz', 'bar')
        'bar bar bar'
        >>> m = MultiReplace({'foo': 'baz', 'baz': 'bar'})
        >>> m.sub('foo bar baz')
        'baz bar bar'

    This is because the order of replacement can matter if you're inserting
    something that might be replaced by a later substitution. Pay attention and
    if you need to rely on order then consider using a list of tuples instead
    of a dictionary.
    """

    def __init__(self, sub_map, **kwargs):
        """Compile any regular expressions that have been passed.

        Args:
            sub_map: A mapping, or an iterable of ``(search, replacement)``
                pairs. Each search key is either a string or a compiled
                pattern (only its ``.pattern`` text is used).
            **kwargs: ``regex`` (bool) and ``flags`` (int), as described
                on the class. Other keys are ignored.
        """
        options = {
            'regex': False,
            'flags': 0,
        }
        options.update(kwargs)
        # Maps each generated group name to its replacement value.
        self.group_map = {}
        regex_values = []

        if isinstance(sub_map, Mapping):
            sub_map = sub_map.items()

        for idx, vals in enumerate(sub_map):
            # Every search key gets its own uniquely named group so the
            # matching alternative can be identified afterwards.
            group_name = 'group{0}'.format(idx)
            if isinstance(vals[0], basestring):
                # If we're not treating input strings like a regex, escape it
                if not options['regex']:
                    exp = re.escape(vals[0])
                else:
                    exp = vals[0]
            else:
                exp = vals[0].pattern

            regex_values.append('(?P<{}>{})'.format(group_name, exp))
            self.group_map[group_name] = vals[1]

        # One alternation of all search keys; earlier keys win ties.
        self.combined_pattern = re.compile(
            '|'.join(regex_values),
            flags=options['flags']
        )

    def _get_value(self, match):
        """Given a match object find replacement value."""
        # Each alternative is wrapped whole in its own named group, so
        # that wrapper is always the last group to close and
        # ``lastgroup`` names it. Unlike scanning ``groupdict()`` for a
        # truthy value, this also works when the match is the empty
        # string and when a user pattern defines named groups itself.
        return self.group_map[match.lastgroup]

    def sub(self, text):
        """
        Run substitutions on the input text.

        Given an input string, run all substitutions given in the
        constructor.

        Args:
            text (str): The string to search.

        Returns:
            str: *text* with every match replaced by its mapped value.
        """
        return self.combined_pattern.sub(self._get_value, text)
def multi_replace(text, sub_map, **kwargs):
    """
    One-call convenience wrapper around :class:`MultiReplace`.

    Builds a :class:`MultiReplace` from *sub_map* and *kwargs*, then
    returns the result of running it over *text*.

    Example Usage::

        from boltons.stringutils import multi_replace
        new = multi_replace(
            'The foo bar cat ate a bat',
            {'foo': 'zoo', 'cat': 'hat', 'bat': 'kraken'}
        )
        new == 'The zoo bar hat ate a kraken'
    """
    return MultiReplace(sub_map, **kwargs).sub(text)
def unwrap_text(text, ending='\n\n'):
    r"""
    Join hard-wrapped lines back into paragraphs, the natural
    complement to :func:`textwrap.wrap`.

    >>> text = "Short \n lines \nwrapped\nsmall.\n\nAnother\nparagraph."
    >>> unwrap_text(text)
    'Short lines wrapped small.\n\nAnother paragraph.'

    Every blank (whitespace-only) line closes the current paragraph,
    so consecutive blank lines produce empty paragraphs.

    Args:
        text: A string to unwrap.
        ending (str): The string to join all unwrapped paragraphs
            by. Pass ``None`` to get the list. Defaults to '\n\n' for
            compatibility with Markdown and RST.
    """
    paragraphs = []
    pending = []

    for raw_line in text.splitlines():
        stripped = raw_line.strip()
        if not stripped:
            # Blank line: close out whatever has been collected so far.
            paragraphs.append(' '.join(pending))
            pending = []
            continue
        pending.append(stripped)

    # Text that does not end on a blank line leaves one paragraph open.
    if pending:
        paragraphs.append(' '.join(pending))

    return paragraphs if ending is None else ending.join(paragraphs)
|