statsutils.py 29 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821
  1. # -*- coding: utf-8 -*-
  2. # Copyright (c) 2013, Mahmoud Hashemi
  3. #
  4. # Redistribution and use in source and binary forms, with or without
  5. # modification, are permitted provided that the following conditions are
  6. # met:
  7. #
  8. # * Redistributions of source code must retain the above copyright
  9. # notice, this list of conditions and the following disclaimer.
  10. #
  11. # * Redistributions in binary form must reproduce the above
  12. # copyright notice, this list of conditions and the following
  13. # disclaimer in the documentation and/or other materials provided
  14. # with the distribution.
  15. #
  16. # * The names of the contributors may not be used to endorse or
  17. # promote products derived from this software without specific
  18. # prior written permission.
  19. #
  20. # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  21. # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  22. # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  23. # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  24. # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  25. # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  26. # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  27. # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  28. # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  29. # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  30. # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  31. """``statsutils`` provides tools aimed primarily at descriptive
  32. statistics for data analysis, such as :func:`mean` (average),
  33. :func:`median`, :func:`variance`, and many others.
  34. The :class:`Stats` type provides all the main functionality of the
  35. ``statsutils`` module. A :class:`Stats` object wraps a given dataset,
  36. providing all statistical measures as property attributes. These
  37. attributes cache their results, which allows efficient computation of
  38. multiple measures, as many measures rely on other measures. For
  39. example, relative standard deviation (:attr:`Stats.rel_std_dev`)
  40. relies on both the mean and standard deviation. The Stats object
  41. caches those results so no rework is done.
  42. The :class:`Stats` type's attributes have module-level counterparts for
  43. convenience when the computation reuse advantages do not apply.
  44. >>> stats = Stats(range(42))
  45. >>> stats.mean
  46. 20.5
  47. >>> mean(range(42))
  48. 20.5
  49. Statistics is a large field, and ``statsutils`` is focused on a few
  50. basic techniques that are useful in software. The following is a brief
  51. introduction to those techniques. For a more in-depth introduction, see
  52. `Statistics for Software
  53. <https://www.paypal-engineering.com/2016/04/11/statistics-for-software/>`_,
  54. an article I wrote on the topic. It introduces key terminology vital
  55. to effective usage of statistics.
  56. Statistical moments
  57. -------------------
  58. Python programmers are probably familiar with the concept of the
  59. *mean* or *average*, which gives a rough quantitative middle value by
  60. which a sample can be generalized. However, the mean is just
  61. the first of four `moment`_-based measures by which a sample or
  62. distribution can be measured.
  63. The four `Standardized moments`_ are:
  64. 1. `Mean`_ - :func:`mean` - theoretical middle value
  65. 2. `Variance`_ - :func:`variance` - width of value dispersion
  66. 3. `Skewness`_ - :func:`skewness` - symmetry of distribution
  67. 4. `Kurtosis`_ - :func:`kurtosis` - "peakiness" or "long-tailed"-ness
  68. For more information check out `the Moment article on Wikipedia`_.
  69. .. _moment: https://en.wikipedia.org/wiki/Moment_(mathematics)
  70. .. _Standardized moments: https://en.wikipedia.org/wiki/Standardized_moment
  71. .. _Mean: https://en.wikipedia.org/wiki/Mean
  72. .. _Variance: https://en.wikipedia.org/wiki/Variance
  73. .. _Skewness: https://en.wikipedia.org/wiki/Skewness
  74. .. _Kurtosis: https://en.wikipedia.org/wiki/Kurtosis
  75. .. _the Moment article on Wikipedia: https://en.wikipedia.org/wiki/Moment_(mathematics)
  76. Keep in mind that while these moments can give a bit more insight into
  77. the shape and distribution of data, they do not guarantee a complete
  78. picture. Wildly different datasets can have the same values for all
  79. four moments, so generalize wisely.
  80. Robust statistics
  81. -----------------
  82. Moment-based statistics are notorious for being easily skewed by
  83. outliers. The whole field of robust statistics aims to mitigate this
  84. dilemma. ``statsutils`` also includes several robust statistical methods:
  85. * `Median`_ - The middle value of a sorted dataset
  86. * `Trimean`_ - Another robust measure of the data's central tendency
  87. * `Median Absolute Deviation`_ (MAD) - A robust measure of
  88. variability, a natural counterpart to :func:`variance`.
  89. * `Trimming`_ - Reducing a dataset to only the middle majority of
  90. data is a simple way of making other estimators more robust.
  91. .. _Median: https://en.wikipedia.org/wiki/Median
  92. .. _Trimean: https://en.wikipedia.org/wiki/Trimean
  93. .. _Median Absolute Deviation: https://en.wikipedia.org/wiki/Median_absolute_deviation
  94. .. _Trimming: https://en.wikipedia.org/wiki/Trimmed_estimator
  95. Online and Offline Statistics
  96. -----------------------------
  97. Unrelated to computer networking, `online`_ statistics involve
  98. calculating statistics in a `streaming`_ fashion, without all the data
  99. being available. The :class:`Stats` type is meant for the more
  100. traditional offline statistics when all the data is available. For
  101. pure-Python online statistics accumulators, look at the `Lithoxyl`_
  102. system instrumentation package.
  103. .. _Online: https://en.wikipedia.org/wiki/Online_algorithm
  104. .. _streaming: https://en.wikipedia.org/wiki/Streaming_algorithm
  105. .. _Lithoxyl: https://github.com/mahmoud/lithoxyl
  106. """
  107. from __future__ import print_function
  108. import bisect
  109. from math import floor, ceil
  110. class _StatsProperty(object):
  111. def __init__(self, name, func):
  112. self.name = name
  113. self.func = func
  114. self.internal_name = '_' + name
  115. doc = func.__doc__ or ''
  116. pre_doctest_doc, _, _ = doc.partition('>>>')
  117. self.__doc__ = pre_doctest_doc
  118. def __get__(self, obj, objtype=None):
  119. if obj is None:
  120. return self
  121. if not obj.data:
  122. return obj.default
  123. try:
  124. return getattr(obj, self.internal_name)
  125. except AttributeError:
  126. setattr(obj, self.internal_name, self.func(obj))
  127. return getattr(obj, self.internal_name)
  128. class Stats(object):
  129. """The ``Stats`` type is used to represent a group of unordered
  130. statistical datapoints for calculations such as mean, median, and
  131. variance.
  132. Args:
  133. data (list): List or other iterable containing numeric values.
  134. default (float): A value to be returned when a given
  135. statistical measure is not defined. 0.0 by default, but
  136. ``float('nan')`` is appropriate for stricter applications.
  137. use_copy (bool): By default Stats objects copy the initial
  138. data into a new list to avoid issues with
  139. modifications. Pass ``False`` to disable this behavior.
  140. is_sorted (bool): Presorted data can skip an extra sorting
  141. step for a little speed boost. Defaults to False.
  142. """
  143. def __init__(self, data, default=0.0, use_copy=True, is_sorted=False):
  144. self._use_copy = use_copy
  145. self._is_sorted = is_sorted
  146. if use_copy:
  147. self.data = list(data)
  148. else:
  149. self.data = data
  150. self.default = default
  151. cls = self.__class__
  152. self._prop_attr_names = [a for a in dir(self)
  153. if isinstance(getattr(cls, a, None),
  154. _StatsProperty)]
  155. self._pearson_precision = 0
  156. def __len__(self):
  157. return len(self.data)
  158. def __iter__(self):
  159. return iter(self.data)
  160. def _get_sorted_data(self):
  161. """When using a copy of the data, it's better to have that copy be
  162. sorted, but we do it lazily using this method, in case no
  163. sorted measures are used. I.e., if median is never called,
  164. sorting would be a waste.
  165. When not using a copy, it's presumed that all optimizations
  166. are on the user.
  167. """
  168. if not self._use_copy:
  169. return sorted(self.data)
  170. elif not self._is_sorted:
  171. self.data.sort()
  172. return self.data
  173. def clear_cache(self):
  174. """``Stats`` objects automatically cache intermediary calculations
  175. that can be reused. For instance, accessing the ``std_dev``
  176. attribute after the ``variance`` attribute will be
  177. significantly faster for medium-to-large datasets.
  178. If you modify the object by adding additional data points,
  179. call this function to have the cached statistics recomputed.
  180. """
  181. for attr_name in self._prop_attr_names:
  182. attr_name = getattr(self.__class__, attr_name).internal_name
  183. if not hasattr(self, attr_name):
  184. continue
  185. delattr(self, attr_name)
  186. return
  187. def _calc_count(self):
  188. """The number of items in this Stats object. Returns the same as
  189. :func:`len` on a Stats object, but provided for pandas terminology
  190. parallelism.
  191. >>> Stats(range(20)).count
  192. 20
  193. """
  194. return len(self.data)
  195. count = _StatsProperty('count', _calc_count)
  196. def _calc_mean(self):
  197. """
  198. The arithmetic mean, or "average". Sum of the values divided by
  199. the number of values.
  200. >>> mean(range(20))
  201. 9.5
  202. >>> mean(list(range(19)) + [949]) # 949 is an arbitrary outlier
  203. 56.0
  204. """
  205. return sum(self.data, 0.0) / len(self.data)
  206. mean = _StatsProperty('mean', _calc_mean)
  207. def _calc_max(self):
  208. """
  209. The maximum value present in the data.
  210. >>> Stats([2, 1, 3]).max
  211. 3
  212. """
  213. if self._is_sorted:
  214. return self.data[-1]
  215. return max(self.data)
  216. max = _StatsProperty('max', _calc_max)
  217. def _calc_min(self):
  218. """
  219. The minimum value present in the data.
  220. >>> Stats([2, 1, 3]).min
  221. 1
  222. """
  223. if self._is_sorted:
  224. return self.data[0]
  225. return min(self.data)
  226. min = _StatsProperty('min', _calc_min)
  227. def _calc_median(self):
  228. """
  229. The median is either the middle value or the average of the two
  230. middle values of a sample. Compared to the mean, it's generally
  231. more resilient to the presence of outliers in the sample.
  232. >>> median([2, 1, 3])
  233. 2
  234. >>> median(range(97))
  235. 48
  236. >>> median(list(range(96)) + [1066]) # 1066 is an arbitrary outlier
  237. 48
  238. """
  239. return self._get_quantile(self._get_sorted_data(), 0.5)
  240. median = _StatsProperty('median', _calc_median)
  241. def _calc_iqr(self):
  242. """Inter-quartile range (IQR) is the difference between the 75th
  243. percentile and 25th percentile. IQR is a robust measure of
  244. dispersion, like standard deviation, but safer to compare
  245. between datasets, as it is less influenced by outliers.
  246. >>> iqr([1, 2, 3, 4, 5])
  247. 2
  248. >>> iqr(range(1001))
  249. 500
  250. """
  251. return self.get_quantile(0.75) - self.get_quantile(0.25)
  252. iqr = _StatsProperty('iqr', _calc_iqr)
  253. def _calc_trimean(self):
  254. """The trimean is a robust measure of central tendency, like the
  255. median, that takes the weighted average of the median and the
  256. upper and lower quartiles.
  257. >>> trimean([2, 1, 3])
  258. 2.0
  259. >>> trimean(range(97))
  260. 48.0
  261. >>> trimean(list(range(96)) + [1066]) # 1066 is an arbitrary outlier
  262. 48.0
  263. """
  264. sorted_data = self._get_sorted_data()
  265. gq = lambda q: self._get_quantile(sorted_data, q)
  266. return (gq(0.25) + (2 * gq(0.5)) + gq(0.75)) / 4.0
  267. trimean = _StatsProperty('trimean', _calc_trimean)
  268. def _calc_variance(self):
  269. """\
  270. Variance is the average of the squares of the difference between
  271. each value and the mean.
  272. >>> variance(range(97))
  273. 784.0
  274. """
  275. global mean # defined elsewhere in this file
  276. return mean(self._get_pow_diffs(2))
  277. variance = _StatsProperty('variance', _calc_variance)
  278. def _calc_std_dev(self):
  279. """\
  280. Standard deviation. Square root of the variance.
  281. >>> std_dev(range(97))
  282. 28.0
  283. """
  284. return self.variance ** 0.5
  285. std_dev = _StatsProperty('std_dev', _calc_std_dev)
  286. def _calc_median_abs_dev(self):
  287. """\
  288. Median Absolute Deviation is a robust measure of statistical
  289. dispersion: http://en.wikipedia.org/wiki/Median_absolute_deviation
  290. >>> median_abs_dev(range(97))
  291. 24.0
  292. """
  293. global median # defined elsewhere in this file
  294. sorted_vals = sorted(self.data)
  295. x = float(median(sorted_vals))
  296. return median([abs(x - v) for v in sorted_vals])
  297. median_abs_dev = _StatsProperty('median_abs_dev', _calc_median_abs_dev)
  298. mad = median_abs_dev # convenience
  299. def _calc_rel_std_dev(self):
  300. """\
  301. Standard deviation divided by the absolute value of the average.
  302. http://en.wikipedia.org/wiki/Relative_standard_deviation
  303. >>> print('%1.3f' % rel_std_dev(range(97)))
  304. 0.583
  305. """
  306. abs_mean = abs(self.mean)
  307. if abs_mean:
  308. return self.std_dev / abs_mean
  309. else:
  310. return self.default
  311. rel_std_dev = _StatsProperty('rel_std_dev', _calc_rel_std_dev)
  312. def _calc_skewness(self):
  313. """\
  314. Indicates the asymmetry of a curve. Positive values mean the bulk
  315. of the values are on the left side of the average and vice versa.
  316. http://en.wikipedia.org/wiki/Skewness
  317. See the module docstring for more about statistical moments.
  318. >>> skewness(range(97)) # symmetrical around 48.0
  319. 0.0
  320. >>> left_skewed = skewness(list(range(97)) + list(range(10)))
  321. >>> right_skewed = skewness(list(range(97)) + list(range(87, 97)))
  322. >>> round(left_skewed, 3), round(right_skewed, 3)
  323. (0.114, -0.114)
  324. """
  325. data, s_dev = self.data, self.std_dev
  326. if len(data) > 1 and s_dev > 0:
  327. return (sum(self._get_pow_diffs(3)) /
  328. float((len(data) - 1) * (s_dev ** 3)))
  329. else:
  330. return self.default
  331. skewness = _StatsProperty('skewness', _calc_skewness)
  332. def _calc_kurtosis(self):
  333. """\
  334. Indicates how much data is in the tails of the distribution. The
  335. result is always positive, with the normal "bell-curve"
  336. distribution having a kurtosis of 3.
  337. http://en.wikipedia.org/wiki/Kurtosis
  338. See the module docstring for more about statistical moments.
  339. >>> kurtosis(range(9))
  340. 1.99125
  341. With a kurtosis of 1.99125, [0, 1, 2, 3, 4, 5, 6, 7, 8] is more
  342. centrally distributed than the normal curve.
  343. """
  344. data, s_dev = self.data, self.std_dev
  345. if len(data) > 1 and s_dev > 0:
  346. return (sum(self._get_pow_diffs(4)) /
  347. float((len(data) - 1) * (s_dev ** 4)))
  348. else:
  349. return 0.0
  350. kurtosis = _StatsProperty('kurtosis', _calc_kurtosis)
  351. def _calc_pearson_type(self):
  352. precision = self._pearson_precision
  353. skewness = self.skewness
  354. kurtosis = self.kurtosis
  355. beta1 = skewness ** 2.0
  356. beta2 = kurtosis * 1.0
  357. # TODO: range checks?
  358. c0 = (4 * beta2) - (3 * beta1)
  359. c1 = skewness * (beta2 + 3)
  360. c2 = (2 * beta2) - (3 * beta1) - 6
  361. if round(c1, precision) == 0:
  362. if round(beta2, precision) == 3:
  363. return 0 # Normal
  364. else:
  365. if beta2 < 3:
  366. return 2 # Symmetric Beta
  367. elif beta2 > 3:
  368. return 7
  369. elif round(c2, precision) == 0:
  370. return 3 # Gamma
  371. else:
  372. k = c1 ** 2 / (4 * c0 * c2)
  373. if k < 0:
  374. return 1 # Beta
  375. raise RuntimeError('missed a spot')
  376. pearson_type = _StatsProperty('pearson_type', _calc_pearson_type)
  377. @staticmethod
  378. def _get_quantile(sorted_data, q):
  379. data, n = sorted_data, len(sorted_data)
  380. idx = q / 1.0 * (n - 1)
  381. idx_f, idx_c = int(floor(idx)), int(ceil(idx))
  382. if idx_f == idx_c:
  383. return data[idx_f]
  384. return (data[idx_f] * (idx_c - idx)) + (data[idx_c] * (idx - idx_f))
  385. def get_quantile(self, q):
  386. """Get a quantile from the dataset. Quantiles are floating point
  387. values between ``0.0`` and ``1.0``, with ``0.0`` representing
  388. the minimum value in the dataset and ``1.0`` representing the
  389. maximum. ``0.5`` represents the median:
  390. >>> Stats(range(100)).get_quantile(0.5)
  391. 49.5
  392. """
  393. q = float(q)
  394. if not 0.0 <= q <= 1.0:
  395. raise ValueError('expected q between 0.0 and 1.0, not %r' % q)
  396. elif not self.data:
  397. return self.default
  398. return self._get_quantile(self._get_sorted_data(), q)
  399. def get_zscore(self, value):
  400. """Get the z-score for *value* in the group. If the standard deviation
  401. is 0, 0 inf or -inf will be returned to indicate whether the value is
  402. equal to, greater than or below the group's mean.
  403. """
  404. mean = self.mean
  405. if self.std_dev == 0:
  406. if value == mean:
  407. return 0
  408. if value > mean:
  409. return float('inf')
  410. if value < mean:
  411. return float('-inf')
  412. return (float(value) - mean) / self.std_dev
  413. def trim_relative(self, amount=0.15):
  414. """A utility function used to cut a proportion of values off each end
  415. of a list of values. This has the effect of limiting the
  416. effect of outliers.
  417. Args:
  418. amount (float): A value between 0.0 and 0.5 to trim off of
  419. each side of the data.
  420. .. note:
  421. This operation modifies the data in-place. It does not
  422. make or return a copy.
  423. """
  424. trim = float(amount)
  425. if not 0.0 <= trim < 0.5:
  426. raise ValueError('expected amount between 0.0 and 0.5, not %r'
  427. % trim)
  428. size = len(self.data)
  429. size_diff = int(size * trim)
  430. if size_diff == 0.0:
  431. return
  432. self.data = self._get_sorted_data()[size_diff:-size_diff]
  433. self.clear_cache()
  434. def _get_pow_diffs(self, power):
  435. """
  436. A utility function used for calculating statistical moments.
  437. """
  438. m = self.mean
  439. return [(v - m) ** power for v in self.data]
  440. def _get_bin_bounds(self, count=None, with_max=False):
  441. if not self.data:
  442. return [0.0] # TODO: raise?
  443. data = self.data
  444. len_data, min_data, max_data = len(data), min(data), max(data)
  445. if len_data < 4:
  446. if not count:
  447. count = len_data
  448. dx = (max_data - min_data) / float(count)
  449. bins = [min_data + (dx * i) for i in range(count)]
  450. elif count is None:
  451. # freedman algorithm for fixed-width bin selection
  452. q25, q75 = self.get_quantile(0.25), self.get_quantile(0.75)
  453. dx = 2 * (q75 - q25) / (len_data ** (1 / 3.0))
  454. bin_count = max(1, int(ceil((max_data - min_data) / dx)))
  455. bins = [min_data + (dx * i) for i in range(bin_count + 1)]
  456. bins = [b for b in bins if b < max_data]
  457. else:
  458. dx = (max_data - min_data) / float(count)
  459. bins = [min_data + (dx * i) for i in range(count)]
  460. if with_max:
  461. bins.append(float(max_data))
  462. return bins
  463. def get_histogram_counts(self, bins=None, **kw):
  464. """Produces a list of ``(bin, count)`` pairs comprising a histogram of
  465. the Stats object's data, using fixed-width bins. See
  466. :meth:`Stats.format_histogram` for more details.
  467. Args:
  468. bins (int): maximum number of bins, or list of
  469. floating-point bin boundaries. Defaults to the output of
  470. Freedman's algorithm.
  471. bin_digits (int): Number of digits used to round down the
  472. bin boundaries. Defaults to 1.
  473. The output of this method can be stored and/or modified, and
  474. then passed to :func:`statsutils.format_histogram_counts` to
  475. achieve the same text formatting as the
  476. :meth:`~Stats.format_histogram` method. This can be useful for
  477. snapshotting over time.
  478. """
  479. bin_digits = int(kw.pop('bin_digits', 1))
  480. if kw:
  481. raise TypeError('unexpected keyword arguments: %r' % kw.keys())
  482. if not bins:
  483. bins = self._get_bin_bounds()
  484. else:
  485. try:
  486. bin_count = int(bins)
  487. except TypeError:
  488. try:
  489. bins = [float(x) for x in bins]
  490. except Exception:
  491. raise ValueError('bins expected integer bin count or list'
  492. ' of float bin boundaries, not %r' % bins)
  493. if self.min < bins[0]:
  494. bins = [self.min] + bins
  495. else:
  496. bins = self._get_bin_bounds(bin_count)
  497. # floor and ceil really should have taken ndigits, like round()
  498. round_factor = 10.0 ** bin_digits
  499. bins = [floor(b * round_factor) / round_factor for b in bins]
  500. bins = sorted(set(bins))
  501. idxs = [bisect.bisect(bins, d) - 1 for d in self.data]
  502. count_map = {} # would have used Counter, but py26 support
  503. for idx in idxs:
  504. try:
  505. count_map[idx] += 1
  506. except KeyError:
  507. count_map[idx] = 1
  508. bin_counts = [(b, count_map.get(i, 0)) for i, b in enumerate(bins)]
  509. return bin_counts
  510. def format_histogram(self, bins=None, **kw):
  511. """Produces a textual histogram of the data, using fixed-width bins,
  512. allowing for simple visualization, even in console environments.
  513. >>> data = list(range(20)) + list(range(5, 15)) + [10]
  514. >>> print(Stats(data).format_histogram(width=30))
  515. 0.0: 5 #########
  516. 4.4: 8 ###############
  517. 8.9: 11 ####################
  518. 13.3: 5 #########
  519. 17.8: 2 ####
  520. In this histogram, five values are between 0.0 and 4.4, eight
  521. are between 4.4 and 8.9, and two values lie between 17.8 and
  522. the max.
  523. You can specify the number of bins, or provide a list of
  524. bin boundaries themselves. If no bins are provided, as in the
  525. example above, `Freedman's algorithm`_ for bin selection is
  526. used.
  527. Args:
  528. bins (int): Maximum number of bins for the
  529. histogram. Also accepts a list of floating-point
  530. bin boundaries. If the minimum boundary is still
  531. greater than the minimum value in the data, that
  532. boundary will be implicitly added. Defaults to the bin
  533. boundaries returned by `Freedman's algorithm`_.
  534. bin_digits (int): Number of digits to round each bin
  535. to. Note that bins are always rounded down to avoid
  536. clipping any data. Defaults to 1.
  537. width (int): integer number of columns in the longest line
  538. in the histogram. Defaults to console width on Python
  539. 3.3+, or 80 if that is not available.
  540. format_bin (callable): Called on each bin to create a
  541. label for the final output. Use this function to add
  542. units, such as "ms" for milliseconds.
  543. Should you want something more programmatically reusable, see
  544. the :meth:`~Stats.get_histogram_counts` method, the output of
  545. is used by format_histogram. The :meth:`~Stats.describe`
  546. method is another useful summarization method, albeit less
  547. visual.
  548. .. _Freedman's algorithm: https://en.wikipedia.org/wiki/Freedman%E2%80%93Diaconis_rule
  549. """
  550. width = kw.pop('width', None)
  551. format_bin = kw.pop('format_bin', None)
  552. bin_counts = self.get_histogram_counts(bins=bins, **kw)
  553. return format_histogram_counts(bin_counts,
  554. width=width,
  555. format_bin=format_bin)
  556. def describe(self, quantiles=None, format=None):
  557. """Provides standard summary statistics for the data in the Stats
  558. object, in one of several convenient formats.
  559. Args:
  560. quantiles (list): A list of numeric values to use as
  561. quantiles in the resulting summary. All values must be
  562. 0.0-1.0, with 0.5 representing the median. Defaults to
  563. ``[0.25, 0.5, 0.75]``, representing the standard
  564. quartiles.
  565. format (str): Controls the return type of the function,
  566. with one of three valid values: ``"dict"`` gives back
  567. a :class:`dict` with the appropriate keys and
  568. values. ``"list"`` is a list of key-value pairs in an
  569. order suitable to pass to an OrderedDict or HTML
  570. table. ``"text"`` converts the values to text suitable
  571. for printing, as seen below.
  572. Here is the information returned by a default ``describe``, as
  573. presented in the ``"text"`` format:
  574. >>> stats = Stats(range(1, 8))
  575. >>> print(stats.describe(format='text'))
  576. count: 7
  577. mean: 4.0
  578. std_dev: 2.0
  579. mad: 2.0
  580. min: 1
  581. 0.25: 2.5
  582. 0.5: 4
  583. 0.75: 5.5
  584. max: 7
  585. For more advanced descriptive statistics, check out my blog
  586. post on the topic `Statistics for Software
  587. <https://www.paypal-engineering.com/2016/04/11/statistics-for-software/>`_.
  588. """
  589. if format is None:
  590. format = 'dict'
  591. elif format not in ('dict', 'list', 'text'):
  592. raise ValueError('invalid format for describe,'
  593. ' expected one of "dict"/"list"/"text", not %r'
  594. % format)
  595. quantiles = quantiles or [0.25, 0.5, 0.75]
  596. q_items = []
  597. for q in quantiles:
  598. q_val = self.get_quantile(q)
  599. q_items.append((str(q), q_val))
  600. items = [('count', self.count),
  601. ('mean', self.mean),
  602. ('std_dev', self.std_dev),
  603. ('mad', self.mad),
  604. ('min', self.min)]
  605. items.extend(q_items)
  606. items.append(('max', self.max))
  607. if format == 'dict':
  608. ret = dict(items)
  609. elif format == 'list':
  610. ret = items
  611. elif format == 'text':
  612. ret = '\n'.join(['%s%s' % ((label + ':').ljust(10), val)
  613. for label, val in items])
  614. return ret
  615. def describe(data, quantiles=None, format=None):
  616. """A convenience function to get standard summary statistics useful
  617. for describing most data. See :meth:`Stats.describe` for more
  618. details.
  619. >>> print(describe(range(7), format='text'))
  620. count: 7
  621. mean: 3.0
  622. std_dev: 2.0
  623. mad: 2.0
  624. min: 0
  625. 0.25: 1.5
  626. 0.5: 3
  627. 0.75: 4.5
  628. max: 6
  629. See :meth:`Stats.format_histogram` for another very useful
  630. summarization that uses textual visualization.
  631. """
  632. return Stats(data).describe(quantiles=quantiles, format=format)
  633. def _get_conv_func(attr_name):
  634. def stats_helper(data, default=0.0):
  635. return getattr(Stats(data, default=default, use_copy=False),
  636. attr_name)
  637. return stats_helper
  638. for attr_name, attr in list(Stats.__dict__.items()):
  639. if isinstance(attr, _StatsProperty):
  640. if attr_name in ('max', 'min', 'count'): # don't shadow builtins
  641. continue
  642. if attr_name in ('mad',): # convenience aliases
  643. continue
  644. func = _get_conv_func(attr_name)
  645. func.__doc__ = attr.func.__doc__
  646. globals()[attr_name] = func
  647. delattr(Stats, '_calc_' + attr_name)
  648. # cleanup
  649. del attr
  650. del attr_name
  651. del func
  652. def format_histogram_counts(bin_counts, width=None, format_bin=None):
  653. """The formatting logic behind :meth:`Stats.format_histogram`, which
  654. takes the output of :meth:`Stats.get_histogram_counts`, and passes
  655. them to this function.
  656. Args:
  657. bin_counts (list): A list of bin values to counts.
  658. width (int): Number of character columns in the text output,
  659. defaults to 80 or console width in Python 3.3+.
  660. format_bin (callable): Used to convert bin values into string
  661. labels.
  662. """
  663. lines = []
  664. if not format_bin:
  665. format_bin = lambda v: v
  666. if not width:
  667. try:
  668. import shutil # python 3 convenience
  669. width = shutil.get_terminal_size()[0]
  670. except Exception:
  671. width = 80
  672. bins = [b for b, _ in bin_counts]
  673. count_max = max([count for _, count in bin_counts])
  674. count_cols = len(str(count_max))
  675. labels = ['%s' % format_bin(b) for b in bins]
  676. label_cols = max([len(l) for l in labels])
  677. tmp_line = '%s: %s #' % ('x' * label_cols, count_max)
  678. bar_cols = max(width - len(tmp_line), 3)
  679. line_k = float(bar_cols) / count_max
  680. tmpl = "{label:>{label_cols}}: {count:>{count_cols}} {bar}"
  681. for label, (bin_val, count) in zip(labels, bin_counts):
  682. bar_len = int(round(count * line_k))
  683. bar = ('#' * bar_len) or '|'
  684. line = tmpl.format(label=label,
  685. label_cols=label_cols,
  686. count=count,
  687. count_cols=count_cols,
  688. bar=bar)
  689. lines.append(line)
  690. return '\n'.join(lines)