jsonutils.py 9.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270
  1. # -*- coding: utf-8 -*-
  2. # Copyright (c) 2013, Mahmoud Hashemi
  3. #
  4. # Redistribution and use in source and binary forms, with or without
  5. # modification, are permitted provided that the following conditions are
  6. # met:
  7. #
  8. # * Redistributions of source code must retain the above copyright
  9. # notice, this list of conditions and the following disclaimer.
  10. #
  11. # * Redistributions in binary form must reproduce the above
  12. # copyright notice, this list of conditions and the following
  13. # disclaimer in the documentation and/or other materials provided
  14. # with the distribution.
  15. #
  16. # * The names of the contributors may not be used to endorse or
  17. # promote products derived from this software without specific
  18. # prior written permission.
  19. #
  20. # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  21. # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  22. # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  23. # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  24. # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  25. # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  26. # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  27. # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  28. # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  29. # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  30. # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  31. """``jsonutils`` aims to provide various helpers for working with
  32. JSON. Currently it focuses on providing a reliable and intuitive means
  33. of working with `JSON Lines`_-formatted files.
  34. .. _JSON Lines: http://jsonlines.org/
  35. """
  36. from __future__ import print_function
  37. import io
  38. import os
  39. import json
  40. DEFAULT_BLOCKSIZE = 4096
  41. __all__ = ['JSONLIterator', 'reverse_iter_lines']
  42. def reverse_iter_lines(file_obj, blocksize=DEFAULT_BLOCKSIZE, preseek=True, encoding=None):
  43. """Returns an iterator over the lines from a file object, in
  44. reverse order, i.e., last line first, first line last. Uses the
  45. :meth:`file.seek` method of file objects, and is tested compatible with
  46. :class:`file` objects, as well as :class:`StringIO.StringIO`.
  47. Args:
  48. file_obj (file): An open file object. Note that
  49. ``reverse_iter_lines`` mutably reads from the file and
  50. other functions should not mutably interact with the file
  51. object after being passed. Files can be opened in bytes or
  52. text mode.
  53. blocksize (int): The block size to pass to
  54. :meth:`file.read()`. Warning: keep this a fairly large
  55. multiple of 2, defaults to 4096.
  56. preseek (bool): Tells the function whether or not to automatically
  57. seek to the end of the file. Defaults to ``True``.
  58. ``preseek=False`` is useful in cases when the
  59. file cursor is already in position, either at the end of
  60. the file or in the middle for relative reverse line
  61. generation.
  62. """
  63. # This function is a bit of a pain because it attempts to be byte/text agnostic
  64. try:
  65. encoding = encoding or file_obj.encoding
  66. except AttributeError:
  67. # BytesIO
  68. encoding = None
  69. else:
  70. encoding = 'utf-8'
  71. # need orig_obj to keep alive otherwise __del__ on the TextWrapper will close the file
  72. orig_obj = file_obj
  73. try:
  74. file_obj = orig_obj.detach()
  75. except (AttributeError, io.UnsupportedOperation):
  76. pass
  77. empty_bytes, newline_bytes, empty_text = b'', b'\n', u''
  78. if preseek:
  79. file_obj.seek(0, os.SEEK_END)
  80. buff = empty_bytes
  81. cur_pos = file_obj.tell()
  82. while 0 < cur_pos:
  83. read_size = min(blocksize, cur_pos)
  84. cur_pos -= read_size
  85. file_obj.seek(cur_pos, os.SEEK_SET)
  86. cur = file_obj.read(read_size)
  87. buff = cur + buff
  88. lines = buff.splitlines()
  89. if len(lines) < 2 or lines[0] == empty_bytes:
  90. continue
  91. if buff[-1:] == newline_bytes:
  92. yield empty_text if encoding else empty_bytes
  93. for line in lines[:0:-1]:
  94. yield line.decode(encoding) if encoding else line
  95. buff = lines[0]
  96. if buff:
  97. yield buff.decode(encoding) if encoding else buff
  98. """
  99. TODO: allow passthroughs for:
  100. json.load(fp[, encoding[, cls[, object_hook[, parse_float[, parse_int[, parse_constant[, object_pairs_hook[, **kw]]]]]]]])
  101. """
  102. class JSONLIterator(object):
  103. """The ``JSONLIterator`` is used to iterate over JSON-encoded objects
  104. stored in the `JSON Lines format`_ (one object per line).
  105. Most notably it has the ability to efficiently read from the
  106. bottom of files, making it very effective for reading in simple
  107. append-only JSONL use cases. It also has the ability to start from
  108. anywhere in the file and ignore corrupted lines.
  109. Args:
  110. file_obj (file): An open file object.
  111. ignore_errors (bool): Whether to skip over lines that raise an error on
  112. deserialization (:func:`json.loads`).
  113. reverse (bool): Controls the direction of the iteration.
  114. Defaults to ``False``. If set to ``True`` and *rel_seek*
  115. is unset, seeks to the end of the file before iteration
  116. begins.
  117. rel_seek (float): Used to preseek the start position of
  118. iteration. Set to 0.0 for the start of the file, 1.0 for the
  119. end, and anything in between.
  120. .. _JSON Lines format: http://jsonlines.org/
  121. """
  122. def __init__(self, file_obj,
  123. ignore_errors=False, reverse=False, rel_seek=None):
  124. self._reverse = bool(reverse)
  125. self._file_obj = file_obj
  126. self.ignore_errors = ignore_errors
  127. if rel_seek is None:
  128. if reverse:
  129. rel_seek = 1.0
  130. elif not -1.0 < rel_seek < 1.0:
  131. raise ValueError("'rel_seek' expected a float between"
  132. " -1.0 and 1.0, not %r" % rel_seek)
  133. elif rel_seek < 0:
  134. rel_seek = 1.0 - rel_seek
  135. self._rel_seek = rel_seek
  136. self._blocksize = 4096
  137. if rel_seek is not None:
  138. self._init_rel_seek()
  139. if self._reverse:
  140. self._line_iter = reverse_iter_lines(self._file_obj,
  141. blocksize=self._blocksize,
  142. preseek=False)
  143. else:
  144. self._line_iter = iter(self._file_obj)
  145. @property
  146. def cur_byte_pos(self):
  147. "A property representing where in the file the iterator is reading."
  148. return self._file_obj.tell()
  149. def _align_to_newline(self):
  150. "Aligns the file object's position to the next newline."
  151. fo, bsize = self._file_obj, self._blocksize
  152. cur, total_read = '', 0
  153. cur_pos = fo.tell()
  154. while '\n' not in cur:
  155. cur = fo.read(bsize)
  156. total_read += bsize
  157. try:
  158. newline_offset = cur.index('\n') + total_read - bsize
  159. except ValueError:
  160. raise # TODO: seek to end?
  161. fo.seek(cur_pos + newline_offset)
  162. def _init_rel_seek(self):
  163. "Sets the file object's position to the relative location set above."
  164. rs, fo = self._rel_seek, self._file_obj
  165. if rs == 0.0:
  166. fo.seek(0, os.SEEK_SET)
  167. else:
  168. fo.seek(0, os.SEEK_END)
  169. size = fo.tell()
  170. if rs == 1.0:
  171. self._cur_pos = size
  172. else:
  173. target = int(size * rs)
  174. fo.seek(target, os.SEEK_SET)
  175. self._align_to_newline()
  176. self._cur_pos = fo.tell()
  177. def __iter__(self):
  178. return self
  179. def next(self):
  180. """Yields one :class:`dict` loaded with :func:`json.loads`, advancing
  181. the file object by one line. Raises :exc:`StopIteration` upon reaching
  182. the end of the file (or beginning, if ``reverse`` was set to ``True``.
  183. """
  184. while 1:
  185. line = next(self._line_iter).lstrip()
  186. if not line:
  187. continue
  188. try:
  189. obj = json.loads(line)
  190. except Exception:
  191. if not self.ignore_errors:
  192. raise
  193. continue
  194. return obj
  195. __next__ = next
  196. if __name__ == '__main__':
  197. def _main():
  198. import sys
  199. if '-h' in sys.argv or '--help' in sys.argv:
  200. print('loads one or more JSON Line files for basic validation.')
  201. return
  202. verbose = False
  203. if '-v' in sys.argv or '--verbose' in sys.argv:
  204. verbose = True
  205. file_count, obj_count = 0, 0
  206. filenames = sys.argv[1:]
  207. for filename in filenames:
  208. if filename in ('-h', '--help', '-v', '--verbose'):
  209. continue
  210. file_count += 1
  211. with open(filename, 'rb') as file_obj:
  212. iterator = JSONLIterator(file_obj)
  213. cur_obj_count = 0
  214. while 1:
  215. try:
  216. next(iterator)
  217. except ValueError:
  218. print('error reading object #%s around byte %s in %s'
  219. % (cur_obj_count + 1, iterator.cur_byte_pos, filename))
  220. return
  221. except StopIteration:
  222. break
  223. obj_count += 1
  224. cur_obj_count += 1
  225. if verbose and obj_count and obj_count % 100 == 0:
  226. sys.stdout.write('.')
  227. if obj_count % 10000:
  228. sys.stdout.write('%s\n' % obj_count)
  229. if verbose:
  230. print('files checked: %s' % file_count)
  231. print('objects loaded: %s' % obj_count)
  232. return
  233. _main()