123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270 |
- # -*- coding: utf-8 -*-
- # Copyright (c) 2013, Mahmoud Hashemi
- #
- # Redistribution and use in source and binary forms, with or without
- # modification, are permitted provided that the following conditions are
- # met:
- #
- # * Redistributions of source code must retain the above copyright
- # notice, this list of conditions and the following disclaimer.
- #
- # * Redistributions in binary form must reproduce the above
- # copyright notice, this list of conditions and the following
- # disclaimer in the documentation and/or other materials provided
- # with the distribution.
- #
- # * The names of the contributors may not be used to endorse or
- # promote products derived from this software without specific
- # prior written permission.
- #
- # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- """``jsonutils`` aims to provide various helpers for working with
- JSON. Currently it focuses on providing a reliable and intuitive means
- of working with `JSON Lines`_-formatted files.
- .. _JSON Lines: http://jsonlines.org/
- """
- from __future__ import print_function
- import io
- import os
- import json
- DEFAULT_BLOCKSIZE = 4096
- __all__ = ['JSONLIterator', 'reverse_iter_lines']
def reverse_iter_lines(file_obj, blocksize=DEFAULT_BLOCKSIZE, preseek=True, encoding=None):
    """Returns an iterator over the lines from a file object, in
    reverse order, i.e., last line first, first line last. Uses the
    :meth:`file.seek` method of file objects, and is tested compatible with
    :class:`file` objects, as well as :class:`StringIO.StringIO`.

    The file is read backwards in blocks of *blocksize* bytes. Only
    complete lines are emitted; the (possibly partial) first line of
    the data read so far is held back until more of the file has been
    read, so multi-byte characters are never decoded in pieces.

    Args:
        file_obj (file): An open file object. Note that
            ``reverse_iter_lines`` mutably reads from the file and
            other functions should not mutably interact with the file
            object after being passed. Files can be opened in bytes or
            text mode. When the object supports ``detach()``, the
            underlying stream is detached and read directly.
        blocksize (int): The block size to pass to
            :meth:`file.read()`. Warning: keep this a fairly large
            multiple of 2, defaults to 4096. Must be positive.
        preseek (bool): Tells the function whether or not to automatically
            seek to the end of the file. Defaults to ``True``.
            ``preseek=False`` is useful in cases when the
            file cursor is already in position, either at the end of
            the file or in the middle for relative reverse line
            generation.
        encoding (str): Encoding used to decode each line. When unset,
            the ``encoding`` attribute of *file_obj* is used, falling
            back to ``'utf-8'`` if that attribute exists but is empty.
            If *file_obj* has no ``encoding`` attribute at all (binary
            mode), lines are yielded undecoded.

    Yields:
        Each line without its line terminator: text if an encoding
        was determined, bytes otherwise. If the data ends with a
        newline, an empty line is yielded first.

    Raises:
        ValueError: if *blocksize* is not positive (raised when
            iteration starts, as this is a generator).
    """
    if blocksize < 1:
        # a zero/negative block size would never advance toward the
        # start of the file and the loop below would not terminate
        raise ValueError('blocksize must be a positive integer, not %r'
                         % (blocksize,))

    # This function is a bit of a pain because it attempts to be
    # byte/text agnostic. An explicit encoding wins, then the file's
    # own encoding; 'utf-8' is only a fallback for text-like objects
    # that do not report one.
    if not encoding:
        try:
            encoding = file_obj.encoding or 'utf-8'
        except AttributeError:
            # binary-mode objects (e.g., BytesIO) have no ``encoding``
            encoding = None

    # need orig_obj to keep alive otherwise __del__ on the TextWrapper
    # will close the file
    orig_obj = file_obj
    try:
        file_obj = orig_obj.detach()
    except (AttributeError, io.UnsupportedOperation):
        # NOTE(review): a text object that cannot be detached (such as
        # io.StringIO on Python 3) returns text from read(), which
        # cannot be joined to the bytes buffer below -- confirm whether
        # such objects need to be supported.
        pass

    empty_bytes, newline_bytes, empty_text = b'', b'\n', u''

    if preseek:
        file_obj.seek(0, os.SEEK_END)
    buff = empty_bytes
    cur_pos = file_obj.tell()
    while 0 < cur_pos:
        # step one block toward the start of the file and prepend it
        read_size = min(blocksize, cur_pos)
        cur_pos -= read_size
        file_obj.seek(cur_pos, os.SEEK_SET)
        cur = file_obj.read(read_size)
        buff = cur + buff
        lines = buff.splitlines()

        # Fewer than two lines means no complete line is available yet.
        # An empty first line means the buffer starts with a line
        # terminator, so keep reading before emitting anything.
        if len(lines) < 2 or lines[0] == empty_bytes:
            continue
        if buff[-1:] == newline_bytes:
            # trailing newline: emit the empty "line" that follows it
            yield empty_text if encoding else empty_bytes
        for line in lines[:0:-1]:
            yield line.decode(encoding) if encoding else line
        # the first line may be incomplete; carry it into the next pass
        buff = lines[0]
    if buff:
        yield buff.decode(encoding) if encoding else buff
- """
- TODO: allow passthroughs for:
- json.load(fp[, encoding[, cls[, object_hook[, parse_float[, parse_int[, parse_constant[, object_pairs_hook[, **kw]]]]]]]])
- """
class JSONLIterator(object):
    """The ``JSONLIterator`` is used to iterate over JSON-encoded objects
    stored in the `JSON Lines format`_ (one object per line).

    Most notably it has the ability to efficiently read from the
    bottom of files, making it very effective for reading in simple
    append-only JSONL use cases. It also has the ability to start from
    anywhere in the file and ignore corrupted lines.

    Args:
        file_obj (file): An open file object.
        ignore_errors (bool): Whether to skip over lines that raise an error on
            deserialization (:func:`json.loads`).
        reverse (bool): Controls the direction of the iteration.
            Defaults to ``False``. If set to ``True`` and *rel_seek*
            is unset, seeks to the end of the file before iteration
            begins.
        rel_seek (float): Used to preseek the start position of
            iteration. Set to 0.0 for the start of the file, 1.0 for the
            end, and anything in between. Negative values are measured
            back from the end of the file, e.g., ``-0.25`` is equivalent
            to ``0.75``.

    Raises:
        ValueError: if *rel_seek* is outside the range -1.0 to 1.0.

    .. _JSON Lines format: http://jsonlines.org/
    """
    def __init__(self, file_obj,
                 ignore_errors=False, reverse=False, rel_seek=None):
        self._reverse = bool(reverse)
        self._file_obj = file_obj
        self.ignore_errors = ignore_errors

        if rel_seek is None:
            if reverse:
                rel_seek = 1.0
        elif not -1.0 <= rel_seek <= 1.0:
            # bounds are inclusive: 1.0 (end of file) is a documented value
            raise ValueError("'rel_seek' expected a float between"
                             " -1.0 and 1.0, not %r" % rel_seek)
        elif rel_seek < 0:
            # negative values wrap around from the end of the file
            rel_seek = 1.0 + rel_seek
        self._rel_seek = rel_seek
        self._blocksize = 4096
        if rel_seek is not None:
            self._init_rel_seek()
        if self._reverse:
            # the file is already positioned, so no preseek to the end
            self._line_iter = reverse_iter_lines(self._file_obj,
                                                 blocksize=self._blocksize,
                                                 preseek=False)
        else:
            self._line_iter = iter(self._file_obj)

    @property
    def cur_byte_pos(self):
        "A property representing where in the file the iterator is reading."
        return self._file_obj.tell()

    def _align_to_newline(self):
        """Aligns the file object's position to the next newline.

        Reads forward in blocks from the current position and leaves
        the file positioned on the first newline character found. If
        there is no newline before the end of the file, the file is
        left positioned at its end.
        """
        fo, bsize = self._file_obj, self._blocksize
        start_pos = fo.tell()
        consumed = 0
        while True:
            chunk = fo.read(bsize)
            if not chunk:
                # hit EOF without finding a newline
                fo.seek(0, os.SEEK_END)
                return
            # match the newline type to whatever the file hands back so
            # both binary- and text-mode objects work
            newline = b'\n' if isinstance(chunk, bytes) else u'\n'
            offset = chunk.find(newline)
            if offset >= 0:
                # NOTE(review): for text-mode files this treats character
                # counts as seek offsets -- fine for single-byte content,
                # confirm for multi-byte encodings.
                fo.seek(start_pos + consumed + offset)
                return
            consumed += len(chunk)

    def _init_rel_seek(self):
        "Sets the file object's position to the relative location set above."
        rs, fo = self._rel_seek, self._file_obj
        if rs == 0.0:
            fo.seek(0, os.SEEK_SET)
        else:
            fo.seek(0, os.SEEK_END)
            if rs != 1.0:
                size = fo.tell()
                target = int(size * rs)
                fo.seek(target, os.SEEK_SET)
                # the target most likely lands mid-line; move to a line
                # boundary so a partial object is never parsed
                self._align_to_newline()
        # record the starting position for every branch
        self._cur_pos = fo.tell()

    def __iter__(self):
        return self

    def next(self):
        """Yields one :class:`dict` loaded with :func:`json.loads`, advancing
        the file object by one line. Raises :exc:`StopIteration` upon reaching
        the end of the file (or beginning, if ``reverse`` was set to ``True``).

        Blank lines are skipped. Lines that fail to deserialize are
        skipped when ``ignore_errors`` is set; otherwise the error is
        re-raised.
        """
        while 1:
            line = next(self._line_iter).lstrip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except Exception:
                if not self.ignore_errors:
                    raise
                continue
            return obj

    __next__ = next
if __name__ == '__main__':
    def _main():
        """Command-line check that loads every object from the JSON
        Lines files named in ``sys.argv``.

        Recognizes ``-h``/``--help`` (print a usage line and exit) and
        ``-v``/``--verbose`` (print progress and a final summary).
        Stops at the first line that fails to parse, reporting the
        object number, approximate byte position, and filename.
        """
        import sys
        if '-h' in sys.argv or '--help' in sys.argv:
            print('loads one or more JSON Line files for basic validation.')
            return
        verbose = False
        if '-v' in sys.argv or '--verbose' in sys.argv:
            verbose = True
        file_count, obj_count = 0, 0
        filenames = sys.argv[1:]
        for filename in filenames:
            # flags share argv with the filenames; skip over them
            if filename in ('-h', '--help', '-v', '--verbose'):
                continue
            file_count += 1
            with open(filename, 'rb') as file_obj:
                iterator = JSONLIterator(file_obj)
                cur_obj_count = 0
                while 1:
                    try:
                        next(iterator)
                    except ValueError:
                        print('error reading object #%s around byte %s in %s'
                              % (cur_obj_count + 1, iterator.cur_byte_pos, filename))
                        return
                    except StopIteration:
                        break
                    obj_count += 1
                    cur_obj_count += 1
                    # progress: a dot per 100 objects, and the running
                    # total on its own line every 10000 objects
                    if verbose and obj_count and obj_count % 100 == 0:
                        sys.stdout.write('.')
                        if obj_count % 10000 == 0:
                            sys.stdout.write('%s\n' % obj_count)
        if verbose:
            print('files checked: %s' % file_count)
            print('objects loaded: %s' % obj_count)
        return

    _main()
|