ioutils.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604
  1. # -*- coding: utf-8 -*-
  2. # Copyright (c) 2013, Mahmoud Hashemi
  3. #
  4. # Redistribution and use in source and binary forms, with or without
  5. # modification, are permitted provided that the following conditions are
  6. # met:
  7. #
  8. # * Redistributions of source code must retain the above copyright
  9. # notice, this list of conditions and the following disclaimer.
  10. #
  11. # * Redistributions in binary form must reproduce the above
  12. # copyright notice, this list of conditions and the following
  13. # disclaimer in the documentation and/or other materials provided
  14. # with the distribution.
  15. #
  16. # * The names of the contributors may not be used to endorse or
  17. # promote products derived from this software without specific
  18. # prior written permission.
  19. #
  20. # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  21. # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  22. # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  23. # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  24. # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  25. # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  26. # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  27. # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  28. # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  29. # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  30. # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  31. # Coding decl above needed for rendering the emdash properly in the
  32. # documentation.
  33. """
  34. Module ``ioutils`` implements a number of helper classes and functions which
  35. are useful when dealing with input, output, and bytestreams in a variety of
  36. ways.
  37. """
  38. import os
  39. from io import BytesIO, IOBase
  40. from abc import (
  41. ABCMeta,
  42. abstractmethod,
  43. abstractproperty,
  44. )
  45. from errno import EINVAL
  46. from codecs import EncodedFile
  47. from tempfile import TemporaryFile
  48. try:
  49. from itertools import izip_longest as zip_longest # Python 2
  50. except ImportError:
  51. from itertools import zip_longest # Python 3
# Resolve the text and binary string types for the running interpreter.
# ``unicode`` only exists as a builtin on Python 2, so hitting NameError
# means we are on Python 3 and fall back to ``str`` / ``bytes``.
try:
    text_type = unicode  # Python 2
    binary_type = str
except NameError:
    text_type = str  # Python 3
    binary_type = bytes
# Chunk size SpooledStringIO uses when it scans its buffer to seek to a
# codepoint or to count codepoints.
READ_CHUNK_SIZE = 21333
"""
Number of bytes to read at a time. The value is ~ 1/3rd of 64k which means that
the value will easily fit in the L2 cache of most processors even if every
codepoint in a string is three bytes long which makes it a nice fast default
value.
"""
  65. class SpooledIOBase(IOBase):
  66. """
  67. A base class shared by the SpooledBytesIO and SpooledStringIO classes.
  68. The SpooledTemporaryFile class is missing several attributes and methods
  69. present in the StringIO implementation. This brings the api as close to
  70. parity as possible so that classes derived from SpooledIOBase can be used
  71. as near drop-in replacements to save memory.
  72. """
  73. __metaclass__ = ABCMeta
  74. def __init__(self, max_size=5000000, dir=None):
  75. self._max_size = max_size
  76. self._dir = dir
  77. def _checkClosed(self, msg=None):
  78. """Raise a ValueError if file is closed"""
  79. if self.closed:
  80. raise ValueError('I/O operation on closed file.'
  81. if msg is None else msg)
  82. @abstractmethod
  83. def read(self, n=-1):
  84. """Read n characters from the buffer"""
  85. @abstractmethod
  86. def write(self, s):
  87. """Write into the buffer"""
  88. @abstractmethod
  89. def seek(self, pos, mode=0):
  90. """Seek to a specific point in a file"""
  91. @abstractmethod
  92. def readline(self, length=None):
  93. """Returns the next available line"""
  94. @abstractmethod
  95. def readlines(self, sizehint=0):
  96. """Returns a list of all lines from the current position forward"""
  97. def writelines(self, lines):
  98. """
  99. Write lines to the file from an interable.
  100. NOTE: writelines() does NOT add line separators.
  101. """
  102. self._checkClosed()
  103. for line in lines:
  104. self.write(line)
  105. @abstractmethod
  106. def rollover(self):
  107. """Roll file-like-object over into a real temporary file"""
  108. @abstractmethod
  109. def tell(self):
  110. """Return the current position"""
  111. @abstractproperty
  112. def buffer(self):
  113. """Should return a flo instance"""
  114. @abstractproperty
  115. def _rolled(self):
  116. """Returns whether the file has been rolled to a real file or not"""
  117. @abstractproperty
  118. def len(self):
  119. """Returns the length of the data"""
  120. def _get_softspace(self):
  121. return self.buffer.softspace
  122. def _set_softspace(self, val):
  123. self.buffer.softspace = val
  124. softspace = property(_get_softspace, _set_softspace)
  125. @property
  126. def _file(self):
  127. return self.buffer
  128. def close(self):
  129. return self.buffer.close()
  130. def flush(self):
  131. self._checkClosed()
  132. return self.buffer.flush()
  133. def isatty(self):
  134. self._checkClosed()
  135. return self.buffer.isatty()
  136. @property
  137. def closed(self):
  138. return self.buffer.closed
  139. @property
  140. def pos(self):
  141. return self.tell()
  142. @property
  143. def buf(self):
  144. return self.getvalue()
  145. def fileno(self):
  146. self.rollover()
  147. return self.buffer.fileno()
  148. def truncate(self, size=None):
  149. """
  150. Truncate the contents of the buffer.
  151. Custom version of truncate that takes either no arguments (like the
  152. real SpooledTemporaryFile) or a single argument that truncates the
  153. value to a certain index location.
  154. """
  155. self._checkClosed()
  156. if size is None:
  157. return self.buffer.truncate()
  158. if size < 0:
  159. raise IOError(EINVAL, "Negative size not allowed")
  160. # Emulate truncation to a particular location
  161. pos = self.tell()
  162. self.seek(size)
  163. self.buffer.truncate()
  164. if pos < size:
  165. self.seek(pos)
  166. def getvalue(self):
  167. """Return the entire files contents."""
  168. self._checkClosed()
  169. pos = self.tell()
  170. self.seek(0)
  171. val = self.read()
  172. self.seek(pos)
  173. return val
  174. def seekable(self):
  175. return True
  176. def readable(self):
  177. return True
  178. def writable(self):
  179. return True
  180. def __next__(self):
  181. self._checkClosed()
  182. line = self.readline()
  183. if not line:
  184. pos = self.buffer.tell()
  185. self.buffer.seek(0, os.SEEK_END)
  186. if pos == self.buffer.tell():
  187. raise StopIteration
  188. else:
  189. self.buffer.seek(pos)
  190. return line
  191. next = __next__
  192. def __len__(self):
  193. return self.len
  194. def __iter__(self):
  195. self._checkClosed()
  196. return self
  197. def __enter__(self):
  198. self._checkClosed()
  199. return self
  200. def __exit__(self, *args):
  201. self._file.close()
  202. def __eq__(self, other):
  203. if isinstance(other, self.__class__):
  204. self_pos = self.tell()
  205. other_pos = other.tell()
  206. try:
  207. self.seek(0)
  208. other.seek(0)
  209. eq = True
  210. for self_line, other_line in zip_longest(self, other):
  211. if self_line != other_line:
  212. eq = False
  213. break
  214. self.seek(self_pos)
  215. other.seek(other_pos)
  216. except Exception:
  217. # Attempt to return files to original position if there were any errors
  218. try:
  219. self.seek(self_pos)
  220. except Exception:
  221. pass
  222. try:
  223. other.seek(other_pos)
  224. except Exception:
  225. pass
  226. raise
  227. else:
  228. return eq
  229. return False
  230. def __ne__(self, other):
  231. return not self.__eq__(other)
  232. def __bool__(self):
  233. return True
  234. def __del__(self):
  235. """Can fail when called at program exit so suppress traceback."""
  236. try:
  237. self.close()
  238. except Exception:
  239. pass
  240. __nonzero__ = __bool__
  241. class SpooledBytesIO(SpooledIOBase):
  242. """
  243. SpooledBytesIO is a spooled file-like-object that only accepts bytes. On
  244. Python 2.x this means the 'str' type; on Python 3.x this means the 'bytes'
  245. type. Bytes are written in and retrieved exactly as given, but it will
  246. raise TypeErrors if something other than bytes are written.
  247. Example::
  248. >>> from boltons import ioutils
  249. >>> with ioutils.SpooledBytesIO() as f:
  250. ... f.write(b"Happy IO")
  251. ... _ = f.seek(0)
  252. ... isinstance(f.getvalue(), ioutils.binary_type)
  253. True
  254. """
  255. def read(self, n=-1):
  256. self._checkClosed()
  257. return self.buffer.read(n)
  258. def write(self, s):
  259. self._checkClosed()
  260. if not isinstance(s, binary_type):
  261. raise TypeError("{} expected, got {}".format(
  262. binary_type.__name__,
  263. type(s).__name__
  264. ))
  265. if self.tell() + len(s) >= self._max_size:
  266. self.rollover()
  267. self.buffer.write(s)
  268. def seek(self, pos, mode=0):
  269. self._checkClosed()
  270. return self.buffer.seek(pos, mode)
  271. def readline(self, length=None):
  272. self._checkClosed()
  273. if length:
  274. return self.buffer.readline(length)
  275. else:
  276. return self.buffer.readline()
  277. def readlines(self, sizehint=0):
  278. return self.buffer.readlines(sizehint)
  279. def rollover(self):
  280. """Roll the StringIO over to a TempFile"""
  281. if not self._rolled:
  282. tmp = TemporaryFile(dir=self._dir)
  283. pos = self.buffer.tell()
  284. tmp.write(self.buffer.getvalue())
  285. tmp.seek(pos)
  286. self.buffer.close()
  287. self._buffer = tmp
  288. @property
  289. def _rolled(self):
  290. return not isinstance(self.buffer, BytesIO)
  291. @property
  292. def buffer(self):
  293. try:
  294. return self._buffer
  295. except AttributeError:
  296. self._buffer = BytesIO()
  297. return self._buffer
  298. @property
  299. def len(self):
  300. """Determine the length of the file"""
  301. pos = self.tell()
  302. if self._rolled:
  303. self.seek(0)
  304. val = os.fstat(self.fileno()).st_size
  305. else:
  306. self.seek(0, os.SEEK_END)
  307. val = self.tell()
  308. self.seek(pos)
  309. return val
  310. def tell(self):
  311. self._checkClosed()
  312. return self.buffer.tell()
  313. class SpooledStringIO(SpooledIOBase):
  314. """
  315. SpooledStringIO is a spooled file-like-object that only accepts unicode
  316. values. On Python 2.x this means the 'unicode' type and on Python 3.x this
  317. means the 'str' type. Values are accepted as unicode and then coerced into
  318. utf-8 encoded bytes for storage. On retrieval, the values are returned as
  319. unicode.
  320. Example::
  321. >>> from boltons import ioutils
  322. >>> with ioutils.SpooledStringIO() as f:
  323. ... f.write(u"\u2014 Hey, an emdash!")
  324. ... _ = f.seek(0)
  325. ... isinstance(f.read(), ioutils.text_type)
  326. True
  327. """
  328. def __init__(self, *args, **kwargs):
  329. self._tell = 0
  330. super(SpooledStringIO, self).__init__(*args, **kwargs)
  331. def read(self, n=-1):
  332. self._checkClosed()
  333. ret = self.buffer.reader.read(n, n)
  334. self._tell = self.tell() + len(ret)
  335. return ret
  336. def write(self, s):
  337. self._checkClosed()
  338. if not isinstance(s, text_type):
  339. raise TypeError("{} expected, got {}".format(
  340. text_type.__name__,
  341. type(s).__name__
  342. ))
  343. current_pos = self.tell()
  344. if self.buffer.tell() + len(s.encode('utf-8')) >= self._max_size:
  345. self.rollover()
  346. self.buffer.write(s.encode('utf-8'))
  347. self._tell = current_pos + len(s)
  348. def _traverse_codepoints(self, current_position, n):
  349. """Traverse from current position to the right n codepoints"""
  350. dest = current_position + n
  351. while True:
  352. if current_position == dest:
  353. # By chance we've landed on the right position, break
  354. break
  355. # If the read would take us past the intended position then
  356. # seek only enough to cover the offset
  357. if current_position + READ_CHUNK_SIZE > dest:
  358. self.read(dest - current_position)
  359. break
  360. else:
  361. ret = self.read(READ_CHUNK_SIZE)
  362. # Increment our current position
  363. current_position += READ_CHUNK_SIZE
  364. # If we kept reading but there was nothing here, break
  365. # as we are at the end of the file
  366. if not ret:
  367. break
  368. return dest
  369. def seek(self, pos, mode=0):
  370. """Traverse from offset to the specified codepoint"""
  371. self._checkClosed()
  372. # Seek to position from the start of the file
  373. if mode == os.SEEK_SET:
  374. self.buffer.seek(0)
  375. self._traverse_codepoints(0, pos)
  376. self._tell = pos
  377. # Seek to new position relative to current position
  378. elif mode == os.SEEK_CUR:
  379. start_pos = self.tell()
  380. self._traverse_codepoints(self.tell(), pos)
  381. self._tell = start_pos + pos
  382. elif mode == os.SEEK_END:
  383. self.buffer.seek(0)
  384. dest_position = self.len - pos
  385. self._traverse_codepoints(0, dest_position)
  386. self._tell = dest_position
  387. else:
  388. raise ValueError(
  389. "Invalid whence ({0}, should be 0, 1, or 2)".format(mode)
  390. )
  391. return self.tell()
  392. def readline(self, length=None):
  393. self._checkClosed()
  394. ret = self.buffer.readline(length).decode('utf-8')
  395. self._tell = self.tell() + len(ret)
  396. return ret
  397. def readlines(self, sizehint=0):
  398. ret = [x.decode('utf-8') for x in self.buffer.readlines(sizehint)]
  399. self._tell = self.tell() + sum((len(x) for x in ret))
  400. return ret
  401. @property
  402. def buffer(self):
  403. try:
  404. return self._buffer
  405. except AttributeError:
  406. self._buffer = EncodedFile(BytesIO(), data_encoding='utf-8')
  407. return self._buffer
  408. @property
  409. def _rolled(self):
  410. return not isinstance(self.buffer.stream, BytesIO)
  411. def rollover(self):
  412. """Roll the buffer over to a TempFile"""
  413. if not self._rolled:
  414. tmp = EncodedFile(TemporaryFile(dir=self._dir),
  415. data_encoding='utf-8')
  416. pos = self.buffer.tell()
  417. tmp.write(self.buffer.getvalue())
  418. tmp.seek(pos)
  419. self.buffer.close()
  420. self._buffer = tmp
  421. def tell(self):
  422. """Return the codepoint position"""
  423. self._checkClosed()
  424. return self._tell
  425. @property
  426. def len(self):
  427. """Determine the number of codepoints in the file"""
  428. pos = self.buffer.tell()
  429. self.buffer.seek(0)
  430. total = 0
  431. while True:
  432. ret = self.read(READ_CHUNK_SIZE)
  433. if not ret:
  434. break
  435. total += len(ret)
  436. self.buffer.seek(pos)
  437. return total
  438. def is_text_fileobj(fileobj):
  439. if getattr(fileobj, 'encoding', False):
  440. # codecs.open and io.TextIOBase
  441. return True
  442. if getattr(fileobj, 'getvalue', False):
  443. # StringIO.StringIO / cStringIO.StringIO / io.StringIO
  444. try:
  445. if isinstance(fileobj.getvalue(), type(u'')):
  446. return True
  447. except Exception:
  448. pass
  449. return False
  450. class MultiFileReader(object):
  451. """Takes a list of open files or file-like objects and provides an
  452. interface to read from them all contiguously. Like
  453. :func:`itertools.chain()`, but for reading files.
  454. >>> mfr = MultiFileReader(BytesIO(b'ab'), BytesIO(b'cd'), BytesIO(b'e'))
  455. >>> mfr.read(3).decode('ascii')
  456. u'abc'
  457. >>> mfr.read(3).decode('ascii')
  458. u'de'
  459. The constructor takes as many fileobjs as you hand it, and will
  460. raise a TypeError on non-file-like objects. A ValueError is raised
  461. when file-like objects are a mix of bytes- and text-handling
  462. objects (for instance, BytesIO and StringIO).
  463. """
  464. def __init__(self, *fileobjs):
  465. if not all([callable(getattr(f, 'read', None)) and
  466. callable(getattr(f, 'seek', None)) for f in fileobjs]):
  467. raise TypeError('MultiFileReader expected file-like objects'
  468. ' with .read() and .seek()')
  469. if all([is_text_fileobj(f) for f in fileobjs]):
  470. # codecs.open and io.TextIOBase
  471. self._joiner = u''
  472. elif any([is_text_fileobj(f) for f in fileobjs]):
  473. raise ValueError('All arguments to MultiFileReader must handle'
  474. ' bytes OR text, not a mix')
  475. else:
  476. # open/file and io.BytesIO
  477. self._joiner = b''
  478. self._fileobjs = fileobjs
  479. self._index = 0
  480. def read(self, amt=None):
  481. """Read up to the specified *amt*, seamlessly bridging across
  482. files. Returns the appropriate type of string (bytes or text)
  483. for the input, and returns an empty string when the files are
  484. exhausted.
  485. """
  486. if not amt:
  487. return self._joiner.join(f.read() for f in self._fileobjs)
  488. parts = []
  489. while amt > 0 and self._index < len(self._fileobjs):
  490. parts.append(self._fileobjs[self._index].read(amt))
  491. got = len(parts[-1])
  492. if got < amt:
  493. self._index += 1
  494. amt -= got
  495. return self._joiner.join(parts)
  496. def seek(self, offset, whence=os.SEEK_SET):
  497. """Enables setting position of the file cursor to a given
  498. *offset*. Currently only supports ``offset=0``.
  499. """
  500. if whence != os.SEEK_SET:
  501. raise NotImplementedError(
  502. 'MultiFileReader.seek() only supports os.SEEK_SET')
  503. if offset != 0:
  504. raise NotImplementedError(
  505. 'MultiFileReader only supports seeking to start at this time')
  506. for f in self._fileobjs:
  507. f.seek(0)