Skip to content

Commit 5aa9c08

Browse files
committed
Fix performance of tarfile reading with "r|*"
1 parent 1a2e7a7 commit 5aa9c08

3 files changed

Lines changed: 13 additions & 8 deletions

File tree

Lib/tarfile.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -382,7 +382,6 @@ def __init__(self, name, mode, comptype, fileobj, bufsize,
382382
except ImportError:
383383
raise CompressionError("bz2 module is not available") from None
384384
if mode == "r":
385-
self.dbuf = b""
386385
self.cmp = bz2.BZ2Decompressor()
387386
self.exception = OSError
388387
else:
@@ -394,7 +393,6 @@ def __init__(self, name, mode, comptype, fileobj, bufsize,
394393
except ImportError:
395394
raise CompressionError("lzma module is not available") from None
396395
if mode == "r":
397-
self.dbuf = b""
398396
self.cmp = lzma.LZMADecompressor()
399397
self.exception = lzma.LZMAError
400398
else:
@@ -475,7 +473,6 @@ def _init_read_gz(self):
475473
"""Initialize for reading a gzip compressed fileobj.
476474
"""
477475
self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)
478-
self.dbuf = b""
479476

480477
# taken from gzip.GzipFile with some alterations
481478
if self.__read(2) != b"\037\213":
@@ -533,26 +530,31 @@ def _read(self, size):
533530
if self.comptype == "tar":
534531
return self.__read(size)
535532

536-
c = len(self.dbuf)
537-
t = [self.dbuf]
533+
c = 0
534+
t = []
538535
while c < size:
539536
# Skip underlying buffer to avoid unaligned double buffering.
540537
if self.buf:
541538
buf = self.buf
542539
self.buf = b""
540+
elif self.comptype != "gz" and not self.cmp.needs_input:
541+
buf = b""
543542
else:
544543
buf = self.fileobj.read(self.bufsize)
545544
if not buf:
546545
break
547546
try:
548-
buf = self.cmp.decompress(buf)
547+
buf = self.cmp.decompress(buf, size - c)
548+
if self.comptype == "gz":
549+
self.buf = self.cmp.unconsumed_tail
549550
except self.exception as e:
550551
raise ReadError("invalid compressed data") from e
551552
t.append(buf)
552553
c += len(buf)
553554
t = b"".join(t)
554-
self.dbuf = t[size:]
555-
return t[:size]
555+
if len(t) > size:
556+
raise ReadError("decompress() returned too much data")
557+
return t
556558

557559
def __read(self, size):
558560
"""Return size bytes from stream. If internal buffer is empty,

Misc/ACKS

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,7 @@ Bas van Beek
141141
Ian Beer
142142
Stefan Behnel
143143
Reimer Behrends
144+
Tomi Belan
144145
Maxime Bélanger
145146
Ben Bell
146147
Thomas Bellman
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Fix a performance issue in :mod:`tarfile` when reading compressed archives in streaming mode
2+
(e.g. ``r|*``).

0 commit comments

Comments
 (0)