Skip to content

Commit 24110fb

Browse files
committed
Fix performance of tarfile reading with "r|*"
1 parent fad0674 commit 24110fb

3 files changed

Lines changed: 13 additions & 8 deletions

File tree

Lib/tarfile.py

Lines changed: 10 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -380,7 +380,6 @@ def __init__(self, name, mode, comptype, fileobj, bufsize,
380380
except ImportError:
381381
raise CompressionError("bz2 module is not available") from None
382382
if mode == "r":
383-
self.dbuf = b""
384383
self.cmp = bz2.BZ2Decompressor()
385384
self.exception = OSError
386385
else:
@@ -392,7 +391,6 @@ def __init__(self, name, mode, comptype, fileobj, bufsize,
392391
except ImportError:
393392
raise CompressionError("lzma module is not available") from None
394393
if mode == "r":
395-
self.dbuf = b""
396394
self.cmp = lzma.LZMADecompressor()
397395
self.exception = lzma.LZMAError
398396
else:
@@ -485,7 +483,6 @@ def _init_read_gz(self):
485483
"""Initialize for reading a gzip compressed fileobj.
486484
"""
487485
self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)
488-
self.dbuf = b""
489486

490487
# taken from gzip.GzipFile with some alterations
491488
if self.__read(2) != b"\037\213":
@@ -543,26 +540,31 @@ def _read(self, size):
543540
if self.comptype == "tar":
544541
return self.__read(size)
545542

546-
c = len(self.dbuf)
547-
t = [self.dbuf]
543+
c = 0
544+
t = []
548545
while c < size:
549546
# Skip underlying buffer to avoid unaligned double buffering.
550547
if self.buf:
551548
buf = self.buf
552549
self.buf = b""
550+
elif self.comptype != "gz" and not self.cmp.needs_input:
551+
buf = b""
553552
else:
554553
buf = self.fileobj.read(self.bufsize)
555554
if not buf:
556555
break
557556
try:
558-
buf = self.cmp.decompress(buf)
557+
buf = self.cmp.decompress(buf, size - c)
558+
if self.comptype == "gz":
559+
self.buf = self.cmp.unconsumed_tail
559560
except self.exception as e:
560561
raise ReadError("invalid compressed data") from e
561562
t.append(buf)
562563
c += len(buf)
563564
t = b"".join(t)
564-
self.dbuf = t[size:]
565-
return t[:size]
565+
if len(t) > size:
566+
raise ReadError("decompress() returned too much data")
567+
return t
566568

567569
def __read(self, size):
568570
"""Return size bytes from stream. If internal buffer is empty,

Misc/ACKS

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -144,6 +144,7 @@ Bas van Beek
144144
Ian Beer
145145
Stefan Behnel
146146
Reimer Behrends
147+
Tomi Belan
147148
Maxime Bélanger
148149
Ben Bell
149150
Thomas Bellman
Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,2 @@
1+
Fix :mod:`tarfile` performance issue when reading archives in streaming mode
2+
(e.g. ``r|*``).

0 commit comments

Comments
 (0)