From e0f98a47ff888708287531a53190248dcf13e0e0 Mon Sep 17 00:00:00 2001
From: TrellixVulnTeam
Date: Sat, 15 Oct 2022 12:26:29 +0000
Subject: [PATCH] Adding tarfile member sanitization to extractall()

Validate every archive member name before extraction so that an entry
such as "../../etc/passwd" cannot be written outside the target
directory. Containment is checked with os.path.commonpath(), which
compares whole path components rather than raw characters.

---
 speechcolab/datasets/gigaspeech.py | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/speechcolab/datasets/gigaspeech.py b/speechcolab/datasets/gigaspeech.py
index c17d9a3..e1055cd 100644
--- a/speechcolab/datasets/gigaspeech.py
+++ b/speechcolab/datasets/gigaspeech.py
@@ -154,7 +154,33 @@ def download_and_process_object_from_release(self, remote_md5, obj):
             subdir = local_obj_dec.parent / Path(local_obj_dec.stem.strip('.tgz'))
             subdir.mkdir(parents=True, exist_ok=True)
             with tarfile.open(local_obj_dec) as tar:
-                tar.extractall(path=subdir)
+                import os
+
+                def is_within_directory(directory, target):
+                    """Return True if target is directory itself or lies beneath it."""
+                    abs_directory = os.path.abspath(directory)
+                    abs_target = os.path.abspath(target)
+                    try:
+                        # commonpath() compares whole path components;
+                        # commonprefix() compares characters and would accept
+                        # a sibling such as "/data/set_evil" for "/data/set".
+                        common = os.path.commonpath([abs_directory, abs_target])
+                    except ValueError:
+                        # Raised e.g. for paths on different Windows drives.
+                        return False
+                    return common == abs_directory
+
+                def safe_extract(tar, path=".", members=None, *, numeric_owner=False):
+                    """Extract tar into path, refusing member names that escape it."""
+                    # NOTE: only member names are validated; link targets
+                    # (symlinks/hardlinks) are not inspected here.
+                    for member in tar.getmembers():
+                        member_path = os.path.join(path, member.name)
+                        if not is_within_directory(path, member_path):
+                            raise Exception("Attempted Path Traversal in Tar File")
+                    tar.extractall(path, members, numeric_owner=numeric_owner)
+
+                safe_extract(tar, path=subdir)
         elif local_obj_dec.suffix == '.gz':
             # encripted-gziped object represents a regular GigaSpeech file
             out_path = local_obj_dec.parent / Path(local_obj_dec.stem.strip('.gz'))