From 11082cfca6502a637ccee45e1dec7f821bf9de06 Mon Sep 17 00:00:00 2001 From: David Farkas Date: Mon, 30 Oct 2017 16:29:40 +0100 Subject: [PATCH 01/12] Remove CAS logic, use uuid instead of hash, update the unit tests to work well without CAS --- api/download.py | 28 ++-- api/files.py | 66 ++++----- api/handlers/containerhandler.py | 4 - api/handlers/listhandler.py | 18 +-- api/handlers/refererhandler.py | 30 ++-- api/jobs/handlers.py | 8 +- api/placer.py | 14 +- api/resolver.py | 2 +- api/upload.py | 9 +- api/util.py | 47 +++++-- bin/database.py | 2 +- bin/oneoffs/create_placeholders.py | 60 ++++++++ bin/oneoffs/remove_cas.py | 130 ++++++++++++++++++ raml/schemas/definitions/analysis.json | 0 raml/schemas/definitions/file.json | 0 swagger/examples/file_info_list.json | 2 + swagger/examples/output/acquisition-list.json | 3 + swagger/examples/output/acquisition.json | 1 + swagger/examples/output/analysis.json | 2 + swagger/schemas/mongo/file.json | 3 +- tests/integration_tests/python/conftest.py | 32 +++++ .../python/test_containers.py | 5 - .../integration_tests/python/test_download.py | 44 +++++- .../integration_tests/python/test_resolver.py | 9 +- 24 files changed, 397 insertions(+), 122 deletions(-) create mode 100755 bin/oneoffs/create_placeholders.py create mode 100755 bin/oneoffs/remove_cas.py create mode 100644 raml/schemas/definitions/analysis.json create mode 100644 raml/schemas/definitions/file.json diff --git a/api/download.py b/api/download.py index e75443cde..0664cbefa 100644 --- a/api/download.py +++ b/api/download.py @@ -1,3 +1,4 @@ +import os import bson import pytz import os.path @@ -7,11 +8,9 @@ from .web import base from .web.request import AccessType -from . import config -from . import util -from . import validators -import os +from . 
import config, files, util, validators from .dao.containerutil import pluralize + log = config.log BYTES_IN_MEGABYTE = float(1<<20) @@ -46,16 +45,18 @@ def _append_targets(self, targets, cont_name, container, prefix, total_size, tot break if filtered: continue - filepath = os.path.join(data_path, util.path_from_hash(f['hash'])) - if os.path.exists(filepath): # silently skip missing files + file_path = files.get_file_abs_path(f.get('_id', '')) + if not util.file_exists(file_path): + file_path = os.path.join(data_path, util.path_from_hash(f['hash'])) + if util.file_exists(file_path): # silently skip missing files if cont_name == 'analyses': - targets.append((filepath, prefix + '/' + ('input' if f.get('input') else 'output') + '/' + f['name'], cont_name, str(container.get('_id')),f['size'])) + targets.append((file_path, prefix + '/' + ('input' if f.get('input') else 'output') + '/' + f['name'], cont_name, str(container.get('_id')), f['size'])) else: - targets.append((filepath, prefix + '/' + f['name'], cont_name, str(container.get('_id')),f['size'])) + targets.append((file_path, prefix + '/' + f['name'], cont_name, str(container.get('_id')), f['size'])) total_size += f['size'] total_cnt += 1 else: - log.warn("Expected {} to exist but it is missing. File will be skipped in download.".format(filepath)) + log.warn("Expected {} to exist but it is missing. File will be skipped in download.".format(file_path)) return total_size, total_cnt def _bulk_preflight_archivestream(self, file_refs): @@ -95,9 +96,12 @@ def _bulk_preflight_archivestream(self, file_refs): log.warn("Expected file {} on Container {} {} to exist but it is missing. 
File will be skipped in download.".format(filename, cont_name, cont_id)) continue - filepath = os.path.join(data_path, util.path_from_hash(file_obj['hash'])) - if os.path.exists(filepath): # silently skip missing files - targets.append((filepath, cont_name+'/'+cont_id+'/'+file_obj['name'], cont_name, cont_id, file_obj['size'])) + file_id = file_obj.get('_id', '') + file_path = files.get_file_abs_path(file_id) if file_id else '' + if not file_id or not os.path.exists(file_path): + file_path = os.path.join(data_path, util.path_from_hash(file_obj['hash'])) + if os.path.exists(file_path): # silently skip missing files + targets.append((file_path, cont_name+'/'+cont_id+'/'+file_obj['name'], cont_name, cont_id, file_obj['size'])) total_size += file_obj['size'] file_cnt += 1 diff --git a/api/files.py b/api/files.py index 5060637a4..b7c7b9289 100644 --- a/api/files.py +++ b/api/files.py @@ -3,7 +3,6 @@ import json import shutil import hashlib -import collections from backports import tempfile @@ -18,19 +17,19 @@ def move_file(path, target_path): os.makedirs(target_dir) shutil.move(path, target_path) -def move_form_file_field_into_cas(file_field): + +def move_form_file_field_into_storage(file_field): """ - Given a file form field, move the (downloaded, tempdir-stored) file into the CAS. + Given a file form field, move the (downloaded, tempdir-stored) file into the final storage. Requires an augmented file field; see upload.process_upload() for details. 
""" - if not file_field.hash or not file_field.path: - raise Exception("Field is not a file field with hash and path") + if not file_field.uuid or not file_field.path: + raise Exception("Field is not a file field with uuid and path") + + move_file(file_field.path, get_file_abs_path(file_field.uuid)) - base = config.get_item('persistent', 'data_path') - cas = util.path_from_hash(file_field.hash) - move_file(file_field.path, os.path.join(base, cas)) def hash_file_formatted(path, hash_alg=None, buffer_size=65536): """ @@ -49,28 +48,14 @@ def hash_file_formatted(path, hash_alg=None, buffer_size=65536): return util.format_hash(hash_alg, hasher.hexdigest()) -class HashingFile(file): - def __init__(self, file_path, hash_alg): - super(HashingFile, self).__init__(file_path, "wb") - self.hash_alg = hashlib.new(hash_alg) - self.hash_name = hash_alg - - def write(self, data): - self.hash_alg.update(data) - return file.write(self, data) - def get_hash(self): - return self.hash_alg.hexdigest() +class FileStoreException(Exception): + pass - def get_formatted_hash(self): - return util.format_hash(self.hash_name, self.get_hash()) -ParsedFile = collections.namedtuple('ParsedFile', ['info', 'path']) - -def process_form(request, hash_alg=None): +def process_form(request): """ - Some workarounds to make webapp2 process forms in an intelligent way, - and hash files we process. + Some workarounds to make webapp2 process forms in an intelligent way. Normally webapp2/WebOb Reqest.POST would copy the entire request stream into a single file on disk. https://github.com/Pylons/webob/blob/cb9c0b4f51542a7d0ed5cc5bf0a73f528afbe03e/webob/request.py#L787 @@ -82,14 +67,12 @@ def process_form(request, hash_alg=None): without copying the data gain. Returns (tuple): - form: HashingFieldStorage instance + form: SingleFileFieldStorage instance tempdir: tempdir the file was stored in. Keep tempdir in scope until you don't need it anymore; it will be deleted on GC. 
""" - hash_alg = hash_alg or DEFAULT_HASH_ALG - # Store form file fields in a tempdir tempdir = tempfile.TemporaryDirectory(prefix='.tmp', dir=config.get_item('persistent', 'data_path')) @@ -99,17 +82,18 @@ def process_form(request, hash_alg=None): env.setdefault('CONTENT_LENGTH', '0') env['QUERY_STRING'] = '' - field_storage_class = getHashingFieldStorage( - tempdir.name, DEFAULT_HASH_ALG + field_storage_class = get_single_file_field_storage( + tempdir.name ) form = field_storage_class( fp=request.body_file, environ=env, keep_blank_values=True ) - return (form, tempdir) + return form, tempdir -def getHashingFieldStorage(upload_dir, hash_alg): + +def get_single_file_field_storage(upload_dir): # pylint: disable=attribute-defined-outside-init # We dynamically create this class because we @@ -120,15 +104,13 @@ def getHashingFieldStorage(upload_dir, hash_alg): # https://github.com/python/cpython/blob/1e3e162ff5c0cc656559c43914439ab3e5734f00/Lib/cgi.py#L696 # https://github.com/python/cpython/blob/1e3e162ff5c0cc656559c43914439ab3e5734f00/Lib/cgi.py#L728 - class HashingFieldStorage(cgi.FieldStorage): + class SingleFileFieldStorage(cgi.FieldStorage): bufsize = 2**20 def make_file(self, binary=None): # Sanitize form's filename (read: prevent malicious escapes, bad characters, etc) self.filename = os.path.basename(self.filename) - # self.filename = util.sanitize_string_to_filename(self.filename) - - self.open_file = HashingFile(os.path.join(upload_dir, self.filename), hash_alg) + self.open_file = open(os.path.join(upload_dir, self.filename), 'wb') return self.open_file # override private method __write of superclass FieldStorage @@ -144,10 +126,7 @@ def _FieldStorage__write(self, line): self._FieldStorage__file = None self.file.write(line) - def get_hash(self): - return self.open_file.get_hash() - - return HashingFieldStorage + return SingleFileFieldStorage # File extension --> scitran file type detection hueristics. # Listed in precendence order. 
@@ -166,3 +145,8 @@ def guess_type_from_filename(filename): else: filetype = None return filetype + + +def get_file_abs_path(file_id): + version = 'v1' + return os.path.join(config.get_item('persistent', 'data_path'), version, util.path_from_uuid(file_id)) diff --git a/api/handlers/containerhandler.py b/api/handlers/containerhandler.py index 2bc5c5609..2a94fc20e 100644 --- a/api/handlers/containerhandler.py +++ b/api/handlers/containerhandler.py @@ -108,10 +108,6 @@ def get(self, cont_name, **kwargs): self._filter_permissions(result, self.uid) if self.is_true('join_avatars'): self.join_user_info([result]) - # build and insert file paths if they are requested - if self.is_true('paths'): - for fileinfo in result['files']: - fileinfo['path'] = util.path_from_hash(fileinfo['hash']) inflate_job_info = cont_name == 'sessions' result['analyses'] = AnalysisStorage().get_analyses(cont_name, _id, inflate_job_info) diff --git a/api/handlers/listhandler.py b/api/handlers/listhandler.py index 5f731a7e9..e31b163ea 100644 --- a/api/handlers/listhandler.py +++ b/api/handlers/listhandler.py @@ -8,10 +8,7 @@ import zipfile from ..web import base -from .. import config -from .. import upload -from .. import util -from .. import validators +from .. 
import config, files, upload, util, validators from ..auth import listauth, always_ok from ..dao import noop from ..dao import liststorage @@ -406,7 +403,10 @@ def get(self, cont_name, list_name, **kwargs): hash_ = self.get_param('hash') if hash_ and hash_ != fileinfo['hash']: self.abort(409, 'file exists, hash mismatch') - filepath = os.path.join(config.get_item('persistent', 'data_path'), util.path_from_hash(fileinfo['hash'])) + data_path = config.get_item('persistent', 'data_path') + file_path = files.get_file_abs_path(fileinfo.get('_id', '')) + if not util.file_exists(file_path): + file_path = os.path.join(data_path, util.path_from_hash(fileinfo['hash'])) # Request for download ticket if self.get_param('ticket') == '': @@ -416,7 +416,7 @@ def get(self, cont_name, list_name, **kwargs): # Request for info about zipfile elif self.is_true('info'): try: - info = self.build_zip_info(filepath) + info = self.build_zip_info(file_path) except zipfile.BadZipfile: self.abort(400, 'not a zip file') return info @@ -425,7 +425,7 @@ def get(self, cont_name, list_name, **kwargs): elif self.get_param('member') is not None: zip_member = self.get_param('member') try: - with zipfile.ZipFile(filepath) as zf: + with zipfile.ZipFile(file_path) as zf: self.response.headers['Content-Type'] = util.guess_mimetype(zip_member) self.response.write(zf.open(zip_member).read()) except zipfile.BadZipfile: @@ -456,7 +456,7 @@ def get(self, cont_name, list_name, **kwargs): raise util.RangeHeaderParseError('Invalid range') except util.RangeHeaderParseError: - self.response.app_iter = open(filepath, 'rb') + self.response.app_iter = open(file_path, 'rb') self.response.headers['Content-Length'] = str(fileinfo['size']) # must be set after setting app_iter if self.is_true('view'): @@ -474,7 +474,7 @@ def get(self, cont_name, list_name, **kwargs): self.response.headers['Content-Range'] = 'bytes %s-%s/%s' % (str(ranges[0][0]), str(ranges[0][1]), str(fileinfo['size'])) - with open(filepath, 'rb') as f: + 
with open(file_path, 'rb') as f: for first, last in ranges: mode = os.SEEK_SET if first < 0: diff --git a/api/handlers/refererhandler.py b/api/handlers/refererhandler.py index a04440187..441eb9daa 100644 --- a/api/handlers/refererhandler.py +++ b/api/handlers/refererhandler.py @@ -12,10 +12,7 @@ import datetime from abc import ABCMeta, abstractproperty -from .. import config -from .. import upload -from .. import util -from .. import validators +from .. import config, files, upload, util, validators from ..auth import containerauth, always_ok from ..dao import containerstorage, noop from ..dao.basecontainerstorage import ContainerStorage @@ -362,16 +359,19 @@ def download(self, **kwargs): self.abort(404, "{} doesn't exist".format(filename)) else: fileinfo = fileinfo[0] - filepath = os.path.join( - config.get_item('persistent', 'data_path'), - util.path_from_hash(fileinfo['hash']) - ) + data_path = config.get_item('persistent', 'data_path') + file_path = files.get_file_abs_path(fileinfo.get('_id', '')) + if not util.file_exists(file_path): + file_path = os.path.join( + data_path, + util.path_from_hash(fileinfo['hash']) + ) filename = fileinfo['name'] # Request for info about zipfile if self.is_true('info'): try: - info = FileListHandler.build_zip_info(filepath) + info = FileListHandler.build_zip_info(file_path) except zipfile.BadZipfile: self.abort(400, 'not a zip file') return info @@ -380,7 +380,7 @@ def download(self, **kwargs): elif self.get_param('member') is not None: zip_member = self.get_param('member') try: - with zipfile.ZipFile(filepath) as zf: + with zipfile.ZipFile(file_path) as zf: self.response.headers['Content-Type'] = util.guess_mimetype(zip_member) self.response.write(zf.open(zip_member).read()) except zipfile.BadZipfile: @@ -397,7 +397,7 @@ def download(self, **kwargs): # Request to download the file itself else: - self.response.app_iter = open(filepath, 'rb') + self.response.app_iter = open(file_path, 'rb') self.response.headers['Content-Length'] 
= str(fileinfo['size']) # must be set after setting app_iter if self.is_true('view'): self.response.headers['Content-Type'] = str(fileinfo.get('mimetype', 'application/octet-stream')) @@ -441,9 +441,11 @@ def _prepare_batch(self, fileinfo, analysis): total_size = total_cnt = 0 data_path = config.get_item('persistent', 'data_path') for f in fileinfo: - filepath = os.path.join(data_path, util.path_from_hash(f['hash'])) - if os.path.exists(filepath): # silently skip missing files - targets.append((filepath, + file_path = files.get_file_abs_path(f.get('_id', '')) + if not util.file_exists(file_path): + file_path = os.path.join(data_path, util.path_from_hash(f['hash'])) + if util.file_exists(file_path): # silently skip missing files + targets.append((file_path, util.sanitize_string_to_filename(analysis['label']) + '/' + ('input' if f.get('input') else 'output') + '/'+ f['name'], 'analyses', analysis['_id'], f['size'])) total_size += f['size'] diff --git a/api/jobs/handlers.py b/api/jobs/handlers.py index 3c058c970..511812327 100644 --- a/api/jobs/handlers.py +++ b/api/jobs/handlers.py @@ -11,7 +11,7 @@ from . import batch from .. import config from .. import upload -from .. import util +from .. 
import files from ..auth import require_drone, require_login, require_admin, has_access from ..auth.apikeys import JobApiKey from ..dao import hierarchy @@ -89,10 +89,10 @@ def download(self, **kwargs): # pragma: no cover """Download gear tarball file""" dl_id = kwargs.pop('cid') gear = get_gear(dl_id) - hash_ = gear['exchange']['rootfs-hash'] - filepath = os.path.join(config.get_item('persistent', 'data_path'), util.path_from_hash('v0-' + hash_.replace(':', '-'))) + file_id = gear['exchange']['rootfs-id'] + file_path = files.get_file_abs_path(file_id) - stream = open(filepath, 'rb') + stream = open(file_path, 'rb') set_for_download(self.response, stream=stream, filename='gear.tar') @require_admin diff --git a/api/placer.py b/api/placer.py index 77aa4eacf..b4a90ea4a 100644 --- a/api/placer.py +++ b/api/placer.py @@ -5,6 +5,7 @@ import os import pymongo import shutil +import uuid import zipfile from backports import tempfile @@ -89,7 +90,7 @@ def save_file(self, field=None, file_attrs=None): # Save file if field is not None: - files.move_form_file_field_into_cas(field) + files.move_form_file_field_into_storage(field) # Update the DB if file_attrs is not None: @@ -190,7 +191,7 @@ def process_file_field(self, field, file_attrs): self.save_file(field, file_attrs) else: if field is not None: - files.move_form_file_field_into_cas(field) + files.move_form_file_field_into_storage(field) if file_attrs is not None: container.upsert_file(file_attrs) @@ -548,7 +549,8 @@ def finalize(self): 'filename': self.name, 'path': self.path, 'size': os.path.getsize(self.path), - 'hash': files.hash_file_formatted(self.path), + 'hash': files.hash_file_formatted(self.path), + 'uuid': str(uuid.uuid4()), 'mimetype': util.guess_mimetype('lol.zip'), 'modified': self.timestamp }) @@ -557,10 +559,11 @@ def finalize(self): # This could be coalesced into a single map thrown on file fields, for example. # Used in the API return. 
cgi_attrs = { + '_id': cgi_field.uuid, 'name': cgi_field.filename, 'modified': cgi_field.modified, 'size': cgi_field.size, - 'hash': cgi_field.hash, + 'hash': cgi_field.hash, 'mimetype': cgi_field.mimetype, 'type': self.metadata['packfile']['type'], @@ -763,7 +766,8 @@ def process_file_field(self, field, file_attrs): proper_hash = file_attrs.get('hash')[3:].replace('-', ':') self.metadata.update({'exchange': {'rootfs-hash': proper_hash, 'git-commit': 'local', - 'rootfs-url': 'INVALID'}}) + 'rootfs-url': 'INVALID', + 'rootfs-id': file_attrs['_id']}}) # self.metadata['hash'] = file_attrs.get('hash') self.save_file(field) self.saved.append(file_attrs) diff --git a/api/resolver.py b/api/resolver.py index 0b110ea1c..7eca86f9c 100644 --- a/api/resolver.py +++ b/api/resolver.py @@ -1,4 +1,4 @@ -""" +""" Resolve an ambiguous path through the data hierarchy. """ diff --git a/api/upload.py b/api/upload.py index 75de2e470..a98123e40 100644 --- a/api/upload.py +++ b/api/upload.py @@ -3,6 +3,7 @@ import json import os.path import shutil +import uuid from .web import base from .web.errors import FileStoreException, FileFormException @@ -110,19 +111,21 @@ def process_upload(request, strategy, container_type=None, id_=None, origin=None tempdir_exists, tempdir_exists and os.listdir(tempdir.name), )) - field.size = os.path.getsize(field.path) - field.hash = field.file.get_formatted_hash() + field.size = os.path.getsize(field.path) + field.hash = files.hash_file_formatted(field.path) + field.uuid = str(uuid.uuid4()) field.mimetype = util.guess_mimetype(field.filename) # TODO: does not honor metadata's mime type if any field.modified = timestamp # create a file-attribute map commonly used elsewhere in the codebase. # Stands in for a dedicated object... for now.
file_attrs = { + '_id': field.uuid, 'name': field.filename, 'modified': field.modified, # 'size': field.size, 'mimetype': field.mimetype, - 'hash': field.hash, + 'hash': field.hash, 'origin': origin, 'type': None, diff --git a/api/util.py b/api/util.py index 64d2f85d1..4b88a2100 100644 --- a/api/util.py +++ b/api/util.py @@ -197,20 +197,6 @@ def obj_from_map(_map): return type('',(object,),_map)() -def path_from_hash(hash_): - """ - create a filepath from a hash - e.g. - hash_ = v0-sha384-01b395a1cbc0f218 - will return - v0/sha384/01/b3/v0-sha384-01b395a1cbc0f218 - """ - hash_version, hash_alg, actual_hash = hash_.split('-') - first_stanza = actual_hash[0:2] - second_stanza = actual_hash[2:4] - path = (hash_version, hash_alg, first_stanza, second_stanza, hash_) - return os.path.join(*path) - def set_for_download(response, stream=None, filename=None, length=None): """Takes a self.response, and various download options.""" @@ -290,6 +276,39 @@ def create_nonce(): return ''.join([NONCE_CHARS[randrange(x)] for _ in range(NONCE_LENGTH)]) +def path_from_uuid(uuid_): + """ + create a filepath from a UUID + e.g. + uuid_ = 01b395a1-cbc0-4f21-8de1-0d8a0db3b2f2 + will return + 01/b3/01b395a1-cbc0-4f21-8de1-0d8a0db3b2f2 + """ + uuid_1 = uuid_.split('-')[0] + first_stanza = uuid_1[0:2] + second_stanza = uuid_1[2:4] + path = (first_stanza, second_stanza, uuid_) + return os.path.join(*path) + + +def path_from_hash(hash_): + """ + create a filepath from a hash + e.g. 
+ hash_ = v0-sha384-01b395a1cbc0f218 + will return + v0/sha384/01/b3/v0-sha384-01b395a1cbc0f218 + """ + hash_version, hash_alg, actual_hash = hash_.split('-') + first_stanza = actual_hash[0:2] + second_stanza = actual_hash[2:4] + path = (hash_version, hash_alg, first_stanza, second_stanza, hash_) + return os.path.join(*path) + + +def file_exists(path): + return os.path.exists(path) and os.path.isfile(path) + class RangeHeaderParseError(ValueError): """Exception class representing a string parsing error.""" diff --git a/bin/database.py b/bin/database.py index 375c4b76f..0ce62d838 100755 --- a/bin/database.py +++ b/bin/database.py @@ -1436,4 +1436,4 @@ def upgrade_schema(force_from = None): sys.exit(1) except Exception as e: logging.exception('Unexpected error in database.py') - sys.exit(1) + sys.exit(1) \ No newline at end of file diff --git a/bin/oneoffs/create_placeholders.py b/bin/oneoffs/create_placeholders.py new file mode 100755 index 000000000..b2453ff3b --- /dev/null +++ b/bin/oneoffs/create_placeholders.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python +import logging +import os +import shutil + +from api import util, config + +log = logging.getLogger('create_placeholders') +log.setLevel(logging.INFO) + + +def get_files_by_prefix(document, prefix): + for key in prefix.split('.'): + document = document.get(key, {}) + return document + + +def create_placeholders(): + """ + Create placeholder files to help testing a system using sanitized customer DBs without the corresponding data files. 
+ """ + COLLECTIONS_PREFIXES = [('projects', 'files'), + ('acquisitions', 'files'), + ('analyses', 'files'), + ('sessions', 'files'), + ('sessions', 'subject.files')] + + _files = [] + + for collection, prefix in COLLECTIONS_PREFIXES: + cursor = config.db.get_collection(collection).find({}) + for document in cursor: + for f in get_files_by_prefix(document, prefix): + f_dict = { + 'collection_id': document.get('_id'), + 'collection': collection, + 'fileinfo': f, + 'prefix': prefix + } + _files.append(f_dict) + + base = config.get_item('persistent', 'data_path') + for i, f in enumerate(_files): + f_path = os.path.join(base, util.path_from_hash(f['fileinfo']['hash'])) + + target_dir = os.path.dirname(f_path) + if not os.path.exists(target_dir): + os.makedirs(target_dir) + with open(f_path, 'w') as placeholder: + placeholder.write('%s %s' % (f_path, f['fileinfo']['size'])) + + # Show progress + if i % (len(_files) / 10) == 0: + log.info('Processed %s files of total %s files ...' % (i, len(_files))) + + +if __name__ == '__main__': + create_placeholders() + + diff --git a/bin/oneoffs/remove_cas.py b/bin/oneoffs/remove_cas.py new file mode 100755 index 000000000..61b0f3f1e --- /dev/null +++ b/bin/oneoffs/remove_cas.py @@ -0,0 +1,130 @@ +#!/usr/bin/env python +import datetime +import logging +import os +import shutil +import uuid + +from collections import Counter + +from api import config, files, util + +log = logging.getLogger('remove_cas') +log.setLevel(logging.INFO) + +class MigrationError(Exception): + pass + +def get_files_by_prefix(document, prefix): + for key in prefix.split('.'): + document = document.get(key, {}) + return document + + +def copy_file(path, target_path): + target_dir = os.path.dirname(target_path) + if not os.path.exists(target_dir): + os.makedirs(target_dir) + shutil.copy(path, target_path) + + +def remove_cas(): + """ + Remove CAS logic, generate UUID for the files and rename them on the filesystem, make a copy of the file if more + than one 
container using the same hash. + """ + COLLECTIONS_PREFIXES = [('projects', 'files'), + ('acquisitions', 'files'), + ('analyses', 'files'), + ('sessions', 'files'), + ('sessions', 'subject.files'), + ('collections', 'files')] + + _hashes = [] + _files = [] + + for collection, prefix in COLLECTIONS_PREFIXES: + cursor = config.db.get_collection(collection).find({}) + for document in cursor: + for f in get_files_by_prefix(document, prefix): + u = f.get('_id', '') + if u: + continue + + _hashes.append(f.get('hash', '')) + f_dict = { + 'collection_id': document.get('_id'), + 'collection': collection, + 'fileinfo': f, + 'prefix': prefix + } + _files.append(f_dict) + + counter = Counter(_hashes) + + try: + base = config.get_item('persistent', 'data_path') + for i, f in enumerate(_files): + try: + f_uuid = str(uuid.uuid4()) + f['_id'] = f_uuid + f_path = os.path.join(base, util.path_from_hash(f['fileinfo']['hash'])) + log.debug('copy file %s to %s' % (f_path, util.path_from_uuid(f_uuid))) + copy_file(f_path, files.get_file_abs_path(f_uuid)) + + update_set = { + f['prefix'] + '.$.modified': datetime.datetime.utcnow(), + f['prefix'] + '.$._id': f_uuid + } + log.debug('update file in mongo: %s' % update_set) + # Update the file with the newly generated UUID + config.db[f['collection']].find_one_and_update( + {'_id': f['collection_id'], f['prefix'] + '.name': f['fileinfo']['name']}, + {'$set': update_set} + ) + + # Decrease the count of the current hash, so we will know when we can remove the original file + counter[f['fileinfo']['hash']] -= 1 + + if counter[f['fileinfo']['hash']] == 0: + log.debug('remove old file: %s' % f_path) + os.remove(f_path) + + # Show progress + if i % (len(_files) / 10) == 0: + log.info('Processed %s files of total %s files ...' 
% (i, len(_files))) + + except Exception as e: + log.exception(e) + raise MigrationError('Wasn\'t able to migrate the \'%s\' ' + 'file in the \'%s\' collection (collection id: %s)' % + (f['fileinfo']['name'], f['collection'], str(f['collection_id'])), e) + except MigrationError as e: + log.exception(e) + log.info('Rollback...') + base = config.get_item('persistent', 'data_path') + for f in _files: + if f.get('_id', ''): + hash_path = os.path.join(base, util.path_from_hash(f['fileinfo']['hash'])) + uuid_path = files.get_file_abs_path(f['_id']) + if os.path.exists(hash_path) and os.path.exists(uuid_path): + os.remove(uuid_path) + elif os.path.exists(uuid_path): + copy_file(uuid_path, hash_path) + os.remove(uuid_path) + config.db[f['collection']].find_one_and_update( + {'_id': f['collection_id'], f['prefix'] + '.name': f['fileinfo']['name']}, + {'$unset': {f['prefix'] + '.$._id': ''}} + ) + exit(1) + + # Cleanup the empty folders + log.info('Cleanup empty folders') + for _dirpath, _, _ in os.walk(config.get_item('persistent', 'data_path'), topdown=False): + if not (os.listdir(_dirpath) or config.get_item('persistent', 'data_path') == _dirpath): + os.rmdir(_dirpath) + + +if __name__ == '__main__': + remove_cas() + diff --git a/raml/schemas/definitions/analysis.json b/raml/schemas/definitions/analysis.json new file mode 100644 index 000000000..e69de29bb diff --git a/raml/schemas/definitions/file.json b/raml/schemas/definitions/file.json new file mode 100644 index 000000000..e69de29bb diff --git a/swagger/examples/file_info_list.json b/swagger/examples/file_info_list.json index 70992e72e..cf4edfcf8 100755 --- a/swagger/examples/file_info_list.json +++ b/swagger/examples/file_info_list.json @@ -4,6 +4,7 @@ "type": "job", "id": "58063f24e5dc5b001657a87f" }, + "_id": "50708f69-7549-4331-bac0-b4789e5c9ca1", "mimetype": "application/octet-stream", "hash": "v0-sha384-12188e00a26650b2baa3f0195337dcf504f4362bb2136eef0cdbefb57159356b1355a0402fca0ab5ab081f21c305e5c2", "name": 
"cortical_surface_right_hemisphere.obj", @@ -19,6 +20,7 @@ "type": "job", "id": "58065fa7e5dc5b001457a882" }, + "_id": "50708f69-7549-4331-bac0-b4779e5c9ca1", "mimetype": "application/octet-stream", "hash": "v0-sha384-12188e00a26650b2baa3f0195337dcf504f4362bb2136eef0cdbefb57159356b1355a0402fca0ab5ab081f21c305e5c2", "name": "cortical_surface_right_hemisphere.obj", diff --git a/swagger/examples/output/acquisition-list.json b/swagger/examples/output/acquisition-list.json index 1b29d562c..4dde51b80 100644 --- a/swagger/examples/output/acquisition-list.json +++ b/swagger/examples/output/acquisition-list.json @@ -8,6 +8,7 @@ "id": "importer_Admin_Import", "name": "Admin Import" }, + "_id": "50708f69-7549-4331-bac0-b4779e5c9ca1", "mimetype": "application/zip", "measurements": [], "hash": "v0-sha384-dd3c97bfe0ad1fcba75ae6718c6e81038c59af4f447f5db194d52732efa4f955b28455db02eb64cad3e4e55f11e3679f", @@ -47,6 +48,7 @@ "id": "importer_Admin_Import", "name": "Admin Import" }, + "_id": "50708f69-7549-4331-bac0-b4779e5c9ca1", "mimetype": "application/zip", "measurements": [], "hash": "v0-sha384-ca055fb36845db86e4278cf6e185f8674d11a96f4b29af27e401fc495cc82ef6b53a5729c3757713064649dc71c8c725", @@ -86,6 +88,7 @@ "id": "importer_Admin_Import", "name": "Admin Import" }, + "_id": "50708f69-7549-4331-bac0-b4779e5c9ca1", "mimetype": "application/zip", "measurements": [], "hash": "v0-sha384-537e42b1dd8f1feef9844fbfb4f60461361e71cafa7055556097e9d0b9f7fac68c8f234ed126af9412bd43a548948847", diff --git a/swagger/examples/output/acquisition.json b/swagger/examples/output/acquisition.json index fef6ba738..fdffc3310 100644 --- a/swagger/examples/output/acquisition.json +++ b/swagger/examples/output/acquisition.json @@ -7,6 +7,7 @@ "id": "importer_Admin_Import", "name": "Admin Import" }, + "_id": "50708f69-7549-4331-bac0-b4779e5c9ca1", "mimetype": "application/zip", "measurements": [], "hash": 
"v0-sha384-dd3c97bfe0ad1fcba75ae6718c6e81038c59af4f447f5db194d52732efa4f955b28455db02eb64cad3e4e55f11e3679f", diff --git a/swagger/examples/output/analysis.json b/swagger/examples/output/analysis.json index bb0c4caae..fde0d0aea 100644 --- a/swagger/examples/output/analysis.json +++ b/swagger/examples/output/analysis.json @@ -4,6 +4,7 @@ "type": "job", "id": "58063f24e5dc5b001657a87f" }, + "_id": "50708f69-7549-4331-bac0-b4779e5c9ca1", "mimetype": "application/octet-stream", "hash": "v0-sha384-12188e00a26650b2baa3f0195337dcf504f4362bb2136eef0cdbefb57159356b1355a0402fca0ab5ab081f21c305e5c2", "name": "cortical_surface_right_hemisphere.obj", @@ -20,6 +21,7 @@ "type": "job", "id": "58065fa7e5dc5b001457a882" }, + "_id": "50708f69-7549-4331-bac0-b4779e5c9ca1", "mimetype": "application/octet-stream", "hash": "v0-sha384-12188e00a26650b2baa3f0195337dcf504f4362bb2136eef0cdbefb57159356b1355a0402fca0ab5ab081f21c305e5c2", "name": "cortical_surface_right_hemisphere.obj", diff --git a/swagger/schemas/mongo/file.json b/swagger/schemas/mongo/file.json index 4e077d85a..6fc7cfe30 100644 --- a/swagger/schemas/mongo/file.json +++ b/swagger/schemas/mongo/file.json @@ -2,6 +2,7 @@ "$schema": "http://json-schema.org/draft-04/schema#", "type": "object", "properties": { + "_id": { "type": "string" }, "name": { "type": "string" }, "created": {}, "modified": {}, @@ -24,7 +25,7 @@ "type": "object" } }, - "required": ["name", "created", "modified", "size", "hash"], + "required": ["_id", "name", "created", "modified", "size", "hash"], "key_fields": ["name"], "additionalProperties": false } diff --git a/tests/integration_tests/python/conftest.py b/tests/integration_tests/python/conftest.py index 1cba4892d..80fb294fe 100644 --- a/tests/integration_tests/python/conftest.py +++ b/tests/integration_tests/python/conftest.py @@ -4,6 +4,7 @@ import json import logging import os +import shutil import attrdict import bson @@ -11,6 +12,7 @@ import pytest import requests +from api import files, util # load 
required envvars w/ the same name SCITRAN_CORE_DRONE_SECRET = os.environ['SCITRAN_CORE_DRONE_SECRET'] @@ -207,6 +209,36 @@ def with_user(data_builder, randstr, as_public): return attrdict.AttrDict(user=user, api_key=api_key, session=session) +@pytest.yield_fixture(scope='function') +def legacy_cas_file(as_admin, api_db, data_builder, randstr, file_form): + """Yield legacy CAS file""" + project = data_builder.create_project() + file_name = '%s.csv' % randstr() + as_admin.post('/projects/' + project + '/files', files=file_form(file_name)) + + file_id = api_db['projects'].find_one( + {'files.name': file_name} + )['files'][0]['_id'] + # verify cas backward compatibility + api_db['projects'].find_one_and_update( + {'files.name': file_name}, + {'$unset': {'files.$._id': ''}} + ) + data_path = os.getenv('SCITRAN_PERSISTENT_DATA_PATH') + file_hash = 'v0-sha384-03a9df0a5e6e21f5d25aacbce76d8a5d9f8de14f6654c31ab2daed961cfbfb236b1708063350856f752a5a094fb64321' + file_path = os.path.join(data_path, util.path_from_hash(file_hash)) + target_dir = os.path.dirname(file_path) + if not os.path.exists(target_dir): + os.makedirs(target_dir) + shutil.move(files.get_file_abs_path(file_id), file_path) + + yield (project, file_name) + + # clean up + os.remove(file_path) + os.removedirs(target_dir) + api_db['projects'].delete_one({'_id': project}) + class BaseUrlSession(requests.Session): """Requests session subclass using core api's base url""" def request(self, method, url, **kwargs): diff --git a/tests/integration_tests/python/test_containers.py b/tests/integration_tests/python/test_containers.py index 792481a7d..dc11cfda1 100644 --- a/tests/integration_tests/python/test_containers.py +++ b/tests/integration_tests/python/test_containers.py @@ -297,11 +297,6 @@ def test_get_container(data_builder, default_payload, file_form, as_drone, as_ad r = as_public.get('/projects/' + project) assert r.ok - # get container w/ ?paths=true - r = as_public.get('/projects/' + project, params={'paths': 
'true'}) - assert r.ok - assert all('path' in f for f in r.json()['files']) - # get container w/ ?join=origin&join=origin_job_gear_name r = as_public.get('/projects/' + project, params={'join': ['origin', 'origin_job_gear_name']}) assert r.ok diff --git a/tests/integration_tests/python/test_download.py b/tests/integration_tests/python/test_download.py index 3abd655bf..025cc1b80 100644 --- a/tests/integration_tests/python/test_download.py +++ b/tests/integration_tests/python/test_download.py @@ -4,11 +4,11 @@ import zipfile -def test_download(data_builder, file_form, as_admin, api_db): +def test_download(data_builder, file_form, as_admin, api_db, legacy_cas_file): project = data_builder.create_project(label='project1') - session = data_builder.create_session(label='session1') - session2 = data_builder.create_session(label='session1') - session3 = data_builder.create_session(label='session1') + session = data_builder.create_session(label='session1', project=project) + session2 = data_builder.create_session(label='session1', project=project) + session3 = data_builder.create_session(label='session1', project=project) acquisition = data_builder.create_acquisition(session=session) acquisition2 = data_builder.create_acquisition(session=session2) acquisition3 = data_builder.create_acquisition(session=session3) @@ -151,8 +151,32 @@ def test_download(data_builder, file_form, as_admin, api_db): r = as_admin.get('/download', params={'ticket': ticket, 'symlinks': 'true'}) assert r.ok + # test legacy cas file handling + (project_legacy, file_name_legacy) = legacy_cas_file + r = as_admin.post('/download', json={ + 'optional': False, + 'nodes': [ + {'level': 'project', '_id': project_legacy}, + ] + }) + assert r.ok + ticket = r.json()['ticket'] + + # Perform the download + r = as_admin.get('/download', params={'ticket': ticket}) + assert r.ok -def test_filelist_download(data_builder, file_form, as_admin): + tar_file = cStringIO.StringIO(r.content) + tar = tarfile.open(mode="r", 
fileobj=tar_file) + + # Verify a single file in tar with correct file name + for tarinfo in tar: + assert os.path.basename(tarinfo.name) == file_name_legacy + + tar.close() + + +def test_filelist_download(data_builder, file_form, as_admin, legacy_cas_file): session = data_builder.create_session() zip_cont = cStringIO.StringIO() with zipfile.ZipFile(zip_cont, 'w') as zip_file: @@ -208,6 +232,16 @@ def test_filelist_download(data_builder, file_form, as_admin): r = as_admin.get(session_files + '/two.zip', params={'ticket': ticket, 'member': 'two.csv'}) assert r.ok + # test legacy cas file handling + (project, file_name) = legacy_cas_file + r = as_admin.get('/projects/' + project + '/files/' + file_name, params={'ticket': ''}) + assert r.ok + + ticket = r.json()['ticket'] + + r = as_admin.get('/projects/' + project + '/files/' + file_name, params={'ticket': ticket}) + assert r.content == 'test\ndata\n' + def test_filelist_range_download(data_builder, as_admin, file_form): session = data_builder.create_session() diff --git a/tests/integration_tests/python/test_resolver.py b/tests/integration_tests/python/test_resolver.py index 37c988226..8a32629ee 100644 --- a/tests/integration_tests/python/test_resolver.py +++ b/tests/integration_tests/python/test_resolver.py @@ -74,6 +74,7 @@ def test_resolver(data_builder, as_admin, as_user, as_public, file_form): project_file = 'project_file' r = as_admin.post('/projects/' + project + '/files', files=file_form(project_file)) assert r.ok + project_file_id = r.json()[0]['_id'] # save the file id for later usage r = as_admin.post('/resolve', json={'path': [group, project_label]}) result = r.json() assert r.ok @@ -95,7 +96,7 @@ def test_resolver(data_builder, as_admin, as_user, as_public, file_form): r = as_admin.post('/resolve', json={'path': [group, project_label, project_file]}) result = r.json() assert r.ok - assert path_in_result([group, project, project_file], result) + assert path_in_result([group, project, project_file_id], 
result) assert result['children'] == [] # try to resolve non-existent root/group/project/child @@ -115,6 +116,7 @@ def test_resolver(data_builder, as_admin, as_user, as_public, file_form): session_file = 'session_file' r = as_admin.post('/sessions/' + session + '/files', files=file_form(session_file)) assert r.ok + session_file_id = r.json()[0]['_id'] r = as_admin.post('/resolve', json={'path': [group, project_label, session_label]}) result = r.json() assert r.ok @@ -136,7 +138,7 @@ def test_resolver(data_builder, as_admin, as_user, as_public, file_form): r = as_admin.post('/resolve', json={'path': [group, project_label, session_label, session_file]}) result = r.json() assert r.ok - assert path_in_result([group, project, session, session_file], result) + assert path_in_result([group, project, session, session_file_id], result) assert result['children'] == [] # try to resolve non-existent root/group/project/session/child @@ -156,6 +158,7 @@ def test_resolver(data_builder, as_admin, as_user, as_public, file_form): acquisition_file = 'acquisition_file' r = as_admin.post('/acquisitions/' + acquisition + '/files', files=file_form(acquisition_file)) assert r.ok + acquisition_file_id = r.json()[0]['_id'] r = as_admin.post('/resolve', json={'path': [group, project_label, session_label, acquisition_label]}) result = r.json() assert r.ok @@ -167,7 +170,7 @@ def test_resolver(data_builder, as_admin, as_user, as_public, file_form): r = as_admin.post('/resolve', json={'path': [group, project_label, session_label, acquisition_label, acquisition_file]}) result = r.json() assert r.ok - assert path_in_result([group, project, session, acquisition, acquisition_file], result) + assert path_in_result([group, project, session, acquisition, acquisition_file_id], result) assert result['children'] == [] def idz(s): From 089d116fc3934fb111f94d4dca52d11b365e0d11 Mon Sep 17 00:00:00 2001 From: David Farkas Date: Tue, 12 Dec 2017 15:12:10 +0100 Subject: [PATCH 02/12] Create placeholder files 
for collection attachments and gear files, update the remove-cas script to show progress of the rollback --- api/jobs/handlers.py | 1 - bin/oneoffs/create_placeholders.py | 56 +++++-- bin/oneoffs/remove_cas.py | 244 +++++++++++++++++++++-------- 3 files changed, 218 insertions(+), 83 deletions(-) diff --git a/api/jobs/handlers.py b/api/jobs/handlers.py index 511812327..7e69d49af 100644 --- a/api/jobs/handlers.py +++ b/api/jobs/handlers.py @@ -3,7 +3,6 @@ """ import bson import copy -import os import StringIO from jsonschema import ValidationError from urlparse import urlparse diff --git a/bin/oneoffs/create_placeholders.py b/bin/oneoffs/create_placeholders.py index b2453ff3b..534ea88c3 100755 --- a/bin/oneoffs/create_placeholders.py +++ b/bin/oneoffs/create_placeholders.py @@ -15,15 +15,22 @@ def get_files_by_prefix(document, prefix): return document -def create_placeholders(): - """ - Create placeholder files to help testing a system using sanitized customer DBs without the corresponding data files. 
- """ +def create_placeholder_file(f_path, extra_content): + target_dir = os.path.dirname(f_path) + if not os.path.exists(target_dir): + os.makedirs(target_dir) + with open(f_path, 'w') as placeholder: + placeholder.write('%s %s' % (f_path, extra_content)) + + +def placeholders_for_collections(): + log.info('Create placeholders for colelctions') COLLECTIONS_PREFIXES = [('projects', 'files'), ('acquisitions', 'files'), ('analyses', 'files'), ('sessions', 'files'), - ('sessions', 'subject.files')] + ('sessions', 'subject.files'), + ('collections', 'files')] _files = [] @@ -42,19 +49,40 @@ def create_placeholders(): base = config.get_item('persistent', 'data_path') for i, f in enumerate(_files): f_path = os.path.join(base, util.path_from_hash(f['fileinfo']['hash'])) - - target_dir = os.path.dirname(f_path) - if not os.path.exists(target_dir): - os.makedirs(target_dir) - with open(f_path, 'w') as placeholder: - placeholder.write('%s %s' % (f_path, f['fileinfo']['size'])) + create_placeholder_file(f_path, f['fileinfo']['size']) # Show progress - if i % (len(_files) / 10) == 0: + if i % (len(_files) / 10 + 1) == 0: log.info('Processed %s files of total %s files ...' % (i, len(_files))) -if __name__ == '__main__': - create_placeholders() +def placeholders_for_gears(): + log.info('Create placeholders for gears') + cursor = config.db.get_collection('gears').find({}) + _files = [] + for document in cursor: + if document['exchange']['git-commit'] == 'local': + f_dict = { + 'gear_id': document['_id'], + 'gear_name': document['gear']['name'], + 'exchange': document['exchange'] + } + _files.append(f_dict) + + base = config.get_item('persistent', 'data_path') + for i, f in enumerate(_files): + f_hash = 'v0-' + f['exchange']['rootfs-hash'].replace(':', '-') + f_path = os.path.join(base, util.path_from_hash(f_hash)) + create_placeholder_file(f_path, f['gear_name']) + + # Show progress + if i % (len(_files) / 10 + 1) == 0: + log.info('Processed %s gear files of total %s files ...' 
% (i, len(_files))) +if __name__ == '__main__': + """ + Create placeholder files to help testing a system using sanitized customer DBs without the corresponding data files. + """ + placeholders_for_collections() + placeholders_for_gears() diff --git a/bin/oneoffs/remove_cas.py b/bin/oneoffs/remove_cas.py index 61b0f3f1e..2cbb852d4 100755 --- a/bin/oneoffs/remove_cas.py +++ b/bin/oneoffs/remove_cas.py @@ -12,9 +12,11 @@ log = logging.getLogger('remove_cas') log.setLevel(logging.INFO) + class MigrationError(Exception): pass + def get_files_by_prefix(document, prefix): for key in prefix.split('.'): document = document.get(key, {}) @@ -23,23 +25,32 @@ def get_files_by_prefix(document, prefix): def copy_file(path, target_path): target_dir = os.path.dirname(target_path) + if not os.path.exists(target_dir): os.makedirs(target_dir) shutil.copy(path, target_path) -def remove_cas(): - """ - Remove CAS logic, generate UUID for the files and rename them on the filesystem, make a copy of the file if more - than one container using the same hash. - """ +def show_progress(current_index, total_files): + if current_index % (total_files / 10 + 1) == 0: + log.info('Processed %s files of total %s files ...' 
% (current_index, total_files)) + + +def cleanup_empty_folders(): + log.info('Cleanup empty folders') + + for _dirpath, _, _ in os.walk(config.get_item('persistent', 'data_path'), topdown=False): + if not (os.listdir(_dirpath) or config.get_item('persistent', 'data_path') == _dirpath): + os.rmdir(_dirpath) + + +def get_collections_files_hashes(): COLLECTIONS_PREFIXES = [('projects', 'files'), ('acquisitions', 'files'), ('analyses', 'files'), ('sessions', 'files'), ('sessions', 'subject.files'), ('collections', 'files')] - _hashes = [] _files = [] @@ -47,10 +58,6 @@ def remove_cas(): cursor = config.db.get_collection(collection).find({}) for document in cursor: for f in get_files_by_prefix(document, prefix): - u = f.get('_id', '') - if u: - continue - _hashes.append(f.get('hash', '')) f_dict = { 'collection_id': document.get('_id'), @@ -60,71 +67,172 @@ def remove_cas(): } _files.append(f_dict) + return _files, _hashes + + +def get_gears_files(): + cursor = config.db.get_collection('gears').find({}) + _files = [] + + for document in cursor: + if document['exchange']['git-commit'] == 'local': + f_dict = { + 'gear_id': document['_id'], + 'gear_name': document['gear']['name'], + 'exchange': document['exchange'] + } + _files.append(f_dict) + + return _files + + +def migrate_collections(): + log.info('Migrate collections...') + + _files, _hashes = get_collections_files_hashes() counter = Counter(_hashes) + base = config.get_item('persistent', 'data_path') + + for i, f in enumerate(_files): + if f['fileinfo'].get('_id', ''): + counter[f['fileinfo']['hash']] -= 1 + continue + try: + f_uuid = str(uuid.uuid4()) + f_path = os.path.join(base, util.path_from_hash(f['fileinfo']['hash'])) + log.debug('copy file %s to %s' % (f_path, util.path_from_uuid(f_uuid))) + copy_file(f_path, files.get_file_abs_path(f_uuid)) + + update_set = { + f['prefix'] + '.$.modified': datetime.datetime.utcnow(), + f['prefix'] + '.$._id': f_uuid + } + log.debug('update file in mongo: %s' % update_set) + 
# Update the file with the newly generated UUID + config.db[f['collection']].find_one_and_update( + {'_id': f['collection_id'], f['prefix'] + '.name': f['fileinfo']['name']}, + {'$set': update_set} + ) + + # Decrease the count of the current hash, so we will know when we can remove the original file + counter[f['fileinfo']['hash']] -= 1 + + if counter[f['fileinfo']['hash']] == 0: + log.debug('remove old file: %s' % f_path) + os.remove(f_path) + + show_progress(i+1, len(_files)) + except Exception as e: + log.exception(e) + raise MigrationError('Wasn\'t able to migrate the \'%s\' ' + 'file in the \'%s\' collection (collection id: %s)' % + (f['fileinfo']['name'], f['collection'], str(f['collection_id'])), e) + + +def rollback_collections(): + log.info('Rollback collections...') + + _files, _ = get_collections_files_hashes() + base = config.get_item('persistent', 'data_path') + + for i, f in enumerate(_files): + if f['fileinfo'].get('_id', ''): + hash_path = os.path.join(base, util.path_from_hash(f['fileinfo']['hash'])) + uuid_path = files.get_file_abs_path(f['fileinfo']['_id']) + if os.path.exists(hash_path) and os.path.exists(uuid_path): + os.remove(uuid_path) + elif os.path.exists(uuid_path): + copy_file(uuid_path, hash_path) + os.remove(uuid_path) + + config.db[f['collection']].find_one_and_update( + {'_id': f['collection_id'], f['prefix'] + '.name': f['fileinfo']['name']}, + {'$unset': {f['prefix'] + '.$._id': ''}} + ) + + show_progress(i + 1, len(_files)) + + +def migrate_gears(): + log.info('Migrate gears...') + + _files = get_gears_files() + base = config.get_item('persistent', 'data_path') + + for i, f in enumerate(_files): + if f['exchange'].get('rootfs-id', ''): + continue + try: + f_uuid = str(uuid.uuid4()) + f_hash = 'v0-' + f['exchange']['rootfs-hash'].replace(':', '-') + f_path = os.path.join(base, util.path_from_hash(f_hash)) + log.debug('copy file %s to %s' % (f_path, util.path_from_uuid(f_uuid))) + copy_file(f_path, files.get_file_abs_path(f_uuid)) + 
+ update_set = { + 'modified': datetime.datetime.utcnow(), + 'exchange.rootfs-id': f_uuid + } + + log.debug('update file in mongo: %s' % update_set) + # Update the gear with the newly generated UUID + config.db['gears'].find_one_and_update( + {'_id': f['gear_id']}, + {'$set': update_set} + ) + + log.debug('remove old file: %s' % f_path) + os.remove(f_path) + + show_progress(i + 1, len(_files)) + except Exception as e: + log.exception(e) + raise MigrationError('Wasn\'t able to migrate the \'%s\' gear (gear id: %s)' % + (f['gear_name'], f['gear_id']), e) + + +def rollback_gears(): + log.info('Rollback gears...') + + _files = get_gears_files() + base = config.get_item('persistent', 'data_path') + + for i, f in enumerate(_files): + if f['exchange'].get('rootfs-id', ''): + f_hash = 'v0-' + f['exchange']['rootfs-hash'].replace(':', '-') + hash_path = os.path.join(base, util.path_from_hash(f_hash)) + uuid_path = files.get_file_abs_path(f['exchange']['rootfs-id']) + if os.path.exists(hash_path) and os.path.exists(uuid_path): + os.remove(uuid_path) + elif os.path.exists(uuid_path): + copy_file(uuid_path, hash_path) + os.remove(uuid_path) + + config.db['gears'].find_one_and_update( + {'_id': f['gear_id']}, + {'$unset': {'exchange.rootfs-id': ''}} + ) + + show_progress(i + 1, len(_files)) + + +def remove_cas(): + """ + Remove CAS logic, generate UUID for the files and rename them on the filesystem, make a copy of the file if more + than one container using the same hash. 
+ """ try: - base = config.get_item('persistent', 'data_path') - for i, f in enumerate(_files): - try: - f_uuid = str(uuid.uuid4()) - f['_id'] = f_uuid - f_path = os.path.join(base, util.path_from_hash(f['fileinfo']['hash'])) - log.debug('copy file %s to %s' % (f_path, util.path_from_uuid(f_uuid))) - copy_file(f_path, files.get_file_abs_path(f_uuid)) - - update_set = { - f['prefix'] + '.$.modified': datetime.datetime.utcnow(), - f['prefix'] + '.$._id': f_uuid - } - log.debug('update file in mongo: %s' % update_set) - # Update the file with the newly generated UUID - config.db[f['collection']].find_one_and_update( - {'_id': f['collection_id'], f['prefix'] + '.name': f['fileinfo']['name']}, - {'$set': update_set} - ) - - # Decrease the count of the current hash, so we will know when we can remove the original file - counter[f['fileinfo']['hash']] -= 1 - - if counter[f['fileinfo']['hash']] == 0: - log.debug('remove old file: %s' % f_path) - os.remove(f_path) - - # Show progress - if i % (len(_files) / 10) == 0: - log.info('Processed %s files of total %s files ...' 
% (i, len(_files))) - - except Exception as e: - log.exception(e) - raise MigrationError('Wasn\'t able to migrate the \'%s\' ' - 'file in the \'%s\' collection (collection id: %s)' % - (f['fileinfo']['name'], f['collection'], str(f['collection_id'])), e) + migrate_collections() + migrate_gears() except MigrationError as e: log.exception(e) - log.info('Rollback...') - base = config.get_item('persistent', 'data_path') - for f in _files: - if f.get('_id', ''): - hash_path = os.path.join(base, util.path_from_hash(f['fileinfo']['hash'])) - uuid_path = files.get_file_abs_path(f['_id']) - if os.path.exists(hash_path) and os.path.exists(uuid_path): - os.remove(uuid_path) - elif os.path.exists(uuid_path): - copy_file(uuid_path, hash_path) - os.remove(uuid_path) - config.db[f['collection']].find_one_and_update( - {'_id': f['collection_id'], f['prefix'] + '.name': f['fileinfo']['name']}, - {'$unset': {f['prefix'] + '.$._id': ''}} - ) + rollback_collections() + rollback_gears() exit(1) - - # Cleanup the empty folders - log.info('Cleanup empty folders') - for _dirpath, _, _ in os.walk(config.get_item('persistent', 'data_path'), topdown=False): - if not (os.listdir(_dirpath) or config.get_item('persistent', 'data_path') == _dirpath): - os.rmdir(_dirpath) + finally: + cleanup_empty_folders() if __name__ == '__main__': remove_cas() - From 88a5272f2403cf0205945b2cf6b6d0e2af36479d Mon Sep 17 00:00:00 2001 From: David Farkas Date: Wed, 17 Jan 2018 14:01:27 +0100 Subject: [PATCH 03/12] Add generic storage --- api/config.py | 11 + api/download.py | 49 ++-- api/files.py | 221 ++++++++------ api/handlers/listhandler.py | 161 +++++----- api/handlers/refererhandler.py | 38 +-- api/jobs/handlers.py | 6 +- api/placer.py | 90 +++--- api/upload.py | 42 ++- api/util.py | 20 +- bin/oneoffs/migrate_storage.py | 216 ++++++++++++++ bin/oneoffs/remove_cas.py | 238 --------------- requirements.txt | 1 + tests/bin/run-tests-ubuntu.sh | 2 + tests/integration_tests/python/conftest.py | 33 ++- 
.../integration_tests/python/test_download.py | 9 +- tests/integration_tests/python/test_jobs.py | 2 + .../python/test_migrate_storage.py | 274 ++++++++++++++++++ 17 files changed, 859 insertions(+), 554 deletions(-) create mode 100644 bin/oneoffs/migrate_storage.py delete mode 100755 bin/oneoffs/remove_cas.py create mode 100644 tests/integration_tests/python/test_migrate_storage.py diff --git a/api/config.py b/api/config.py index c2bbaf696..3fe15c5ac 100644 --- a/api/config.py +++ b/api/config.py @@ -7,6 +7,8 @@ import datetime import elasticsearch +from fs import open_fs + from . import util from .dao.dbutil import try_replace_one @@ -63,6 +65,7 @@ 'db_server_selection_timeout': '3000', 'data_path': os.path.join(os.path.dirname(__file__), '../persistent/data'), 'elasticsearch_host': 'localhost:9200', + 'fs_url': 'osfs://' + os.path.join(os.path.dirname(__file__), '../persistent/data') }, } @@ -321,3 +324,11 @@ def mongo_pipeline(table, pipeline): def get_auth(auth_type): return get_config()['auth'][auth_type] + + +# Storage configuration +fs = open_fs(__config['persistent']['fs_url']) +signed_url_available = ['GCSFS'] + +legacy_fs = open_fs('osfs://' + __config['persistent']['data_path']) +support_legacy_fs = True diff --git a/api/download.py b/api/download.py index 0664cbefa..882b6bda0 100644 --- a/api/download.py +++ b/api/download.py @@ -1,11 +1,13 @@ import os import bson import pytz -import os.path import tarfile import datetime import cStringIO +import fs.path +import fs.errors + from .web import base from .web.request import AccessType from . 
import config, files, util, validators @@ -31,7 +33,7 @@ def _filter_check(property_filter, property_values): class Download(base.RequestHandler): - def _append_targets(self, targets, cont_name, container, prefix, total_size, total_cnt, data_path, filters): + def _append_targets(self, targets, cont_name, container, prefix, total_size, total_cnt, filters): for f in container.get('files', []): if filters: filtered = True @@ -45,10 +47,9 @@ def _append_targets(self, targets, cont_name, container, prefix, total_size, tot break if filtered: continue - file_path = files.get_file_abs_path(f.get('_id', '')) - if not util.file_exists(file_path): - file_path = os.path.join(data_path, util.path_from_hash(f['hash'])) - if util.file_exists(file_path): # silently skip missing files + + file_path, _ = files.get_valid_file(f) + if file_path: # silently skip missing files if cont_name == 'analyses': targets.append((file_path, prefix + '/' + ('input' if f.get('input') else 'output') + '/' + f['name'], cont_name, str(container.get('_id')), f['size'])) else: @@ -60,7 +61,6 @@ def _append_targets(self, targets, cont_name, container, prefix, total_size, tot return total_size, total_cnt def _bulk_preflight_archivestream(self, file_refs): - data_path = config.get_item('persistent', 'data_path') arc_prefix = self.get_param('prefix', 'scitran') file_cnt = 0 total_size = 0 @@ -96,11 +96,8 @@ def _bulk_preflight_archivestream(self, file_refs): log.warn("Expected file {} on Container {} {} to exist but it is missing. 
File will be skipped in download.".format(filename, cont_name, cont_id)) continue - file_id = file_obj.get('_id', '') - file_path = files.get_file_abs_path(file_id) if file_id else '' - if not file_id or not os.path.exists(file_path): - file_path = os.path.join(data_path, util.path_from_hash(file_obj['hash'])) - if os.path.exists(file_path): # silently skip missing files + file_path, _ = files.get_valid_file(file_obj) + if file_path: # silently skip missing files targets.append((file_path, cont_name+'/'+cont_id+'/'+file_obj['name'], cont_name, cont_id, file_obj['size'])) total_size += file_obj['size'] file_cnt += 1 @@ -115,7 +112,6 @@ def _bulk_preflight_archivestream(self, file_refs): def _preflight_archivestream(self, req_spec, collection=None): - data_path = config.get_item('persistent', 'data_path') arc_prefix = self.get_param('prefix', 'scitran') file_cnt = 0 total_size = 0 @@ -140,7 +136,7 @@ def _preflight_archivestream(self, req_spec, collection=None): continue prefix = '/'.join([arc_prefix, project['group'], project['label']]) - total_size, file_cnt = self._append_targets(targets, 'projects', project, prefix, total_size, file_cnt, data_path, req_spec.get('filters')) + total_size, file_cnt = self._append_targets(targets, 'projects', project, prefix, total_size, file_cnt, req_spec.get('filters')) sessions = config.db.sessions.find({'project': item_id, 'deleted': {'$exists': False}}, ['label', 'files', 'uid', 'timestamp', 'timezone', 'subject']) session_dict = {session['_id']: session for session in sessions} @@ -161,19 +157,19 @@ def _preflight_archivestream(self, req_spec, collection=None): for code, subject in subject_dict.iteritems(): subject_prefix = self._path_from_container(prefix, subject, ids_of_paths, code) subject_prefixes[code] = subject_prefix - total_size, file_cnt = self._append_targets(targets, 'subjects', subject, subject_prefix, total_size, file_cnt, data_path, req_spec.get('filters')) + total_size, file_cnt = self._append_targets(targets, 
'subjects', subject, subject_prefix, total_size, file_cnt, req_spec.get('filters')) for session in session_dict.itervalues(): subject_code = session['subject'].get('code', 'unknown_subject') subject = subject_dict[subject_code] session_prefix = self._path_from_container(subject_prefixes[subject_code], session, ids_of_paths, session["_id"]) session_prefixes[session['_id']] = session_prefix - total_size, file_cnt = self._append_targets(targets, 'sessions', session, session_prefix, total_size, file_cnt, data_path, req_spec.get('filters')) + total_size, file_cnt = self._append_targets(targets, 'sessions', session, session_prefix, total_size, file_cnt, req_spec.get('filters')) for acq in acquisitions: session = session_dict[acq['session']] acq_prefix = self._path_from_container(session_prefixes[session['_id']], acq, ids_of_paths, acq['_id']) - total_size, file_cnt = self._append_targets(targets, 'acquisitions', acq, acq_prefix, total_size, file_cnt, data_path, req_spec.get('filters')) + total_size, file_cnt = self._append_targets(targets, 'acquisitions', acq, acq_prefix, total_size, file_cnt, req_spec.get('filters')) elif item['level'] == 'session': @@ -188,7 +184,7 @@ def _preflight_archivestream(self, req_spec, collection=None): if not subject.get('code'): subject['code'] = 'unknown_subject' prefix = self._path_from_container(self._path_from_container(project['group'] + '/' + project['label'], subject, ids_of_paths, subject["code"]), session, ids_of_paths, session['_id']) - total_size, file_cnt = self._append_targets(targets, 'sessions', session, prefix, total_size, file_cnt, data_path, req_spec.get('filters')) + total_size, file_cnt = self._append_targets(targets, 'sessions', session, prefix, total_size, file_cnt, req_spec.get('filters')) # If the param `collection` holding a collection id is not None, filter out acquisitions that are not in the collection a_query = {'session': item_id, 'deleted': {'$exists': False}} @@ -198,7 +194,7 @@ def 
_preflight_archivestream(self, req_spec, collection=None): for acq in acquisitions: acq_prefix = self._path_from_container(prefix, acq, ids_of_paths, acq['_id']) - total_size, file_cnt = self._append_targets(targets, 'acquisitions', acq, acq_prefix, total_size, file_cnt, data_path, req_spec.get('filters')) + total_size, file_cnt = self._append_targets(targets, 'acquisitions', acq, acq_prefix, total_size, file_cnt, req_spec.get('filters')) elif item['level'] == 'acquisition': acq = config.db.acquisitions.find_one(base_query, ['session', 'label', 'files', 'uid', 'timestamp', 'timezone']) @@ -214,7 +210,7 @@ def _preflight_archivestream(self, req_spec, collection=None): project = config.db.projects.find_one({'_id': session['project']}, ['group', 'label']) prefix = self._path_from_container(self._path_from_container(self._path_from_container(project['group'] + '/' + project['label'], subject, ids_of_paths, subject['code']), session, ids_of_paths, session["_id"]), acq, ids_of_paths, acq['_id']) - total_size, file_cnt = self._append_targets(targets, 'acquisitions', acq, prefix, total_size, file_cnt, data_path, req_spec.get('filters')) + total_size, file_cnt = self._append_targets(targets, 'acquisitions', acq, prefix, total_size, file_cnt, req_spec.get('filters')) elif item['level'] == 'analysis': analysis = config.db.analyses.find_one(base_query, ['parent', 'label', 'files', 'uid', 'timestamp']) @@ -224,7 +220,7 @@ def _preflight_archivestream(self, req_spec, collection=None): continue prefix = self._path_from_container("", analysis, ids_of_paths, util.sanitize_string_to_filename(analysis['label'])) filename = 'analysis_' + util.sanitize_string_to_filename(analysis['label']) + '.tar' - total_size, file_cnt = self._append_targets(targets, 'analyses', analysis, prefix, total_size, file_cnt, data_path, req_spec.get('filters')) + total_size, file_cnt = self._append_targets(targets, 'analyses', analysis, prefix, total_size, file_cnt, req_spec.get('filters')) if len(targets) > 
0: if not filename: @@ -282,8 +278,9 @@ def archivestream(self, ticket): stream = cStringIO.StringIO() with tarfile.open(mode='w|', fileobj=stream) as archive: for filepath, arcpath, cont_name, cont_id, _ in ticket['target']: - yield archive.gettarinfo(filepath, arcpath).tobuf() - with open(filepath, 'rb') as fd: + file_system = files.get_fs_by_file_path(filepath) + with file_system.open(filepath, 'rb') as fd: + yield archive.gettarinfo(fileobj=fd, arcname=arcpath).tobuf() chunk = '' for chunk in iter(lambda: fd.read(CHUNKSIZE), ''): # pylint: disable=cell-var-from-loop yield chunk @@ -293,11 +290,11 @@ def archivestream(self, ticket): yield stream.getvalue() # get tar stream trailer stream.close() - def symlinkarchivestream(self, ticket, data_path): + def symlinkarchivestream(self, ticket): for filepath, arcpath, cont_name, cont_id, _ in ticket['target']: t = tarfile.TarInfo(name=arcpath) t.type = tarfile.SYMTYPE - t.linkname = os.path.relpath(filepath, data_path) + t.linkname = fs.path.relpath(filepath) yield t.tobuf() self.log_user_access(AccessType.download_file, cont_name=cont_name, cont_id=cont_id, filename=os.path.basename(arcpath), multifile=True, origin_override=ticket['origin']) # log download stream = cStringIO.StringIO() @@ -316,7 +313,7 @@ def download(self): if ticket['ip'] != self.request.client_addr: self.abort(400, 'ticket not for this source IP') if self.get_param('symlinks'): - self.response.app_iter = self.symlinkarchivestream(ticket, config.get_item('persistent', 'data_path')) + self.response.app_iter = self.symlinkarchivestream(ticket) else: self.response.app_iter = self.archivestream(ticket) self.response.headers['Content-Type'] = 'application/octet-stream' diff --git a/api/files.py b/api/files.py index b7c7b9289..963be4950 100644 --- a/api/files.py +++ b/api/files.py @@ -1,99 +1,106 @@ import os import cgi import json -import shutil +import six import hashlib -from backports import tempfile - -from . import util -from . 
import config - -DEFAULT_HASH_ALG='sha384' - -def move_file(path, target_path): - target_dir = os.path.dirname(target_path) - if not os.path.exists(target_dir): - os.makedirs(target_dir) - shutil.move(path, target_path) - - -def move_form_file_field_into_storage(file_field): - """ - Given a file form field, move the (downloaded, tempdir-stored) file into the final storage. - - Requires an augmented file field; see upload.process_upload() for details. - """ - - if not file_field.uuid or not file_field.path: - raise Exception("Field is not a file field with uuid and path") +import fs.move +import fs.tempfs +import fs.path +import fs.errors + +from . import config, util + +DEFAULT_HASH_ALG = 'sha384' + +class FileProcessor(object): + def __init__(self, base, presistent_fs): + self.base = base + self._temp_fs = fs.tempfs.TempFS(identifier='.temp', temp_dir=self.base) + self._presistent_fs = presistent_fs + + def store_temp_file(self, src_path, dest_path): + if not isinstance(src_path, unicode): + src_path = six.u(src_path) + if not isinstance(dest_path, unicode): + dest_path = six.u(dest_path) + dst_dir = fs.path.dirname(dest_path) + self._presistent_fs.makedirs(dst_dir, recreate=True) + fs.move.move_file(src_fs=self.temp_fs, src_path=src_path, dst_fs=self._presistent_fs, dst_path=dest_path) + + def process_form(self, request): + """ + Some workarounds to make webapp2 process forms in an intelligent way. + Normally webapp2/WebOb Reqest.POST would copy the entire request stream + into a single file on disk. + https://github.com/Pylons/webob/blob/cb9c0b4f51542a7d0ed5cc5bf0a73f528afbe03e/webob/request.py#L787 + https://github.com/moraes/webapp-improved/pull/12 + We pass request.body_file (wrapped wsgi input stream) + to our custom subclass of cgi.FieldStorage to write each upload file + to a separate file on disk, as it comes in off the network stream from the client. + Then we can rename these files to their final destination, + without copying the data gain. 
+ + Returns (tuple): + form: SingleFileFieldStorage instance + tempdir: tempdir the file was stored in. + + Keep tempdir in scope until you don't need it anymore; it will be deleted on GC. + """ + + # Copied from WebOb source: + # https://github.com/Pylons/webob/blob/cb9c0b4f51542a7d0ed5cc5bf0a73f528afbe03e/webob/request.py#L790 + env = request.environ.copy() + env.setdefault('CONTENT_LENGTH', '0') + env['QUERY_STRING'] = '' + + field_storage_class = get_single_file_field_storage(self._temp_fs) + + form = field_storage_class( + fp=request.body_file, environ=env, keep_blank_values=True + ) - move_file(file_field.path, get_file_abs_path(file_field.uuid)) + return form + def hash_file_formatted(self, filepath, f_system, hash_alg=None, buffer_size=65536): + """ + Return the scitran-formatted hash of a file, specified by path. + """ -def hash_file_formatted(path, hash_alg=None, buffer_size=65536): - """ - Return the scitran-formatted hash of a file, specified by path. - """ + if not isinstance(filepath, unicode): + filepath = six.u(filepath) - hash_alg = hash_alg or DEFAULT_HASH_ALG - hasher = hashlib.new(hash_alg) + hash_alg = hash_alg or DEFAULT_HASH_ALG + hasher = hashlib.new(hash_alg) - with open(path, 'rb') as f: - while True: - data = f.read(buffer_size) - if not data: - break - hasher.update(data) + with f_system.open(filepath, 'rb') as f: + while True: + data = f.read(buffer_size) + if not data: + break + hasher.update(data) - return util.format_hash(hash_alg, hasher.hexdigest()) + return util.format_hash(hash_alg, hasher.hexdigest()) + @property + def temp_fs(self): + return self._temp_fs -class FileStoreException(Exception): - pass + @property + def persistent_fs(self): + return self._presistent_fs + def __exit__(self, exc, value, tb): + self.close() -def process_form(request): - """ - Some workarounds to make webapp2 process forms in an intelligent way. - Normally webapp2/WebOb Reqest.POST would copy the entire request stream - into a single file on disk. 
- https://github.com/Pylons/webob/blob/cb9c0b4f51542a7d0ed5cc5bf0a73f528afbe03e/webob/request.py#L787 - https://github.com/moraes/webapp-improved/pull/12 - We pass request.body_file (wrapped wsgi input stream) - to our custom subclass of cgi.FieldStorage to write each upload file - to a separate file on disk, as it comes in off the network stream from the client. - Then we can rename these files to their final destination, - without copying the data gain. - - Returns (tuple): - form: SingleFileFieldStorage instance - tempdir: tempdir the file was stored in. - - Keep tempdir in scope until you don't need it anymore; it will be deleted on GC. - """ - - # Store form file fields in a tempdir - tempdir = tempfile.TemporaryDirectory(prefix='.tmp', dir=config.get_item('persistent', 'data_path')) - - # Copied from WebOb source: - # https://github.com/Pylons/webob/blob/cb9c0b4f51542a7d0ed5cc5bf0a73f528afbe03e/webob/request.py#L790 - env = request.environ.copy() - env.setdefault('CONTENT_LENGTH', '0') - env['QUERY_STRING'] = '' - - field_storage_class = get_single_file_field_storage( - tempdir.name - ) + def __del__(self): + self.close() - form = field_storage_class( - fp=request.body_file, environ=env, keep_blank_values=True - ) + def close(self): + self.temp_fs.close() - return form, tempdir - -def get_single_file_field_storage(upload_dir): +def get_single_file_field_storage(file_system): # pylint: disable=attribute-defined-outside-init # We dynamically create this class because we @@ -105,12 +112,15 @@ def get_single_file_field_storage(upload_dir): # https://github.com/python/cpython/blob/1e3e162ff5c0cc656559c43914439ab3e5734f00/Lib/cgi.py#L728 class SingleFileFieldStorage(cgi.FieldStorage): - bufsize = 2**20 + bufsize = 2 ** 20 def make_file(self, binary=None): # Sanitize form's filename (read: prevent malicious escapes, bad characters, etc) - self.filename = os.path.basename(self.filename) - self.open_file = open(os.path.join(upload_dir, self.filename), 'wb') + + 
self.filename = fs.path.basename(self.filename) + if not isinstance(self.filename, unicode): + self.filename = six.u(self.filename) + self.open_file = file_system.open(self.filename, 'wb') return self.open_file # override private method __write of superclass FieldStorage @@ -128,6 +138,9 @@ def _FieldStorage__write(self, line): return SingleFileFieldStorage +class FileStoreException(Exception): + pass + # File extension --> scitran file type detection hueristics. # Listed in precendence order. with open(os.path.join(os.path.dirname(__file__), 'filetypes.json')) as fd: @@ -147,6 +160,46 @@ def guess_type_from_filename(filename): return filetype -def get_file_abs_path(file_id): - version = 'v1' - return os.path.join(config.get_item('persistent', 'data_path'), version, util.path_from_uuid(file_id)) +def get_valid_file(file_info): + file_id = file_info.get('_id', '') + file_hash = file_info.get('hash', '') + file_uuid_path = None + file_hash_path = None + + if file_hash: + file_hash_path = util.path_from_hash(file_hash) + + if file_id: + file_uuid_path = util.path_from_uuid(file_id) + + if config.support_legacy_fs: + if file_hash_path and config.legacy_fs.isfile(file_hash_path): + return file_hash_path, config.legacy_fs + elif file_uuid_path and config.legacy_fs.isfile(file_uuid_path): + return file_uuid_path, config.legacy_fs + + if file_uuid_path and config.fs.isfile(file_uuid_path): + return file_uuid_path, config.fs + else: + raise fs.errors.ResourceNotFound('File not found: %s', file_info['name']) + + +def get_signed_url(file_path, file_system, filename=None): + try: + if type(file_system).__name__ in config.signed_url_available: + return file_system.geturl(file_path, filename=filename) + except fs.errors.NoURL: + return None + + +def get_fs_by_file_path(file_path): + if config.support_legacy_fs: + if config.legacy_fs.isfile(file_path): + return config.legacy_fs + elif config.legacy_fs.isfile(file_path): + return config.legacy_fs + + if 
config.fs.isfile(file_path): + return config.fs + else: + raise fs.errors.ResourceNotFound('File not found: %s', file_path) diff --git a/api/handlers/listhandler.py b/api/handlers/listhandler.py index e31b163ea..fcab5e3ae 100644 --- a/api/handlers/listhandler.py +++ b/api/handlers/listhandler.py @@ -1,4 +1,3 @@ -import os import bson import copy import datetime @@ -357,24 +356,25 @@ def _check_ticket(self, ticket_id, _id, filename): return ticket @staticmethod - def build_zip_info(filepath): + def build_zip_info(file_path, file_system): """ Builds a json response containing member and comment info for a zipfile """ - with zipfile.ZipFile(filepath) as zf: - info = {} - info['comment'] = zf.comment - info['members'] = [] - for zi in zf.infolist(): - m = {} - m['path'] = zi.filename - m['size'] = zi.file_size - m['timestamp'] = datetime.datetime(*zi.date_time) - m['comment'] = zi.comment - - info['members'].append(m) - - return info + with file_system.open(file_path, 'rb') as f: + with zipfile.ZipFile(f) as zf: + info = {} + info['comment'] = zf.comment + info['members'] = [] + for zi in zf.infolist(): + m = {} + m['path'] = zi.filename + m['size'] = zi.file_size + m['timestamp'] = datetime.datetime(*zi.date_time) + m['comment'] = zi.comment + + info['members'].append(m) + + return info def get(self, cont_name, list_name, **kwargs): _id = kwargs.pop('cid') @@ -403,10 +403,8 @@ def get(self, cont_name, list_name, **kwargs): hash_ = self.get_param('hash') if hash_ and hash_ != fileinfo['hash']: self.abort(409, 'file exists, hash mismatch') - data_path = config.get_item('persistent', 'data_path') - file_path = files.get_file_abs_path(fileinfo.get('_id', '')) - if not util.file_exists(file_path): - file_path = os.path.join(data_path, util.path_from_hash(fileinfo['hash'])) + + file_path, file_system = files.get_valid_file(fileinfo) # Request for download ticket if self.get_param('ticket') == '': @@ -416,18 +414,19 @@ def get(self, cont_name, list_name, **kwargs): # Request 
for info about zipfile elif self.is_true('info'): try: - info = self.build_zip_info(file_path) + info = self.build_zip_info(file_path, file_system) + return info except zipfile.BadZipfile: self.abort(400, 'not a zip file') - return info # Request to download zipfile member elif self.get_param('member') is not None: zip_member = self.get_param('member') try: - with zipfile.ZipFile(file_path) as zf: - self.response.headers['Content-Type'] = util.guess_mimetype(zip_member) - self.response.write(zf.open(zip_member).read()) + with file_system.open(file_path, 'rb') as f: + with zipfile.ZipFile(f) as zf: + self.response.headers['Content-Type'] = util.guess_mimetype(zip_member) + self.response.write(zf.open(zip_member).read()) except zipfile.BadZipfile: self.abort(400, 'not a zip file') except KeyError: @@ -442,67 +441,71 @@ def get(self, cont_name, list_name, **kwargs): # Authenticated or ticketed download request else: - range_header = self.request.headers.get('Range', '') - try: - if not self.is_true('view'): - raise util.RangeHeaderParseError('Feature flag not set') + signed_url = files.get_signed_url(file_path, file_system, filename=filename) + if signed_url: + self.redirect(signed_url) + else: + range_header = self.request.headers.get('Range', '') + try: + if not self.is_true('view'): + raise util.RangeHeaderParseError('Feature flag not set') - ranges = util.parse_range_header(range_header) - for first, last in ranges: - if first > fileinfo['size'] - 1: - self.abort(416, 'Invalid range') + ranges = util.parse_range_header(range_header) + for first, last in ranges: + if first > fileinfo['size'] - 1: + self.abort(416, 'Invalid range') - if last > fileinfo['size'] - 1: - raise util.RangeHeaderParseError('Invalid range') + if last > fileinfo['size'] - 1: + raise util.RangeHeaderParseError('Invalid range') - except util.RangeHeaderParseError: - self.response.app_iter = open(file_path, 'rb') - self.response.headers['Content-Length'] = str(fileinfo['size']) # must be set 
after setting app_iter + except util.RangeHeaderParseError: + self.response.app_iter = file_system.open(file_path, 'rb') + self.response.headers['Content-Length'] = str(fileinfo['size']) # must be set after setting app_iter - if self.is_true('view'): - self.response.headers['Content-Type'] = str(fileinfo.get('mimetype', 'application/octet-stream')) - else: - self.response.headers['Content-Type'] = 'application/octet-stream' - self.response.headers['Content-Disposition'] = 'attachment; filename="' + filename + '"' - else: - self.response.status = 206 - if len(ranges) > 1: - self.response.headers['Content-Type'] = 'multipart/byteranges; boundary=%s' % self.request.id + if self.is_true('view'): + self.response.headers['Content-Type'] = str(fileinfo.get('mimetype', 'application/octet-stream')) + else: + self.response.headers['Content-Type'] = 'application/octet-stream' + self.response.headers['Content-Disposition'] = 'attachment; filename="' + filename + '"' else: - self.response.headers['Content-Type'] = str( - fileinfo.get('mimetype', 'application/octet-stream')) - self.response.headers['Content-Range'] = 'bytes %s-%s/%s' % (str(ranges[0][0]), - str(ranges[0][1]), - str(fileinfo['size'])) - with open(file_path, 'rb') as f: - for first, last in ranges: - mode = os.SEEK_SET - if first < 0: - mode = os.SEEK_END - length = abs(first) - elif last is None: - length = fileinfo['size'] - first - else: - if last > fileinfo['size']: + self.response.status = 206 + if len(ranges) > 1: + self.response.headers['Content-Type'] = 'multipart/byteranges; boundary=%s' % self.request.id + else: + self.response.headers['Content-Type'] = str( + fileinfo.get('mimetype', 'application/octet-stream')) + self.response.headers['Content-Range'] = 'bytes %s-%s/%s' % (str(ranges[0][0]), + str(ranges[0][1]), + str(fileinfo['size'])) + with file_system.open(file_path, 'rb') as f: + for first, last in ranges: + mode = os.SEEK_SET + if first < 0: + mode = os.SEEK_END + length = abs(first) + elif last 
is None: length = fileinfo['size'] - first else: - length = last - first + 1 - - f.seek(first, mode) - data = f.read(length) - - if len(ranges) > 1: - self.response.write('--%s\n' % self.request.id) - self.response.write('Content-Type: %s\n' % str( - fileinfo.get('mimetype', 'application/octet-stream'))) - self.response.write('Content-Range: %s' % 'bytes %s-%s/%s\n' % (str(first), - str(last), - str(fileinfo['size']))) - self.response.write('\n') - self.response.write(data) - self.response.write('\n') - else: - self.response.write(data) + if last > fileinfo['size']: + length = fileinfo['size'] - first + else: + length = last - first + 1 + + f.seek(first, mode) + data = f.read(length) + + if len(ranges) > 1: + self.response.write('--%s\n' % self.request.id) + self.response.write('Content-Type: %s\n' % str( + fileinfo.get('mimetype', 'application/octet-stream'))) + self.response.write('Content-Range: %s' % 'bytes %s-%s/%s\n' % (str(first), + str(last), + str(fileinfo['size']))) + self.response.write('\n') + self.response.write(data) + self.response.write('\n') + else: + self.response.write(data) # log download if we haven't already for this ticket if ticket: diff --git a/api/handlers/refererhandler.py b/api/handlers/refererhandler.py index 441eb9daa..2ca729d4c 100644 --- a/api/handlers/refererhandler.py +++ b/api/handlers/refererhandler.py @@ -7,7 +7,6 @@ """ import bson -import os import zipfile import datetime from abc import ABCMeta, abstractproperty @@ -359,30 +358,25 @@ def download(self, **kwargs): self.abort(404, "{} doesn't exist".format(filename)) else: fileinfo = fileinfo[0] - data_path = config.get_item('persistent', 'data_path') - file_path = files.get_file_abs_path(fileinfo.get('_id', '')) - if not util.file_exists(file_path): - file_path = os.path.join( - data_path, - util.path_from_hash(fileinfo['hash']) - ) + file_path, file_system = files.get_valid_file(fileinfo) filename = fileinfo['name'] # Request for info about zipfile if self.is_true('info'): 
try: - info = FileListHandler.build_zip_info(file_path) + info = FileListHandler.build_zip_info(file_path, file_system) + return info except zipfile.BadZipfile: self.abort(400, 'not a zip file') - return info # Request to download zipfile member elif self.get_param('member') is not None: zip_member = self.get_param('member') try: - with zipfile.ZipFile(file_path) as zf: - self.response.headers['Content-Type'] = util.guess_mimetype(zip_member) - self.response.write(zf.open(zip_member).read()) + with file_system.open(file_path, 'rb') as f: + with zipfile.ZipFile(f) as zf: + self.response.headers['Content-Type'] = util.guess_mimetype(zip_member) + self.response.write(zf.open(zip_member).read()) except zipfile.BadZipfile: self.abort(400, 'not a zip file') except KeyError: @@ -397,7 +391,7 @@ def download(self, **kwargs): # Request to download the file itself else: - self.response.app_iter = open(file_path, 'rb') + self.response.app_iter = file_system.open(file_path, 'rb') self.response.headers['Content-Length'] = str(fileinfo['size']) # must be set after setting app_iter if self.is_true('view'): self.response.headers['Content-Type'] = str(fileinfo.get('mimetype', 'application/octet-stream')) @@ -439,17 +433,13 @@ def _prepare_batch(self, fileinfo, analysis): ## we need a way to avoid this targets = [] total_size = total_cnt = 0 - data_path = config.get_item('persistent', 'data_path') for f in fileinfo: - file_path = files.get_file_abs_path(f.get('_id', '')) - if not util.file_exists(file_path): - file_path = os.path.join(data_path, util.path_from_hash(f['hash'])) - if util.file_exists(file_path): # silently skip missing files - targets.append((file_path, - util.sanitize_string_to_filename(analysis['label']) + '/' + ('input' if f.get('input') else 'output') + '/'+ f['name'], - 'analyses', analysis['_id'], f['size'])) - total_size += f['size'] - total_cnt += 1 + file_path, _ = files.get_valid_file(f) + targets.append((file_path, + 
util.sanitize_string_to_filename(analysis['label']) + '/' + ('input' if f.get('input') else 'output') + '/'+ f['name'], + 'analyses', analysis['_id'], f['size'])) + total_size += f['size'] + total_cnt += 1 return targets, total_size, total_cnt diff --git a/api/jobs/handlers.py b/api/jobs/handlers.py index 7e69d49af..b3e919e2c 100644 --- a/api/jobs/handlers.py +++ b/api/jobs/handlers.py @@ -88,8 +88,10 @@ def download(self, **kwargs): # pragma: no cover """Download gear tarball file""" dl_id = kwargs.pop('cid') gear = get_gear(dl_id) - file_id = gear['exchange']['rootfs-id'] - file_path = files.get_file_abs_path(file_id) + file_path, file_system = files.get_valid_file({ + '_id': gear['exchange'].get('rootfs-id', ''), + 'hash': 'v0-' + gear['exchange']['rootfs-hash'].replace(':', '-') + }) stream = open(file_path, 'rb') set_for_download(self.response, stream=stream, filename='gear.tar') diff --git a/api/placer.py b/api/placer.py index b4a90ea4a..2e37a92db 100644 --- a/api/placer.py +++ b/api/placer.py @@ -2,18 +2,14 @@ import copy import datetime import dateutil -import os import pymongo -import shutil import uuid import zipfile -from backports import tempfile +import fs.path +import fs.errors -from . import config -from . import files -from . import util -from . import validators +from . import config, util, validators from .dao import containerutil, hierarchy from .dao.containerstorage import SessionStorage, AcquisitionStorage from .jobs import rules @@ -28,7 +24,7 @@ class Placer(object): Interface for a placer, which knows how to process files and place them where they belong - on disk and database. 
""" - def __init__(self, container_type, container, id_, metadata, timestamp, origin, context): + def __init__(self, container_type, container, id_, metadata, timestamp, origin, context, file_processor): self.container_type = container_type self.container = container self.id_ = id_ @@ -47,6 +43,8 @@ def __init__(self, container_type, container, id_, metadata, timestamp, origin, # A list of files that have been saved via save_file() usually returned by finalize() self.saved = [] + self.file_processor = file_processor + def check(self): """ @@ -87,10 +85,12 @@ def save_file(self, field=None, file_attrs=None): Requires an augmented file field; see process_upload() for details. """ - # Save file - if field is not None: - files.move_form_file_field_into_storage(field) + if field is not None and self.file_processor is not None: + self.file_processor.store_temp_file(field.path, util.path_from_uuid(field.uuid)) + + # if field is not None: + # files.move_form_file_field_into_storage(field) # Update the DB if file_attrs is not None: @@ -140,8 +140,8 @@ class UIDPlacer(Placer): create_hierarchy = staticmethod(hierarchy.upsert_top_down_hierarchy) match_type = 'uid' - def __init__(self, container_type, container, id_, metadata, timestamp, origin, context): - super(UIDPlacer, self).__init__(container_type, container, id_, metadata, timestamp, origin, context) + def __init__(self, container_type, container, id_, metadata, timestamp, origin, context, file_processor): + super(UIDPlacer, self).__init__(container_type, container, id_, metadata, timestamp, origin, context, file_processor) self.metadata_for_file = {} self.session_id = None self.count = 0 @@ -191,7 +191,7 @@ def process_file_field(self, field, file_attrs): self.save_file(field, file_attrs) else: if field is not None: - files.move_form_file_field_into_storage(field) + self.file_processor.store_temp_file(field.path, util.path_from_uuid(field.uuid)) if file_attrs is not None: container.upsert_file(file_attrs) @@ -363,8 
+363,8 @@ class TokenPlacer(Placer): Intended for use with a token that tracks where the files will be stored. """ - def __init__(self, container_type, container, id_, metadata, timestamp, origin, context): - super(TokenPlacer, self).__init__(container_type, container, id_, metadata, timestamp, origin, context) + def __init__(self, container_type, container, id_, metadata, timestamp, origin, context, file_processor): + super(TokenPlacer, self).__init__(container_type, container, id_, metadata, timestamp, origin, context, file_processor) self.paths = [] self.folder = None @@ -381,10 +381,9 @@ def check(self): # upload.clean_packfile_tokens # # It must be kept in sync between each instance. - base_path = config.get_item('persistent', 'data_path') - self.folder = os.path.join(base_path, 'tokens', 'packfile', token) + self.folder = fs.path.join('tokens', 'packfile', token) - util.mkdir_p(self.folder) + util.mkdir_p(self.folder, config.fs) def process_file_field(self, field, file_attrs): self.saved.append(file_attrs) @@ -392,8 +391,8 @@ def process_file_field(self, field, file_attrs): def finalize(self): for path in self.paths: - dest = os.path.join(self.folder, os.path.basename(path)) - shutil.move(path, dest) + dest = fs.path.join(self.folder, path) + self.file_processor.store_temp_file(path, dest) self.recalc_session_compliance() return self.saved @@ -402,8 +401,8 @@ class PackfilePlacer(Placer): A placer that can accept N files, save them into a zip archive, and place the result on an acquisition. 
""" - def __init__(self, container_type, container, id_, metadata, timestamp, origin, context): - super(PackfilePlacer, self).__init__(container_type, container, id_, metadata, timestamp, origin, context) + def __init__(self, container_type, container, id_, metadata, timestamp, origin, context, file_processor): + super(PackfilePlacer, self).__init__(container_type, container, id_, metadata, timestamp, origin, context, file_processor) # This endpoint is an SSE endpoint self.sse = True @@ -439,10 +438,11 @@ def check(self): # upload.clean_packfile_tokens # # It must be kept in sync between each instance. - base_path = config.get_item('persistent', 'data_path') - self.folder = os.path.join(base_path, 'tokens', 'packfile', token) + self.folder = fs.path.join('tokens', 'packfile', token) - if not os.path.isdir(self.folder): + try: + config.fs.isdir(self.folder) + except fs.errors.ResourceNotFound: raise Exception('Packfile directory does not exist or has been deleted') self.requireMetadata() @@ -480,27 +480,20 @@ def check(self): stamp = minimum # Remember the timestamp integer for later use with os.utime. - self.ziptime = int(dateutil.parser.parse(stamp).strftime('%s')) + self.ziptime = dateutil.parser.parse(stamp) # The zipfile is a santizied acquisition label self.dir_ = util.sanitize_string_to_filename(self.a_label) self.name = self.dir_ + '.zip' - # Make a tempdir to store zip until moved - # OPPORTUNITY: this is also called in files.py. Could be a util func. - self.tempdir = tempfile.TemporaryDirectory(prefix='.tmp', dir=config.get_item('persistent', 'data_path')) - # Create a zip in the tempdir that later gets moved into the CAS. 
- self.path = os.path.join(self.tempdir.name, 'temp.zip') - self.zip_ = zipfile.ZipFile(self.path, 'w', zipfile.ZIP_DEFLATED, allowZip64=True) + self.path = u'temp.zip' + self.zip_ = zipfile.ZipFile(self.file_processor.temp_fs.open(self.path, 'wb'), + 'w', zipfile.ZIP_DEFLATED, allowZip64=True) # OPPORTUNITY: add zip comment # self.zip.comment = json.dumps(metadata, default=metadata_encoder) - # Bit of a silly hack: write our tempdir directory into the zip (not including its contents). - # Creates an empty directory entry in the zip which will hold all the files inside. - # This way, when you expand a zip, you'll get folder/things instead of a thousand dicoms splattered everywhere. - self.zip_.write(self.tempdir.name, self.dir_) def process_file_field(self, field, file_attrs): # Should not be called with any files @@ -508,19 +501,20 @@ def process_file_field(self, field, file_attrs): def finalize(self): - paths = os.listdir(self.folder) + paths = self.file_processor.persistent_fs.listdir(self.folder) total = len(paths) # Write all files to zip complete = 0 for path in paths: - p = os.path.join(self.folder, path) + p = fs.path.join(self.folder, path) # Set the file's mtime & atime. - os.utime(p, (self.ziptime, self.ziptime)) + self.file_processor.persistent_fs.settimes(p, self.ziptime, self.ziptime) # Place file into the zip folder we created before - self.zip_.write(p, os.path.join(self.dir_, os.path.basename(path))) + with self.file_processor.persistent_fs.open(p, 'rb') as f: + self.zip_.writestr(fs.path.join(self.dir_, path), f.read()) # Report progress complete += 1 @@ -532,7 +526,7 @@ def finalize(self): self.zip_.close() # Remove the folder created by TokenPlacer - shutil.rmtree(self.folder) + self.file_processor.persistent_fs.removetree(self.folder) # Lookup uid on token token = self.context['token'] @@ -547,10 +541,10 @@ def finalize(self): # Not a great practice. See process_upload() for details. 
cgi_field = util.obj_from_map({ 'filename': self.name, - 'path': self.path, - 'size': os.path.getsize(self.path), - 'hash': files.hash_file_formatted(self.path), - 'uuid': str(uuid.uuid4()), + 'path': self.path, + 'size': self.file_processor.temp_fs.getsize(self.path), + 'hash': self.file_processor.hash_file_formatted(self.path, self.file_processor.temp_fs), + 'uuid': str(uuid.uuid4()), 'mimetype': util.guess_mimetype('lol.zip'), 'modified': self.timestamp }) @@ -560,9 +554,9 @@ def finalize(self): # Used in the API return. cgi_attrs = { '_id': cgi_field.uuid, - 'name': cgi_field.filename, + 'name': cgi_field.filename, 'modified': cgi_field.modified, - 'size': cgi_field.size, + 'size': cgi_field.size, 'hash': cgi_field.hash, 'mimetype': cgi_field.mimetype, diff --git a/api/upload.py b/api/upload.py index a98123e40..d0a3bd6ff 100644 --- a/api/upload.py +++ b/api/upload.py @@ -1,10 +1,10 @@ import bson import datetime import json -import os.path -import shutil import uuid +import fs.path + from .web import base from .web.errors import FileStoreException, FileFormException from . import config @@ -75,7 +75,8 @@ def process_upload(request, strategy, container_type=None, id_=None, origin=None # The vast majority of this function's wall-clock time is spent here. # Tempdir is deleted off disk once out of scope, so let's hold onto this reference. 
- form, tempdir = files.process_form(request) + file_processor = files.FileProcessor(config.get_item('persistent', 'data_path'), config.fs) + form = file_processor.process_form(request) if 'metadata' in form: try: @@ -84,7 +85,7 @@ def process_upload(request, strategy, container_type=None, id_=None, origin=None raise FileStoreException('wrong format for field "metadata"') placer_class = strategy.value - placer = placer_class(container_type, container, id_, metadata, timestamp, origin, context) + placer = placer_class(container_type, container, id_, metadata, timestamp, origin, context, file_processor) placer.check() # Browsers, when sending a multipart upload, will send files with field name "file" (if sinuglar) @@ -102,19 +103,17 @@ def process_upload(request, strategy, container_type=None, id_=None, origin=None # Augment the cgi.FieldStorage with a variety of custom fields. # Not the best practice. Open to improvements. # These are presumbed to be required by every function later called with field as a parameter. 
- field.path = os.path.join(tempdir.name, field.filename) - if not os.path.exists(field.path): - tempdir_exists = os.path.exists(tempdir.name) - raise Exception("file {} does not exist, tmpdir {} exists: {}, files in tmpdir: {}".format( + field.path = field.filename + if not file_processor.temp_fs.exists(field.path): + #tempdir_exists = os.path.exists(tempdir.name) + raise Exception("file {} does not exist, files in tmpdir: {}".format( field.path, - tempdir.name, - tempdir_exists, - tempdir_exists and os.listdir(tempdir.name), + file_processor.temp_fs.listdir('/'), )) - field.size = os.path.getsize(field.path) - field.hash = files.hash_file_formatted(field.path) + field.size = file_processor.temp_fs.getsize(field.path) + field.hash = file_processor.hash_file_formatted(field.path, file_processor.temp_fs) field.uuid = str(uuid.uuid4()) - field.mimetype = util.guess_mimetype(field.filename) # TODO: does not honor metadata's mime type if any + field.mimetype = util.guess_mimetype(field.filename) # TODO: does not honor metadata's mime type if any field.modified = timestamp # create a file-attribute map commonly used elsewhere in the codebase. @@ -122,7 +121,7 @@ def process_upload(request, strategy, container_type=None, id_=None, origin=None file_attrs = { '_id': field.uuid, 'name': field.filename, - 'modified': field.modified, # + 'modified': field.modified, 'size': field.size, 'mimetype': field.mimetype, 'hash': field.hash, @@ -240,15 +239,14 @@ def clean_packfile_tokens(self): # upload.clean_packfile_tokens # # It must be kept in sync between each instance. 
- basepath = config.get_item('persistent', 'data_path') - folder = os.path.join(basepath, 'tokens', 'packfile') + folder = fs.path.join('tokens', 'packfile') - util.mkdir_p(folder) - paths = os.listdir(folder) + util.mkdir_p(folder, config.fs) + paths = config.fs.listdir(folder) cleaned = 0 for token in paths: - path = os.path.join(folder, token) + path = fs.path.join(folder, token) result = None try: @@ -261,7 +259,7 @@ def clean_packfile_tokens(self): if result is None: log.info('Cleaning expired token directory ' + token) - shutil.rmtree(path) + config.fs.removetree(path) cleaned += 1 return { @@ -272,7 +270,7 @@ def clean_packfile_tokens(self): } def extract_file_fields(form): - """Returns a list of file fields in the form, handling multiple values""" + """Returns a list of file fields in the form, handling multiple values""" result = [] for fieldname in form: field = form[fieldname] diff --git a/api/util.py b/api/util.py index 4b88a2100..edacef534 100644 --- a/api/util.py +++ b/api/util.py @@ -1,6 +1,5 @@ import datetime import enum as baseEnum -import errno import hashlib import json import mimetypes @@ -11,6 +10,9 @@ import string import uuid +import fs.path +import fs.errors + import django from django.conf import settings from django.template import Template, Context @@ -254,14 +256,11 @@ def __eq__(self, other): else: return super.__eq__(other) -def mkdir_p(path): +def mkdir_p(path, file_system): try: - os.makedirs(path) - except OSError as exc: # Python >2.5 - if exc.errno == errno.EEXIST and os.path.isdir(path): - pass - else: - raise + file_system.makedirs(path) + except fs.errors.DirectoryExists: + pass NONCE_CHARS = string.ascii_letters + string.digits NONCE_LENGTH = 18 @@ -288,7 +287,7 @@ def path_from_uuid(uuid_): first_stanza = uuid_1[0:2] second_stanza = uuid_1[2:4] path = (first_stanza, second_stanza, uuid_) - return os.path.join(*path) + return fs.path.join(*path) def path_from_hash(hash_): @@ -306,9 +305,6 @@ def path_from_hash(hash_): return 
os.path.join(*path) -def file_exists(path): - return os.path.exists(path) and os.path.isfile(path) - class RangeHeaderParseError(ValueError): """Exception class representing a string parsing error.""" diff --git a/bin/oneoffs/migrate_storage.py b/bin/oneoffs/migrate_storage.py new file mode 100644 index 000000000..f10cf6b92 --- /dev/null +++ b/bin/oneoffs/migrate_storage.py @@ -0,0 +1,216 @@ +#!/usr/bin/env python +import argparse +import datetime +import logging +import os +import uuid + +import fs.path +import fs.move + +from api import config, util + +log = logging.getLogger('migrate_storage') +log.setLevel(logging.INFO) + +COLLECTIONS_PREFIXES = [('projects', 'files'), + ('acquisitions', 'files'), + ('analyses', 'files'), + ('sessions', 'files'), + ('sessions', 'subject.files'), + ('collections', 'files')] + + +class MigrationError(Exception): + pass + + +def get_files_by_prefix(document, prefix): + for key in prefix.split('.'): + document = document.get(key, {}) + return document + + +def show_progress(current_index, total_files): + if current_index % (total_files / 10 + 1) == 0: + log.info('Processed %s files of total %s files ...' 
% (current_index, total_files)) + + +def cleanup_empty_folders(): + log.info('Cleanup empty folders') + + for _dirpath, _, _ in os.walk(config.get_item('persistent', 'data_path'), topdown=False): + if not (os.listdir(_dirpath) or config.get_item('persistent', 'data_path') == _dirpath): + os.rmdir(_dirpath) + + +def get_collections_files(): + _files = [] + + for collection, prefix in COLLECTIONS_PREFIXES: + cursor = config.db.get_collection(collection).find({}) + for document in cursor: + for f in get_files_by_prefix(document, prefix): + f_dict = { + 'collection_id': document.get('_id'), + 'collection': collection, + 'fileinfo': f, + 'prefix': prefix + } + _files.append(f_dict) + + return _files + + +def get_gears_files(): + cursor = config.db.get_collection('gears').find({}) + _files = [] + + for document in cursor: + if document.get('exchange', {}).get('git-commit', '') == 'local': + f_dict = { + 'gear_id': document['_id'], + 'gear_name': document['gear']['name'], + 'exchange': document['exchange'] + } + _files.append(f_dict) + + return _files + + +def migrate_collections(): + log.info('Migrate collection files...') + + _files = get_collections_files() + + for i, f in enumerate(_files): + try: + file_id = f['fileinfo'].get('_id', '') + if file_id: + file_path = util.path_from_uuid(file_id) + if not config.fs.isfile(file_path): + """Copy file from legacy to new storage""" + + log.debug('copy file between the legacy and new storage: %s' % file_path) + + dst_dir = fs.path.dirname(file_path) + config.fs.makedirs(dst_dir, recreate=True) + fs.move.copy_file(src_fs=config.legacy_fs, src_path=file_path, dst_fs=config.fs, dst_path=file_path) + else: + """generate uuuid, set the id field and copy the file""" + file_id = str(uuid.uuid4()) + f_old_path = util.path_from_hash(f['fileinfo']['hash']) + f_new_path = util.path_from_uuid(file_id) + + log.debug('copy file %s to %s' % (f_old_path, f_new_path)) + + dst_dir = fs.path.dirname(f_new_path) + config.fs.makedirs(dst_dir, 
recreate=True) + fs.move.copy_file(src_fs=config.legacy_fs, src_path=f_old_path, dst_fs=config.fs, dst_path=f_new_path) + + update_set = { + f['prefix'] + '.$.modified': datetime.datetime.utcnow(), + f['prefix'] + '.$._id': file_id + } + log.debug('update file in mongo: %s' % update_set) + # Update the file with the newly generated UUID + config.db[f['collection']].find_one_and_update( + {'_id': f['collection_id'], f['prefix'] + '.name': f['fileinfo']['name']}, + {'$set': update_set} + ) + + show_progress(i + 1, len(_files)) + + except Exception as e: + log.exception(e) + raise MigrationError('Wasn\'t able to migrate the \'%s\' ' + 'file in the \'%s\' collection (collection id: %s)' % + (f['fileinfo']['name'], f['collection'], str(f['collection_id'])), e) + + +def migrate_gears(): + log.info('Migrate gears...') + + _files = get_gears_files() + + for i, f in enumerate(_files): + try: + file_id = f['exchange'].get('rootfs-id', '') + if file_id: + file_path = util.path_from_uuid(file_id) + if not config.fs.isfile(file_path): + """Copy file from legacy to new storage""" + + log.debug('copy file between the legacy and new storage: %s' % file_path) + + dst_dir = fs.path.dirname(file_path) + config.fs.makedirs(dst_dir, recreate=True) + fs.move.copy_file(src_fs=config.legacy_fs, src_path=file_path, dst_fs=config.fs, dst_path=file_path) + else: + file_id = str(uuid.uuid4()) + file_hash = 'v0-' + f['exchange']['rootfs-hash'].replace(':', '-') + f_old_path = util.path_from_hash(file_hash) + f_new_path = util.path_from_uuid(file_id) + + log.debug('copy file %s to %s' % (f_old_path, f_new_path)) + + dst_dir = fs.path.dirname(f_new_path) + config.fs.makedirs(dst_dir, recreate=True) + fs.move.copy_file(src_fs=config.legacy_fs, src_path=f_old_path, dst_fs=config.fs, dst_path=f_new_path) + + update_set = { + 'modified': datetime.datetime.utcnow(), + 'exchange.rootfs-id': file_id + } + + log.debug('update file in mongo: %s' % update_set) + # Update the gear with the newly generated 
UUID + config.db['gears'].find_one_and_update( + {'_id': f['gear_id']}, + {'$set': update_set} + ) + + show_progress(i + 1, len(_files)) + except Exception as e: + log.exception(e) + raise MigrationError('Wasn\'t able to migrate the \'%s\' gear (gear id: %s)' % + (f['gear_name'], f['gear_id']), e) + + +def migrate_storage(): + """ + Remove CAS logic, generate UUID for the files and move the files from the lagacy storage to the new one. + """ + + parser = argparse.ArgumentParser(prog='Migrate storage') + parser.add_argument('--collections', action='store_true', help='Migrate collections') + parser.add_argument('--gears', action='store_true', help='Migrate gears') + parser.add_argument('--delete-files', action='store_true', help='Delete files from legacy storage') + + + args = parser.parse_args() + + try: + if not (args.collections or args.gears): + migrate_collections() + migrate_gears() + + if args.collections: + migrate_collections() + + if args.gears: + migrate_gears() + + if args.delete_files: + config.legacy_fs.removetree('/') + + except MigrationError as e: + log.exception(e) + exit(1) + finally: + cleanup_empty_folders() + pass + + +if __name__ == '__main__': + migrate_storage() diff --git a/bin/oneoffs/remove_cas.py b/bin/oneoffs/remove_cas.py deleted file mode 100755 index 2cbb852d4..000000000 --- a/bin/oneoffs/remove_cas.py +++ /dev/null @@ -1,238 +0,0 @@ -#!/usr/bin/env python -import datetime -import logging -import os -import shutil -import uuid - -from collections import Counter - -from api import config, files, util - -log = logging.getLogger('remove_cas') -log.setLevel(logging.INFO) - - -class MigrationError(Exception): - pass - - -def get_files_by_prefix(document, prefix): - for key in prefix.split('.'): - document = document.get(key, {}) - return document - - -def copy_file(path, target_path): - target_dir = os.path.dirname(target_path) - - if not os.path.exists(target_dir): - os.makedirs(target_dir) - shutil.copy(path, target_path) - - -def 
show_progress(current_index, total_files): - if current_index % (total_files / 10 + 1) == 0: - log.info('Processed %s files of total %s files ...' % (current_index, total_files)) - - -def cleanup_empty_folders(): - log.info('Cleanup empty folders') - - for _dirpath, _, _ in os.walk(config.get_item('persistent', 'data_path'), topdown=False): - if not (os.listdir(_dirpath) or config.get_item('persistent', 'data_path') == _dirpath): - os.rmdir(_dirpath) - - -def get_collections_files_hashes(): - COLLECTIONS_PREFIXES = [('projects', 'files'), - ('acquisitions', 'files'), - ('analyses', 'files'), - ('sessions', 'files'), - ('sessions', 'subject.files'), - ('collections', 'files')] - _hashes = [] - _files = [] - - for collection, prefix in COLLECTIONS_PREFIXES: - cursor = config.db.get_collection(collection).find({}) - for document in cursor: - for f in get_files_by_prefix(document, prefix): - _hashes.append(f.get('hash', '')) - f_dict = { - 'collection_id': document.get('_id'), - 'collection': collection, - 'fileinfo': f, - 'prefix': prefix - } - _files.append(f_dict) - - return _files, _hashes - - -def get_gears_files(): - cursor = config.db.get_collection('gears').find({}) - _files = [] - - for document in cursor: - if document['exchange']['git-commit'] == 'local': - f_dict = { - 'gear_id': document['_id'], - 'gear_name': document['gear']['name'], - 'exchange': document['exchange'] - } - _files.append(f_dict) - - return _files - - -def migrate_collections(): - log.info('Migrate collections...') - - _files, _hashes = get_collections_files_hashes() - counter = Counter(_hashes) - base = config.get_item('persistent', 'data_path') - - for i, f in enumerate(_files): - if f['fileinfo'].get('_id', ''): - counter[f['fileinfo']['hash']] -= 1 - continue - try: - f_uuid = str(uuid.uuid4()) - f_path = os.path.join(base, util.path_from_hash(f['fileinfo']['hash'])) - log.debug('copy file %s to %s' % (f_path, util.path_from_uuid(f_uuid))) - copy_file(f_path, 
files.get_file_abs_path(f_uuid)) - - update_set = { - f['prefix'] + '.$.modified': datetime.datetime.utcnow(), - f['prefix'] + '.$._id': f_uuid - } - log.debug('update file in mongo: %s' % update_set) - # Update the file with the newly generated UUID - config.db[f['collection']].find_one_and_update( - {'_id': f['collection_id'], f['prefix'] + '.name': f['fileinfo']['name']}, - {'$set': update_set} - ) - - # Decrease the count of the current hash, so we will know when we can remove the original file - counter[f['fileinfo']['hash']] -= 1 - - if counter[f['fileinfo']['hash']] == 0: - log.debug('remove old file: %s' % f_path) - os.remove(f_path) - - show_progress(i+1, len(_files)) - except Exception as e: - log.exception(e) - raise MigrationError('Wasn\'t able to migrate the \'%s\' ' - 'file in the \'%s\' collection (collection id: %s)' % - (f['fileinfo']['name'], f['collection'], str(f['collection_id'])), e) - - -def rollback_collections(): - log.info('Rollback collections...') - - _files, _ = get_collections_files_hashes() - base = config.get_item('persistent', 'data_path') - - for i, f in enumerate(_files): - if f['fileinfo'].get('_id', ''): - hash_path = os.path.join(base, util.path_from_hash(f['fileinfo']['hash'])) - uuid_path = files.get_file_abs_path(f['fileinfo']['_id']) - if os.path.exists(hash_path) and os.path.exists(uuid_path): - os.remove(uuid_path) - elif os.path.exists(uuid_path): - copy_file(uuid_path, hash_path) - os.remove(uuid_path) - - config.db[f['collection']].find_one_and_update( - {'_id': f['collection_id'], f['prefix'] + '.name': f['fileinfo']['name']}, - {'$unset': {f['prefix'] + '.$._id': ''}} - ) - - show_progress(i + 1, len(_files)) - - -def migrate_gears(): - log.info('Migrate gears...') - - _files = get_gears_files() - base = config.get_item('persistent', 'data_path') - - for i, f in enumerate(_files): - if f['exchange'].get('rootfs-id', ''): - continue - try: - f_uuid = str(uuid.uuid4()) - f_hash = 'v0-' + 
f['exchange']['rootfs-hash'].replace(':', '-') - f_path = os.path.join(base, util.path_from_hash(f_hash)) - log.debug('copy file %s to %s' % (f_path, util.path_from_uuid(f_uuid))) - copy_file(f_path, files.get_file_abs_path(f_uuid)) - - update_set = { - 'modified': datetime.datetime.utcnow(), - 'exchange.rootfs-id': f_uuid - } - - log.debug('update file in mongo: %s' % update_set) - # Update the gear with the newly generated UUID - config.db['gears'].find_one_and_update( - {'_id': f['gear_id']}, - {'$set': update_set} - ) - - log.debug('remove old file: %s' % f_path) - os.remove(f_path) - - show_progress(i + 1, len(_files)) - except Exception as e: - log.exception(e) - raise MigrationError('Wasn\'t able to migrate the \'%s\' gear (gear id: %s)' % - (f['gear_name'], f['gear_id']), e) - - -def rollback_gears(): - log.info('Rollback gears...') - - _files = get_gears_files() - base = config.get_item('persistent', 'data_path') - - for i, f in enumerate(_files): - if f['exchange'].get('rootfs-id', ''): - f_hash = 'v0-' + f['exchange']['rootfs-hash'].replace(':', '-') - hash_path = os.path.join(base, util.path_from_hash(f_hash)) - uuid_path = files.get_file_abs_path(f['exchange']['rootfs-id']) - if os.path.exists(hash_path) and os.path.exists(uuid_path): - os.remove(uuid_path) - elif os.path.exists(uuid_path): - copy_file(uuid_path, hash_path) - os.remove(uuid_path) - - config.db['gears'].find_one_and_update( - {'_id': f['gear_id']}, - {'$unset': {'exchange.rootfs-id': ''}} - ) - - show_progress(i + 1, len(_files)) - - -def remove_cas(): - """ - Remove CAS logic, generate UUID for the files and rename them on the filesystem, make a copy of the file if more - than one container using the same hash. 
- """ - - try: - migrate_collections() - migrate_gears() - except MigrationError as e: - log.exception(e) - rollback_collections() - rollback_gears() - exit(1) - finally: - cleanup_empty_folders() - - -if __name__ == '__main__': - remove_cas() diff --git a/requirements.txt b/requirements.txt index e544505b0..61f6a1f92 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,7 @@ backports.tempfile==1.0 django>=1.11.0,<1.12.0 elasticsearch==5.3.0 enum==0.4.6 +fs==2.0.16 jsonschema==2.6.0 Markdown==2.6.5 pymongo==3.2 diff --git a/tests/bin/run-tests-ubuntu.sh b/tests/bin/run-tests-ubuntu.sh index 443fa304f..b2ee2e140 100755 --- a/tests/bin/run-tests-ubuntu.sh +++ b/tests/bin/run-tests-ubuntu.sh @@ -78,6 +78,8 @@ function main() { export SCITRAN_PERSISTENT_DB_LOG_URI=${SCITRAN_PERSISTENT_DB_LOG_URI:-"mongodb://localhost:$SCITRAN_PERSISTENT_DB_PORT/logs"} export SCITRAN_PERSISTENT_PATH=`mktemp -d` export SCITRAN_PERSISTENT_DATA_PATH="$SCITRAN_PERSISTENT_PATH/data" + mkdir -p "$SCITRAN_PERSISTENT_DATA_PATH/v1" + export SCITRAN_PERSISTENT_FS_URL="osfs://$SCITRAN_PERSISTENT_PATH/data/v1" export SCITRAN_CORE_DRONE_SECRET=${SCITRAN_CORE_DRONE_SECRET:-$( openssl rand -base64 32 )} if ${RUN_LINT}; then diff --git a/tests/integration_tests/python/conftest.py b/tests/integration_tests/python/conftest.py index 80fb294fe..e98ec608d 100644 --- a/tests/integration_tests/python/conftest.py +++ b/tests/integration_tests/python/conftest.py @@ -4,15 +4,16 @@ import json import logging import os -import shutil import attrdict import bson +import fs.move +import fs.path import pymongo import pytest import requests -from api import files, util +from api import config, files, util # load required envvars w/ the same name SCITRAN_CORE_DRONE_SECRET = os.environ['SCITRAN_CORE_DRONE_SECRET'] @@ -214,29 +215,31 @@ def legacy_cas_file(as_admin, api_db, data_builder, randstr, file_form): """Yield legacy CAS file""" project = data_builder.create_project() file_name = '%s.csv' % randstr() - 
as_admin.post('/projects/' + project + '/files', files=file_form(file_name)) + file_content = randstr() + as_admin.post('/projects/' + project + '/files', files=file_form((file_name, file_content))) - file_id = api_db['projects'].find_one( + file_info = api_db['projects'].find_one( {'files.name': file_name} - )['files'][0]['_id'] + )['files'][0] + file_id = file_info['_id'] + file_hash = file_info['hash'] # verify cas backward compatibility api_db['projects'].find_one_and_update( {'files.name': file_name}, {'$unset': {'files.$._id': ''}} ) - data_path = os.getenv('SCITRAN_PERSISTENT_DATA_PATH') - file_hash = 'v0-sha384-03a9df0a5e6e21f5d25aacbce76d8a5d9f8de14f6654c31ab2daed961cfbfb236b1708063350856f752a5a094fb64321' - file_path = os.path.join(data_path, util.path_from_hash(file_hash)) - target_dir = os.path.dirname(file_path) - if not os.path.exists(target_dir): - os.makedirs(target_dir) - shutil.move(files.get_file_abs_path(file_id), file_path) - yield (project, file_name) + file_path = unicode(util.path_from_hash(file_hash)) + target_dir = fs.path.dirname(file_path) + if not config.legacy_fs.exists(target_dir): + config.legacy_fs.makedirs(target_dir) + fs.move.move_file(src_fs=config.fs, src_path=util.path_from_uuid(file_id), dst_fs=config.legacy_fs, dst_path=file_path) + + yield (project, file_name, file_content) # clean up - os.remove(file_path) - os.removedirs(target_dir) + config.legacy_fs.remove(file_path) + config.legacy_fs.removetree(target_dir) api_db['projects'].delete_one({'_id': project}) class BaseUrlSession(requests.Session): diff --git a/tests/integration_tests/python/test_download.py b/tests/integration_tests/python/test_download.py index 025cc1b80..eaa46bad3 100644 --- a/tests/integration_tests/python/test_download.py +++ b/tests/integration_tests/python/test_download.py @@ -4,7 +4,7 @@ import zipfile -def test_download(data_builder, file_form, as_admin, api_db, legacy_cas_file): +def test_download_k(data_builder, file_form, as_admin, api_db, 
legacy_cas_file): project = data_builder.create_project(label='project1') session = data_builder.create_session(label='session1', project=project) session2 = data_builder.create_session(label='session1', project=project) @@ -152,7 +152,7 @@ def test_download(data_builder, file_form, as_admin, api_db, legacy_cas_file): assert r.ok # test legacy cas file handling - (project_legacy, file_name_legacy) = legacy_cas_file + (project_legacy, file_name_legacy, file_content) = legacy_cas_file r = as_admin.post('/download', json={ 'optional': False, 'nodes': [ @@ -233,14 +233,15 @@ def test_filelist_download(data_builder, file_form, as_admin, legacy_cas_file): assert r.ok # test legacy cas file handling - (project, file_name) = legacy_cas_file + (project, file_name, file_content) = legacy_cas_file r = as_admin.get('/projects/' + project + '/files/' + file_name, params={'ticket': ''}) assert r.ok ticket = r.json()['ticket'] r = as_admin.get('/projects/' + project + '/files/' + file_name, params={'ticket': ticket}) - assert r.content == 'test\ndata\n' + assert r.ok + assert r.content == file_content def test_filelist_range_download(data_builder, as_admin, file_form): diff --git a/tests/integration_tests/python/test_jobs.py b/tests/integration_tests/python/test_jobs.py index 220b21140..1e41105cf 100644 --- a/tests/integration_tests/python/test_jobs.py +++ b/tests/integration_tests/python/test_jobs.py @@ -284,6 +284,8 @@ def test_jobs(data_builder, default_payload, as_public, as_user, as_admin, as_ro r = as_admin.post('/jobs/add', json=job6) assert r.status_code == 500 + assert as_root.delete('/gears/' + gear3).ok + # Attempt to set a malformed file reference as input job7 = copy.deepcopy(job_data) job7['inputs'] = { diff --git a/tests/integration_tests/python/test_migrate_storage.py b/tests/integration_tests/python/test_migrate_storage.py new file mode 100644 index 000000000..79900d7e3 --- /dev/null +++ b/tests/integration_tests/python/test_migrate_storage.py @@ -0,0 +1,274 @@ 
+import os +import sys + +import fs.move +import fs.path +import pytest + +from api import config, util +from bson.objectid import ObjectId + + +def move_file_to_legacy(src, dst): + target_dir = fs.path.dirname(dst) + if not config.legacy_fs.exists(target_dir): + config.legacy_fs.makedirs(target_dir) + fs.move.move_file(src_fs=config.fs, src_path=src, + dst_fs=config.legacy_fs, dst_path=dst) + +@pytest.fixture(scope='function') +def migrate_storage(mocker): + """Enable importing from `bin` and return `undelete.undelete`.""" + bin_path = os.path.join(os.getcwd(), 'bin', 'oneoffs') + mocker.patch('sys.path', [bin_path] + sys.path) + import migrate_storage + return migrate_storage + + +@pytest.yield_fixture(scope='function') +def gears_to_migrate(api_db, as_admin, randstr, file_form): + def gen_gear_meta(gear_name): + return {'gear': { + "version": '0.0.1', + "config": {}, + "name": gear_name, + "inputs": { + "file": { + "base": "file", + "description": "Any image." + } + }, + "maintainer": "Test", + "description": "Test", + "license": "Other", + "author": "Test", + "url": "http://example.example", + "label": "Test Gear", + "flywheel": "0", + "source": "http://example.example" + }} + + gears = [] + + gear_name_1 = randstr() + + file_name = '%s.tar.gz' % randstr() + file_content = randstr() + r = as_admin.post('/gears/temp', files=file_form((file_name, file_content), meta=gen_gear_meta(gear_name_1))) + gear_id_1 = r.json()['_id'] + + r = as_admin.get('/gears/' + gear_id_1) + gear_json_1 = r.json() + + file_hash__1 = 'v0-' + gear_json_1['exchange']['rootfs-hash'].replace(':', '-') + file_id_1 = gear_json_1['exchange']['rootfs-id'] + + file_path = unicode(util.path_from_hash(file_hash__1)) + target_dir = fs.path.dirname(file_path) + if not config.legacy_fs.exists(target_dir): + config.legacy_fs.makedirs(target_dir) + fs.move.move_file(src_fs=config.fs, src_path=util.path_from_uuid(file_id_1), + dst_fs=config.legacy_fs, dst_path=file_path) + + 
api_db['gears'].find_one_and_update( + {'_id': ObjectId(gear_id_1)}, + {'$unset': {'exchange.rootfs-id': ''}}) + + gears.append((gear_id_1, file_path)) + + gear_name_2 = randstr() + file_name = '%s.tar.gz' % randstr() + file_content = randstr() + r = as_admin.post('/gears/temp', files=file_form((file_name, file_content), meta=gen_gear_meta(gear_name_2))) + gear_id_2 = r.json()['_id'] + + + r = as_admin.get('/gears/' + gear_id_2) + gear_json_2 = r.json() + + file_id_2 = gear_json_2['exchange']['rootfs-id'] + + file_path = unicode(util.path_from_uuid(file_id_2)) + target_dir = fs.path.dirname(file_path) + if not config.legacy_fs.exists(target_dir): + config.legacy_fs.makedirs(target_dir) + fs.move.move_file(src_fs=config.fs, src_path=file_path, + dst_fs=config.legacy_fs, dst_path=file_path) + + gears.append((gear_id_2, file_path)) + + yield gears + + # clean up + gear_json_1 = api_db['gears'].find_one({'_id': ObjectId(gear_id_1)}) + gear_json_2 = api_db['gears'].find_one({'_id': ObjectId(gear_id_2)}) + files_to_delete = [] + files_to_delete.append(util.path_from_uuid(gear_json_1['exchange'].get('rootfs-id', ''))) + files_to_delete.append(util.path_from_uuid(gear_json_1['exchange'].get('rootfs-hash', ''))) + files_to_delete.append(util.path_from_uuid(gear_json_2['exchange'].get('rootfs-id', ''))) + + for f_path in files_to_delete: + try: + config.fs.remove(f_path) + except: + pass + + api_db['gears'].delete_one({'_id': ObjectId(gear_id_1)}) + api_db['gears'].delete_one({'_id': ObjectId(gear_id_2)}) + +@pytest.yield_fixture(scope='function') +def files_to_migrate(data_builder, api_db, as_admin, randstr, file_form): + # Create a project + project_id = data_builder.create_project() + + files = [] + + # Create a CAS file + file_name_1 = '%s.csv' % randstr() + file_content_1 = randstr() + as_admin.post('/projects/' + project_id + '/files', files=file_form((file_name_1, file_content_1))) + + file_info = api_db['projects'].find_one( + {'files.name': file_name_1} + 
)['files'][0] + file_id_1 = file_info['_id'] + file_hash_1 = file_info['hash'] + url_1 = '/projects/' + project_id + '/files/' + file_name_1 + + api_db['projects'].find_one_and_update( + {'files.name': file_name_1}, + {'$unset': {'files.$._id': ''}} + ) + + move_file_to_legacy(util.path_from_uuid(file_id_1), util.path_from_hash(file_hash_1)) + files.append((url_1, util.path_from_hash(file_hash_1))) + # Create an UUID file + file_name_2 = '%s.csv' % randstr() + file_content_2 = randstr() + as_admin.post('/projects/' + project_id + '/files', files=file_form((file_name_2, file_content_2))) + + file_info = api_db['projects'].find_one( + {'files.name': file_name_2} + )['files'][1] + file_id_2 = file_info['_id'] + url_2 = '/projects/' + project_id + '/files/' + file_name_2 + + move_file_to_legacy(util.path_from_uuid(file_id_2), util.path_from_uuid(file_id_2)) + files.append((url_2, util.path_from_uuid(file_id_2))) + + yield files + + # Clean up, get the files + files = api_db['projects'].find_one( + {'_id': ObjectId(project_id)} + )['files'] + # Delete the files + for f in files: + try: + config.fs.remove(util.path_from_uuid(f['_id'])) + except: + pass + +def test_migrate_collections(files_to_migrate, as_admin, migrate_storage): + """Testing collection migration""" + + # get file storing by hash in legacy storage + (url_1, file_path_1) = files_to_migrate[0] + # get ile storing by uuid in legacy storage + (url_2, file_path_2) = files_to_migrate[1] + + # get the ticket + r = as_admin.get(url_1, params={'ticket': ''}) + assert r.ok + ticket = r.json()['ticket'] + + # download the file + assert as_admin.get(url_1, params={'ticket': ticket}).ok + + # get the ticket + r = as_admin.get(url_2, params={'ticket': ''}) + assert r.ok + ticket = r.json()['ticket'] + + # download the file + assert as_admin.get(url_2, params={'ticket': ticket}).ok + # run the migration + migrate_storage.migrate_collections() + + # delete files from the legacy storage + 
config.legacy_fs.remove(file_path_1) + config.legacy_fs.remove(file_path_2) + + # get the files from the new filesystem + # get the ticket + r = as_admin.get(url_1, params={'ticket': ''}) + assert r.ok + ticket = r.json()['ticket'] + + # download the file + assert as_admin.get(url_1, params={'ticket': ticket}).ok + + # get the ticket + r = as_admin.get(url_2, params={'ticket': ''}) + assert r.ok + ticket = r.json()['ticket'] + + # download the file + assert as_admin.get(url_2, params={'ticket': ticket}).ok + +def test_migrate_collections_error(files_to_migrate, migrate_storage): + """Testing that the migration script throws an exception if it couldn't migrate a file""" + + # get file storing by hash in legacy storage + (url, file_path_1) = files_to_migrate[0] + # get the other file, so we can clean up + (_, file_path_2) = files_to_migrate[1] + + # delete the file + config.legacy_fs.remove(file_path_1) + + with pytest.raises(migrate_storage.MigrationError): + migrate_storage.migrate_collections() + + # clean up + config.legacy_fs.remove(file_path_2) + + +def test_migrate_gears(gears_to_migrate, as_admin, migrate_storage): + """Testing collection migration""" + + (gear_id_1, gear_file_path_1) = gears_to_migrate[0] + (gear_id_2, gear_file_path_2) = gears_to_migrate[1] + + # get gears before migration + assert as_admin.get('/gears/temp/' + gear_id_1).ok + assert as_admin.get('/gears/temp/' + gear_id_2).ok + + # run migration + migrate_storage.migrate_gears() + + # delete files from the legacy storage + config.legacy_fs.remove(gear_file_path_1) + config.legacy_fs.remove(gear_file_path_2) + + # get the files from the new filesystem + assert as_admin.get('/gears/temp/' + gear_id_1).ok + assert as_admin.get('/gears/temp/' + gear_id_2).ok + + +def test_migrate_gears_error(gears_to_migrate, migrate_storage): + """Testing that the migration script throws an exception if it couldn't migrate a file""" + + # get file storing by hash in legacy storage + (gear_id, 
gear_file_path_1) = gears_to_migrate[0] + # get the other file, so we can clean up + (_, gear_file_path_2) = gears_to_migrate[1] + + # delete the file + config.legacy_fs.remove(gear_file_path_1) + + with pytest.raises(migrate_storage.MigrationError): + migrate_storage.migrate_gears() + + # clean up + config.legacy_fs.remove(gear_file_path_2) From 128542b74d12dd3c4a5542425224a0f95221bab8 Mon Sep 17 00:00:00 2001 From: David Farkas Date: Thu, 15 Feb 2018 15:12:01 +0100 Subject: [PATCH 04/12] Add requested changes, imporve singed url handling --- api/config.py | 2 -- api/files.py | 11 ++++------- api/resolver.py | 2 +- api/util.py | 4 ++-- 4 files changed, 7 insertions(+), 12 deletions(-) diff --git a/api/config.py b/api/config.py index 3fe15c5ac..a4a2afe1e 100644 --- a/api/config.py +++ b/api/config.py @@ -328,7 +328,5 @@ def get_auth(auth_type): # Storage configuration fs = open_fs(__config['persistent']['fs_url']) -signed_url_available = ['GCSFS'] - legacy_fs = open_fs('osfs://' + __config['persistent']['data_path']) support_legacy_fs = True diff --git a/api/files.py b/api/files.py index 963be4950..7d3de336b 100644 --- a/api/files.py +++ b/api/files.py @@ -186,18 +186,15 @@ def get_valid_file(file_info): def get_signed_url(file_path, file_system, filename=None): try: - if type(file_system).__name__ in config.signed_url_available: - return file_system.geturl(file_path, filename=filename) + if hasattr(config.fs, 'get_signed_url'): + return file_system.get_signed_url(file_path, filename=filename) except fs.errors.NoURL: return None def get_fs_by_file_path(file_path): - if config.support_legacy_fs: - if config.legacy_fs.isfile(file_path): - return config.legacy_fs - elif config.legacy_fs.isfile(file_path): - return config.legacy_fs + if config.support_legacy_fs and config.legacy_fs.isfile(file_path): + return config.legacy_fs if config.fs.isfile(file_path): return config.fs diff --git a/api/resolver.py b/api/resolver.py index 7eca86f9c..0b110ea1c 100644 --- 
a/api/resolver.py +++ b/api/resolver.py @@ -1,4 +1,4 @@ -"""1 +""" Resolve an ambiguous path through the data hierarchy. """ diff --git a/api/util.py b/api/util.py index edacef534..87d1a5f7e 100644 --- a/api/util.py +++ b/api/util.py @@ -279,9 +279,9 @@ def path_from_uuid(uuid_): """ create a filepath from a UUID e.g. - hash_ = v0-sha384-01b395a1cbc0f218 + uuid_ = cbb33a87-6754-4dfd-abd3-7466d4463ebc will return - v0/sha384/01/b3/v0-sha384-01b395a1cbc0f218 + cb/b3/cbb33a87-6754-4dfd-abd3-7466d4463ebc """ uuid_1 = uuid_.split('-')[0] first_stanza = uuid_1[0:2] From ea5a89b6d78f9a400cc18754a6e3c62318b9b1d5 Mon Sep 17 00:00:00 2001 From: Megan Henning Date: Thu, 15 Feb 2018 14:16:09 -0600 Subject: [PATCH 05/12] Check file's storage system rather than default --- api/files.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/files.py b/api/files.py index 7d3de336b..1b7abc4f4 100644 --- a/api/files.py +++ b/api/files.py @@ -186,7 +186,7 @@ def get_valid_file(file_info): def get_signed_url(file_path, file_system, filename=None): try: - if hasattr(config.fs, 'get_signed_url'): + if hasattr(file_system, 'get_signed_url'): return file_system.get_signed_url(file_path, filename=filename) except fs.errors.NoURL: return None From b3ac92042464fb4e5780b8acbabc0e33d9d4f56b Mon Sep 17 00:00:00 2001 From: Ryan Sanford Date: Thu, 15 Feb 2018 14:36:18 -0600 Subject: [PATCH 06/12] Trigger docker image builds on branch `generic-storage` --- .travis.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index a3ce03413..1842e89ca 100644 --- a/.travis.yml +++ b/.travis.yml @@ -39,7 +39,10 @@ after_success: - if [ "$TRAVIS_EVENT_TYPE" == "push" -a "$TRAVIS_BRANCH" == "master" ]; then ./docker/build-trigger.sh Branch "$TRAVIS_BRANCH" "$BUILD_TRIGGER_URL"; fi + - if [ "$TRAVIS_EVENT_TYPE" == "push" -a "$TRAVIS_BRANCH" == "generic-storage" ]; then + ./docker/build-trigger.sh Branch "$TRAVIS_BRANCH" "$BUILD_TRIGGER_URL"; + fi - 
if [ "$TRAVIS_EVENT_TYPE" == "push" -a -z "$TRAVIS_TAG" ]; then ./bin/push-docs.sh "$GIT_REMOTE" branches "$TRAVIS_BRANCH" "Travis Core Docs Build - ${TRAVIS_BUILD_NUMBER}"; fi - + From 465237bf22341340fe4a47b34e9ecf97ddfaf8e3 Mon Sep 17 00:00:00 2001 From: David Farkas Date: Tue, 20 Feb 2018 11:37:12 +0100 Subject: [PATCH 07/12] Use the same type of filsystem as temp fs too --- api/files.py | 18 +++++++++++++----- api/upload.py | 3 ++- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/api/files.py b/api/files.py index 1b7abc4f4..d781f86ce 100644 --- a/api/files.py +++ b/api/files.py @@ -3,9 +3,10 @@ import json import six import hashlib +import uuid import fs.move -import fs.tempfs +import fs.subfs import fs.path import fs.errors @@ -16,7 +17,8 @@ class FileProcessor(object): def __init__(self, base, presistent_fs): self.base = base - self._temp_fs = fs.tempfs.TempFS(identifier='.temp', temp_dir=self.base) + self._tempdir_name = str(uuid.uuid4()) + self._temp_fs = fs.subfs.SubFS(presistent_fs, fs.path.join('tmp', self._tempdir_name)) self._presistent_fs = presistent_fs def store_temp_file(self, src_path, dest_path): @@ -26,7 +28,7 @@ def store_temp_file(self, src_path, dest_path): dest_path = six.u(dest_path) dst_dir = fs.path.dirname(dest_path) self._presistent_fs.makedirs(dst_dir, recreate=True) - fs.move.move_file(src_fs=self.temp_fs, src_path=src_path, dst_fs=self._presistent_fs, dst_path=dest_path) + self._presistent_fs.move(src_path=fs.path.join('tmp', self._tempdir_name, src_path), dst_path=dest_path) def process_form(self, request): """ @@ -97,7 +99,9 @@ def __del__(self): self.close() def close(self): - self.temp_fs.close() + # Cleaning up + self._presistent_fs.listdir(fs.path.join('tmp', self._tempdir_name)) + self._presistent_fs.removetree(fs.path.join('tmp', self._tempdir_name)) def get_single_file_field_storage(file_system): @@ -118,6 +122,7 @@ def make_file(self, binary=None): # Sanitize form's filename (read: prevent malicious escapes, 
bad characters, etc) self.filename = fs.path.basename(self.filename) + self.hasher = hashlib.new(DEFAULT_HASH_ALG) if not isinstance(self.filename, unicode): self.filename = six.u(self.filename) self.open_file = file_system.open(self.filename, 'wb') @@ -131,10 +136,13 @@ def _FieldStorage__write(self, line): # Always write fields of type "file" to disk for consistent renaming behavior if self.filename: self.file = self.make_file('') - self.file.write(self._FieldStorage__file.getvalue()) + self.hasher.update(self._FieldStorage__file.getvalue()) self._FieldStorage__file = None + self.file.write(line) + if hasattr(self, 'hasher'): + self.hasher.update(line) return SingleFileFieldStorage diff --git a/api/upload.py b/api/upload.py index d0a3bd6ff..90c753141 100644 --- a/api/upload.py +++ b/api/upload.py @@ -100,6 +100,7 @@ def process_upload(request, strategy, container_type=None, id_=None, origin=None raise FileFormException("Targeted uploads can only send one file") for field in file_fields: + field.file.close() # Augment the cgi.FieldStorage with a variety of custom fields. # Not the best practice. Open to improvements. # These are presumbed to be required by every function later called with field as a parameter. 
@@ -111,7 +112,7 @@ def process_upload(request, strategy, container_type=None, id_=None, origin=None file_processor.temp_fs.listdir('/'), )) field.size = file_processor.temp_fs.getsize(field.path) - field.hash = file_processor.hash_file_formatted(field.path, file_processor.temp_fs) + field.hash = util.format_hash(files.DEFAULT_HASH_ALG, field.hasher.hexdigest()) field.uuid = str(uuid.uuid4()) field.mimetype = util.guess_mimetype(field.filename) # TODO: does not honor metadata's mime type if any field.modified = timestamp From 011832326a9c09fec1ee30d903d7f1f4ae8f448c Mon Sep 17 00:00:00 2001 From: David Farkas Date: Wed, 21 Feb 2018 16:01:59 +0100 Subject: [PATCH 08/12] Silenc urllib3 library log messages --- api/config.py | 2 +- api/files.py | 3 ++- api/handlers/listhandler.py | 1 + raml/schemas/definitions/analysis.json | 0 raml/schemas/definitions/file.json | 0 swagger/schemas/definitions/file.json | 2 ++ tests/integration_tests/python/test_resolver.py | 4 ++-- 7 files changed, 8 insertions(+), 4 deletions(-) delete mode 100644 raml/schemas/definitions/analysis.json delete mode 100644 raml/schemas/definitions/file.json diff --git a/api/config.py b/api/config.py index a4a2afe1e..f1cc31d55 100644 --- a/api/config.py +++ b/api/config.py @@ -23,7 +23,7 @@ logging.getLogger('requests').setLevel(logging.WARNING) # silence Requests library logging.getLogger('paste.httpserver').setLevel(logging.WARNING) # silence Paste library logging.getLogger('elasticsearch').setLevel(logging.WARNING) # silence Elastic library - +logging.getLogger('urllib3').setLevel(logging.WARNING) # silence urllib3 library # NOTE: Keep in sync with environment variables in sample.config file. 
DEFAULT_CONFIG = { diff --git a/api/files.py b/api/files.py index d781f86ce..5e3da5a34 100644 --- a/api/files.py +++ b/api/files.py @@ -18,8 +18,9 @@ class FileProcessor(object): def __init__(self, base, presistent_fs): self.base = base self._tempdir_name = str(uuid.uuid4()) - self._temp_fs = fs.subfs.SubFS(presistent_fs, fs.path.join('tmp', self._tempdir_name)) self._presistent_fs = presistent_fs + self._presistent_fs.makedirs(fs.path.join('tmp', self._tempdir_name), recreate=True) + self._temp_fs = fs.subfs.SubFS(presistent_fs, fs.path.join('tmp', self._tempdir_name)) def store_temp_file(self, src_path, dest_path): if not isinstance(src_path, unicode): diff --git a/api/handlers/listhandler.py b/api/handlers/listhandler.py index fcab5e3ae..14501a8dd 100644 --- a/api/handlers/listhandler.py +++ b/api/handlers/listhandler.py @@ -1,3 +1,4 @@ +import os import bson import copy import datetime diff --git a/raml/schemas/definitions/analysis.json b/raml/schemas/definitions/analysis.json deleted file mode 100644 index e69de29bb..000000000 diff --git a/raml/schemas/definitions/file.json b/raml/schemas/definitions/file.json deleted file mode 100644 index e69de29bb..000000000 diff --git a/swagger/schemas/definitions/file.json b/swagger/schemas/definitions/file.json index f6cc81590..2dbe43b14 100644 --- a/swagger/schemas/definitions/file.json +++ b/swagger/schemas/definitions/file.json @@ -1,6 +1,7 @@ { "$schema": "http://json-schema.org/draft-04/schema#", "definitions":{ + "_id": { "type": "string" }, "name": { "type": "string" }, "file-type": { "type": "string" }, "mimetype": { "type": "string" }, @@ -44,6 +45,7 @@ "file": { "type": "object", "properties": { + "_id": {"$ref":"#/definitions/_id"}, "name": {"$ref":"#/definitions/name"}, "type": {"$ref":"#/definitions/file-type"}, "mimetype": {"$ref":"#/definitions/mimetype"}, diff --git a/tests/integration_tests/python/test_resolver.py b/tests/integration_tests/python/test_resolver.py index 8a32629ee..b772a03d0 100644 --- 
a/tests/integration_tests/python/test_resolver.py +++ b/tests/integration_tests/python/test_resolver.py @@ -177,10 +177,10 @@ def idz(s): return '' # resolve root/group/project/session/acquisition/file with id - r = as_admin.post('/resolve', json={'path': [idz(group), idz(project), idz(session), idz(acquisition), acquisition_file]}) + r = as_admin.post('/resolve', json={'path': [idz(group), idz(project), idz(session), idz(acquisition), idz(acquisition_file)]}) result = r.json() assert r.ok - assert path_in_result([group, project, session, acquisition, acquisition_file], result) + assert path_in_result([group, project, session, acquisition, acquisition_file_id], result) assert result['children'] == [] # try to resolve non-existent root/group/project/session/acquisition/child From df6eeebb81fef36ab596f3a64477f8558db5aebd Mon Sep 17 00:00:00 2001 From: David Farkas Date: Fri, 23 Feb 2018 13:11:34 +0100 Subject: [PATCH 09/12] Handle file replace during the migration and update pytest packages --- api/jobs/handlers.py | 2 +- bin/oneoffs/migrate_storage.py | 10 +++- .../python/test_migrate_storage.py | 50 ++++++++++++++++--- .../integration_tests/python/test_undelete.py | 2 +- .../requirements-integration-test.txt | 8 +-- 5 files changed, 58 insertions(+), 14 deletions(-) diff --git a/api/jobs/handlers.py b/api/jobs/handlers.py index b3e919e2c..60c1f535f 100644 --- a/api/jobs/handlers.py +++ b/api/jobs/handlers.py @@ -93,7 +93,7 @@ def download(self, **kwargs): # pragma: no cover 'hash': 'v0-' + gear['exchange']['rootfs-hash'].replace(':', '-') }) - stream = open(file_path, 'rb') + stream = file_system.open(file_path, 'rb') set_for_download(self.response, stream=stream, filename='gear.tar') @require_admin diff --git a/bin/oneoffs/migrate_storage.py b/bin/oneoffs/migrate_storage.py index f10cf6b92..278fc48f4 100644 --- a/bin/oneoffs/migrate_storage.py +++ b/bin/oneoffs/migrate_storage.py @@ -114,11 +114,17 @@ def migrate_collections(): } log.debug('update file in mongo: 
%s' % update_set) # Update the file with the newly generated UUID - config.db[f['collection']].find_one_and_update( - {'_id': f['collection_id'], f['prefix'] + '.name': f['fileinfo']['name']}, + updated_doc = config.db[f['collection']].find_one_and_update( + {'_id': f['collection_id'], f['prefix'] + '.name': f['fileinfo']['name'], + f['prefix'] + '.hash': f['fileinfo']['hash']}, {'$set': update_set} ) + if not updated_doc: + log.info('Probably the following file has been updated during the migration ' + 'and its hash is changed, cleaning up from the new filesystem') + config.fs.remove(f_new_path) + show_progress(i + 1, len(_files)) except Exception as e: diff --git a/tests/integration_tests/python/test_migrate_storage.py b/tests/integration_tests/python/test_migrate_storage.py index 79900d7e3..764d932bf 100644 --- a/tests/integration_tests/python/test_migrate_storage.py +++ b/tests/integration_tests/python/test_migrate_storage.py @@ -4,6 +4,7 @@ import fs.move import fs.path import pytest +import pymongo from api import config, util from bson.objectid import ObjectId @@ -141,7 +142,7 @@ def files_to_migrate(data_builder, api_db, as_admin, randstr, file_form): ) move_file_to_legacy(util.path_from_uuid(file_id_1), util.path_from_hash(file_hash_1)) - files.append((url_1, util.path_from_hash(file_hash_1))) + files.append((project_id, file_name_1, url_1, util.path_from_hash(file_hash_1))) # Create an UUID file file_name_2 = '%s.csv' % randstr() file_content_2 = randstr() @@ -154,7 +155,7 @@ def files_to_migrate(data_builder, api_db, as_admin, randstr, file_form): url_2 = '/projects/' + project_id + '/files/' + file_name_2 move_file_to_legacy(util.path_from_uuid(file_id_2), util.path_from_uuid(file_id_2)) - files.append((url_2, util.path_from_uuid(file_id_2))) + files.append((project_id, file_name_2, url_2, util.path_from_uuid(file_id_2))) yield files @@ -173,9 +174,9 @@ def test_migrate_collections(files_to_migrate, as_admin, migrate_storage): """Testing collection 
migration""" # get file storing by hash in legacy storage - (url_1, file_path_1) = files_to_migrate[0] + (_, _, url_1, file_path_1) = files_to_migrate[0] # get ile storing by uuid in legacy storage - (url_2, file_path_2) = files_to_migrate[1] + (_, _,url_2, file_path_2) = files_to_migrate[1] # get the ticket r = as_admin.get(url_1, params={'ticket': ''}) @@ -220,9 +221,9 @@ def test_migrate_collections_error(files_to_migrate, migrate_storage): """Testing that the migration script throws an exception if it couldn't migrate a file""" # get file storing by hash in legacy storage - (url, file_path_1) = files_to_migrate[0] + (_, _, url, file_path_1) = files_to_migrate[0] # get the other file, so we can clean up - (_, file_path_2) = files_to_migrate[1] + (_, _, _, file_path_2) = files_to_migrate[1] # delete the file config.legacy_fs.remove(file_path_1) @@ -272,3 +273,40 @@ def test_migrate_gears_error(gears_to_migrate, migrate_storage): # clean up config.legacy_fs.remove(gear_file_path_2) + + +def test_file_replaced_handling(files_to_migrate, migrate_storage, as_admin, file_form, api_db, mocker, caplog): + + origin_find_one_and_update = pymongo.collection.Collection.find_one_and_update + + def mocked(*args, **kwargs): + self = args[0] + filter = args[1] + update = args[2] + + as_admin.post('/projects/' + project_id + '/files', files=file_form((file_name_1, 'new_content'))) + + return origin_find_one_and_update(self, filter, update) + + + with mocker.mock_module.patch.object(pymongo.collection.Collection, 'find_one_and_update', mocked): + # get file storing by hash in legacy storage + (project_id, file_name_1, url_1, file_path_1) = files_to_migrate[0] + # get ile storing by uuid in legacy storage + (_, file_name_2, url_2, file_path_2) = files_to_migrate[1] + + # run the migration + migrate_storage.migrate_collections() + + file_1_id = api_db['projects'].find_one( + {'files.name': file_name_1} + )['files'][0]['_id'] + + file_2_id = api_db['projects'].find_one( + 
{'files.name': file_name_2} + )['files'][1]['_id'] + + assert config.fs.exists(util.path_from_uuid(file_1_id)) + assert config.fs.exists(util.path_from_uuid(file_2_id)) + + assert any(log.message == 'Probably the following file has been updated during the migration and its hash is changed, cleaning up from the new filesystem' for log in caplog.records) diff --git a/tests/integration_tests/python/test_undelete.py b/tests/integration_tests/python/test_undelete.py index 0af999ef2..2b2550008 100644 --- a/tests/integration_tests/python/test_undelete.py +++ b/tests/integration_tests/python/test_undelete.py @@ -116,7 +116,7 @@ def test_undelete_scope(undelete, containers, as_admin, api_db): def test_undelete_options(undelete, containers): - with pytest.raises(RuntimeError, match=r'use --include-parents'): + with pytest.raises(RuntimeError, match=r'use `--include-parents`'): undelete('acquisitions', containers.ac_1_1_1, filename='f_1_1_1_1') undelete('acquisitions', containers.ac_1_1_1, filename='f_1_1_1_1', include_parents=True) diff --git a/tests/integration_tests/requirements-integration-test.txt b/tests/integration_tests/requirements-integration-test.txt index 53c9cec21..474c13d0b 100644 --- a/tests/integration_tests/requirements-integration-test.txt +++ b/tests/integration_tests/requirements-integration-test.txt @@ -4,9 +4,9 @@ mock==2.0.0 mongomock==3.8.0 pdbpp==0.8.3 pylint==1.5.3 -pytest-cov==2.2.0 -pytest-mock==1.6.0 -pytest-watch==3.8.0 -pytest==2.8.5 +pytest-cov==2.5.1 +pytest-mock==1.7.0 +pytest-watch==4.1.0 +pytest==3.4.1 requests_mock==1.3.0 testfixtures==4.10.1 From 48162ade560cf2d19e0d408f1c80f36fdb6168f3 Mon Sep 17 00:00:00 2001 From: David Farkas Date: Thu, 1 Mar 2018 15:14:45 +0100 Subject: [PATCH 10/12] Handle specially the analysis during storage migration to avoid input files duplication --- bin/oneoffs/migrate_storage.py | 166 +++++++++++------- .../python/test_migrate_storage.py | 10 +- 2 files changed, 103 insertions(+), 73 deletions(-) diff --git 
a/bin/oneoffs/migrate_storage.py b/bin/oneoffs/migrate_storage.py index 278fc48f4..0df7fed36 100644 --- a/bin/oneoffs/migrate_storage.py +++ b/bin/oneoffs/migrate_storage.py @@ -13,13 +13,6 @@ log = logging.getLogger('migrate_storage') log.setLevel(logging.INFO) -COLLECTIONS_PREFIXES = [('projects', 'files'), - ('acquisitions', 'files'), - ('analyses', 'files'), - ('sessions', 'files'), - ('sessions', 'subject.files'), - ('collections', 'files')] - class MigrationError(Exception): pass @@ -44,16 +37,16 @@ def cleanup_empty_folders(): os.rmdir(_dirpath) -def get_collections_files(): +def get_containers_files(containers_prefixes): _files = [] - for collection, prefix in COLLECTIONS_PREFIXES: - cursor = config.db.get_collection(collection).find({}) + for container, prefix in containers_prefixes: + cursor = config.db.get_collection(container).find({}) for document in cursor: for f in get_files_by_prefix(document, prefix): f_dict = { - 'collection_id': document.get('_id'), - 'collection': collection, + 'container_id': document.get('_id'), + 'container': container, 'fileinfo': f, 'prefix': prefix } @@ -78,60 +71,97 @@ def get_gears_files(): return _files -def migrate_collections(): - log.info('Migrate collection files...') - - _files = get_collections_files() - - for i, f in enumerate(_files): - try: - file_id = f['fileinfo'].get('_id', '') - if file_id: - file_path = util.path_from_uuid(file_id) - if not config.fs.isfile(file_path): - """Copy file from legacy to new storage""" - - log.debug('copy file between the legacy and new storage: %s' % file_path) - - dst_dir = fs.path.dirname(file_path) - config.fs.makedirs(dst_dir, recreate=True) - fs.move.copy_file(src_fs=config.legacy_fs, src_path=file_path, dst_fs=config.fs, dst_path=file_path) - else: - """generate uuuid, set the id field and copy the file""" - file_id = str(uuid.uuid4()) - f_old_path = util.path_from_hash(f['fileinfo']['hash']) - f_new_path = util.path_from_uuid(file_id) +def migrate_file(f): + try: + 
file_id = f['fileinfo'].get('_id', '') + if file_id: + file_path = util.path_from_uuid(file_id) + if not config.fs.isfile(file_path): + """Copy file from legacy to new storage""" - log.debug('copy file %s to %s' % (f_old_path, f_new_path)) + log.debug('copy file between the legacy and new storage: %s' % file_path) - dst_dir = fs.path.dirname(f_new_path) + dst_dir = fs.path.dirname(file_path) config.fs.makedirs(dst_dir, recreate=True) - fs.move.copy_file(src_fs=config.legacy_fs, src_path=f_old_path, dst_fs=config.fs, dst_path=f_new_path) - - update_set = { - f['prefix'] + '.$.modified': datetime.datetime.utcnow(), - f['prefix'] + '.$._id': file_id - } - log.debug('update file in mongo: %s' % update_set) - # Update the file with the newly generated UUID - updated_doc = config.db[f['collection']].find_one_and_update( - {'_id': f['collection_id'], f['prefix'] + '.name': f['fileinfo']['name'], - f['prefix'] + '.hash': f['fileinfo']['hash']}, - {'$set': update_set} - ) - - if not updated_doc: - log.info('Probably the following file has been updated during the migration ' - 'and its hash is changed, cleaning up from the new filesystem') - config.fs.remove(f_new_path) - - show_progress(i + 1, len(_files)) - - except Exception as e: - log.exception(e) - raise MigrationError('Wasn\'t able to migrate the \'%s\' ' - 'file in the \'%s\' collection (collection id: %s)' % - (f['fileinfo']['name'], f['collection'], str(f['collection_id'])), e) + fs.move.copy_file(src_fs=config.legacy_fs, src_path=file_path, dst_fs=config.fs, dst_path=file_path) + else: + """generate uuuid, set the id field and copy the file""" + file_id = str(uuid.uuid4()) + f_old_path = util.path_from_hash(f['fileinfo']['hash']) + f_new_path = util.path_from_uuid(file_id) + + log.debug('copy file %s to %s' % (f_old_path, f_new_path)) + + dst_dir = fs.path.dirname(f_new_path) + config.fs.makedirs(dst_dir, recreate=True) + fs.move.copy_file(src_fs=config.legacy_fs, src_path=f_old_path, dst_fs=config.fs, 
dst_path=f_new_path) + + update_set = { + f['prefix'] + '.$.modified': datetime.datetime.utcnow(), + f['prefix'] + '.$._id': file_id + } + log.debug('update file in mongo: %s' % update_set) + # Update the file with the newly generated UUID + updated_doc = config.db[f['container']].find_one_and_update( + {'_id': f['container_id'], + f['prefix'] + '.name': f['fileinfo']['name'], + f['prefix'] + '.hash': f['fileinfo']['hash']}, + {'$set': update_set} + ) + + if not updated_doc: + log.info('Probably the following file has been updated during the migration ' + 'and its hash is changed, cleaning up from the new filesystem') + config.fs.remove(f_new_path) + + except Exception as e: + log.exception(e) + raise MigrationError('Wasn\'t able to migrate the \'%s\' ' + 'file in the \'%s\' container (container id: %s)' % + (f['fileinfo']['name'], f['container'], str(f['container_id'])), e) + + +def migrate_containers(): + log.info('Migrate container (projects, acquisitions, sessions, subject, collections) files...') + + container_files = get_containers_files([('projects', 'files'), + ('acquisitions', 'files'), + ('sessions', 'files'), + ('sessions', 'subject.files'), + ('collections', 'files')]) + + for i, f in enumerate(container_files): + migrate_file(f) + show_progress(i + 1, len(container_files)) + + log.info('Migrate analysis files...') + # Refresh the list of container files + container_files = get_containers_files([('projects', 'files'), + ('acquisitions', 'files'), + ('sessions', 'files'), + ('sessions', 'subject.files'), + ('collections', 'files')]) + analysis_files = get_containers_files([('analyses', 'files')]) + + for i, f in enumerate(analysis_files): + match = [cf for cf in container_files if cf['fileinfo']['hash'] == f['fileinfo']['hash'] and cf['fileinfo'].get('_id')] + # The file is already migrated + if len(match) > 0 and not f['fileinfo'].get('_id'): + update_set = { + f['prefix'] + '.$.modified': match[0]['fileinfo']['modified'], + f['prefix'] + '.$._id': 
match[0]['fileinfo']['_id'] + } + log.debug('update file in mongo: %s' % update_set) + # Update the file with the newly generated UUID + config.db[f['container']].find_one_and_update( + {'_id': f['container_id'], + f['prefix'] + '.name': f['fileinfo']['name'], + f['prefix'] + '.hash': f['fileinfo']['hash']}, + {'$set': update_set} + ) + else: + migrate_file(f) + show_progress(i + 1, len(analysis_files)) def migrate_gears(): @@ -189,7 +219,7 @@ def migrate_storage(): """ parser = argparse.ArgumentParser(prog='Migrate storage') - parser.add_argument('--collections', action='store_true', help='Migrate collections') + parser.add_argument('--containers', action='store_true', help='Migrate containers') parser.add_argument('--gears', action='store_true', help='Migrate gears') parser.add_argument('--delete-files', action='store_true', help='Delete files from legacy storage') @@ -197,12 +227,12 @@ def migrate_storage(): args = parser.parse_args() try: - if not (args.collections or args.gears): - migrate_collections() + if not (args.containers or args.gears): + migrate_containers() migrate_gears() - if args.collections: - migrate_collections() + if args.containers: + migrate_containers() if args.gears: migrate_gears() diff --git a/tests/integration_tests/python/test_migrate_storage.py b/tests/integration_tests/python/test_migrate_storage.py index 764d932bf..5cc179984 100644 --- a/tests/integration_tests/python/test_migrate_storage.py +++ b/tests/integration_tests/python/test_migrate_storage.py @@ -170,7 +170,7 @@ def files_to_migrate(data_builder, api_db, as_admin, randstr, file_form): except: pass -def test_migrate_collections(files_to_migrate, as_admin, migrate_storage): +def test_migrate_containers(files_to_migrate, as_admin, migrate_storage): """Testing collection migration""" # get file storing by hash in legacy storage @@ -194,7 +194,7 @@ def test_migrate_collections(files_to_migrate, as_admin, migrate_storage): # download the file assert as_admin.get(url_2, 
params={'ticket': ticket}).ok # run the migration - migrate_storage.migrate_collections() + migrate_storage.migrate_containers() # delete files from the legacy storage config.legacy_fs.remove(file_path_1) @@ -217,7 +217,7 @@ def test_migrate_collections(files_to_migrate, as_admin, migrate_storage): # download the file assert as_admin.get(url_2, params={'ticket': ticket}).ok -def test_migrate_collections_error(files_to_migrate, migrate_storage): +def test_migrate_containers_error(files_to_migrate, migrate_storage): """Testing that the migration script throws an exception if it couldn't migrate a file""" # get file storing by hash in legacy storage @@ -229,7 +229,7 @@ def test_migrate_collections_error(files_to_migrate, migrate_storage): config.legacy_fs.remove(file_path_1) with pytest.raises(migrate_storage.MigrationError): - migrate_storage.migrate_collections() + migrate_storage.migrate_containers() # clean up config.legacy_fs.remove(file_path_2) @@ -296,7 +296,7 @@ def mocked(*args, **kwargs): (_, file_name_2, url_2, file_path_2) = files_to_migrate[1] # run the migration - migrate_storage.migrate_collections() + migrate_storage.migrate_containers() file_1_id = api_db['projects'].find_one( {'files.name': file_name_1} From ddc023203c18bd1e3de4e6b82d3c00a911f0986a Mon Sep 17 00:00:00 2001 From: David Farkas Date: Fri, 2 Mar 2018 12:19:48 +0100 Subject: [PATCH 11/12] Add integration test to test the new analysis handling during the storage migration --- .../python/test_migrate_storage.py | 82 +++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/tests/integration_tests/python/test_migrate_storage.py b/tests/integration_tests/python/test_migrate_storage.py index 5cc179984..6695d9f38 100644 --- a/tests/integration_tests/python/test_migrate_storage.py +++ b/tests/integration_tests/python/test_migrate_storage.py @@ -310,3 +310,85 @@ def mocked(*args, **kwargs): assert config.fs.exists(util.path_from_uuid(file_2_id)) assert any(log.message == 'Probably the 
following file has been updated during the migration and its hash is changed, cleaning up from the new filesystem' for log in caplog.records) + + +def test_migrate_analysis(files_to_migrate, as_admin, migrate_storage, default_payload, data_builder, file_form): + """Testing analysis migration""" + + # get file storing by hash in legacy storage + (project_id, file_name_1, url_1, file_path_1) = files_to_migrate[0] + # get ile storing by uuid in legacy storage + (_, _,url_2, file_path_2) = files_to_migrate[1] + + gear_doc = default_payload['gear']['gear'] + gear_doc['inputs'] = { + 'csv': { + 'base': 'file' + } + } + gear = data_builder.create_gear(gear=gear_doc) + + # create project analysis (job) using project's file as input + r = as_admin.post('/projects/' + project_id + '/analyses', params={'job': 'true'}, json={ + 'analysis': {'label': 'test analysis job'}, + 'job': { + 'gear_id': gear, + 'inputs': { + 'csv': { + 'type': 'projects', + 'id': project_id, + 'name': file_name_1 + } + }, + 'tags': ['example'] + } + }) + assert r.ok + analysis_id1 = r.json()['_id'] + + r = as_admin.get('/projects/' + project_id + '/analyses/' + analysis_id1) + assert r.ok + analysis_files1 = '/projects/' + project_id + '/analyses/' + analysis_id1 + '/files' + + # run the migration + migrate_storage.migrate_containers() + + # delete files from the legacy storage + config.legacy_fs.remove(file_path_1) + config.legacy_fs.remove(file_path_2) + + # get the files from the new filesystem + # get the ticket + r = as_admin.get(url_1, params={'ticket': ''}) + assert r.ok + ticket = r.json()['ticket'] + + # download the file + assert as_admin.get(url_1, params={'ticket': ticket}).ok + + # get the ticket + r = as_admin.get(url_2, params={'ticket': ''}) + assert r.ok + ticket = r.json()['ticket'] + + # download the file + assert as_admin.get(url_2, params={'ticket': ticket}).ok + + # get analysis download ticket for single file + r = as_admin.get(analysis_files1 + '/' + file_name_1, 
params={'ticket': ''}) + assert r.ok + ticket = r.json()['ticket'] + + # download single analysis file w/ ticket + r = as_admin.get(analysis_files1 + '/' + file_name_1, params={'ticket': ticket}) + assert r.ok + + r = as_admin.get('/projects/' + project_id + '/analyses/' + analysis_id1) + assert r.ok + input_file_id = r.json()['files'][0]['_id'] + + r = as_admin.get('/projects/' + project_id) + assert r.ok + project_file_id = r.json()['files'][0]['_id'] + + assert input_file_id == project_file_id From 08d5016cfa0b10fae3336c5ad02bfd244aeb006d Mon Sep 17 00:00:00 2001 From: David Farkas Date: Thu, 8 Mar 2018 17:53:44 +0100 Subject: [PATCH 12/12] Add requsted changes --- api/config.py | 5 +++-- api/files.py | 5 ++++- api/handlers/listhandler.py | 20 ++++++++++---------- api/placer.py | 9 +++------ 4 files changed, 20 insertions(+), 19 deletions(-) diff --git a/api/config.py b/api/config.py index f1cc31d55..da24f7ff9 100644 --- a/api/config.py +++ b/api/config.py @@ -65,7 +65,8 @@ 'db_server_selection_timeout': '3000', 'data_path': os.path.join(os.path.dirname(__file__), '../persistent/data'), 'elasticsearch_host': 'localhost:9200', - 'fs_url': 'osfs://' + os.path.join(os.path.dirname(__file__), '../persistent/data') + 'fs_url': 'osfs://' + os.path.join(os.path.dirname(__file__), '../persistent/data'), + 'support_legacy_fs': True }, } @@ -329,4 +330,4 @@ def get_auth(auth_type): # Storage configuration fs = open_fs(__config['persistent']['fs_url']) legacy_fs = open_fs('osfs://' + __config['persistent']['data_path']) -support_legacy_fs = True +support_legacy_fs = __config['persistent']['support_legacy_fs'] diff --git a/api/files.py b/api/files.py index 5e3da5a34..5b2bfaeee 100644 --- a/api/files.py +++ b/api/files.py @@ -142,7 +142,10 @@ def _FieldStorage__write(self, line): self._FieldStorage__file = None self.file.write(line) - if hasattr(self, 'hasher'): + + # NOTE: In case of metadata, we don't have a file name and we also don't have a hasher, + # so skipping the 
hasher.update + if self.filename: self.hasher.update(line) return SingleFileFieldStorage diff --git a/api/handlers/listhandler.py b/api/handlers/listhandler.py index 14501a8dd..54d36da41 100644 --- a/api/handlers/listhandler.py +++ b/api/handlers/listhandler.py @@ -363,17 +363,17 @@ def build_zip_info(file_path, file_system): """ with file_system.open(file_path, 'rb') as f: with zipfile.ZipFile(f) as zf: - info = {} - info['comment'] = zf.comment - info['members'] = [] + info = { + 'comment': zf.comment, + 'members': [] + } for zi in zf.infolist(): - m = {} - m['path'] = zi.filename - m['size'] = zi.file_size - m['timestamp'] = datetime.datetime(*zi.date_time) - m['comment'] = zi.comment - - info['members'].append(m) + info['members'].append({ + 'path': zi.filename, + 'size': zi.file_size, + 'timestamp': datetime.datetime(*zi.date_time), + 'comment': zi.comment + }) return info diff --git a/api/placer.py b/api/placer.py index 2e37a92db..2e1215092 100644 --- a/api/placer.py +++ b/api/placer.py @@ -89,9 +89,6 @@ def save_file(self, field=None, file_attrs=None): if field is not None and self.file_processor is not None: self.file_processor.store_temp_file(field.path, util.path_from_uuid(field.uuid)) - # if field is not None: - # files.move_form_file_field_into_storage(field) - # Update the DB if file_attrs is not None: container_before, self.container = hierarchy.upsert_fileinfo(self.container_type, self.id_, file_attrs) @@ -507,13 +504,13 @@ def finalize(self): # Write all files to zip complete = 0 for path in paths: - p = fs.path.join(self.folder, path) + full_path = fs.path.join(self.folder, path) # Set the file's mtime & atime. 
- self.file_processor.persistent_fs.settimes(p, self.ziptime, self.ziptime) + self.file_processor.persistent_fs.settimes(full_path, self.ziptime, self.ziptime) # Place file into the zip folder we created before - with self.file_processor.persistent_fs.open(p, 'rb') as f: + with self.file_processor.persistent_fs.open(full_path, 'rb') as f: self.zip_.writestr(fs.path.join(self.dir_, path), f.read()) # Report progress