From 0e68c718081a290129736a54cb25eecc7c6c5d4f Mon Sep 17 00:00:00 2001 From: David Farkas Date: Fri, 20 Oct 2017 17:36:38 +0200 Subject: [PATCH 01/16] Add regexs to indentify file types, store these in mongodb, load them at app startup --- api/config.py | 13 ++++++++++ api/files.py | 17 ++++--------- api/filetypes.json | 56 +++++++++++++++++++++--------------------- api/filetypes_old.json | 29 ++++++++++++++++++++++ 4 files changed, 75 insertions(+), 40 deletions(-) create mode 100644 api/filetypes_old.json diff --git a/api/config.py b/api/config.py index 8bc5375bc..e30dfba27 100644 --- a/api/config.py +++ b/api/config.py @@ -226,6 +226,13 @@ def create_or_recreate_ttl_index(coll_name, index_name, ttl): def initialize_db(): log.info('Initializing database, creating indexes') + + try: + db.singletons.insert({'_id': 'init_db'}) + except pymongo.errors.DuplicateKeyError: + log.info('Database is already initialized') + return + # TODO review all indexes db.users.create_index('api_key.key') db.projects.create_index([('gid', 1), ('name', 1)]) @@ -253,6 +260,12 @@ def initialize_db(): now = datetime.datetime.utcnow() db.groups.update_one({'_id': 'unknown'}, {'$setOnInsert': { 'created': now, 'modified': now, 'label': 'Unknown', 'permissions': []}}, upsert=True) + with open(os.path.join(os.path.dirname(__file__), 'filetypes.json')) as fd: + filetypes = json.load(fd) + db.filetypes.insert_many(filetypes) + + log.info('Initializing database, creating indexes ....DONE') + def get_config(): global __last_update, __config, __config_persisted #pylint: disable=global-statement now = datetime.datetime.utcnow() diff --git a/api/files.py b/api/files.py index 0585b5993..169266648 100644 --- a/api/files.py +++ b/api/files.py @@ -150,18 +150,11 @@ def get_hash(self): # File extension --> scitran file type detection hueristics. # Listed in precendence order. 
-with open(os.path.join(os.path.dirname(__file__), 'filetypes.json')) as fd: - TYPE_MAP = json.load(fd) - -KNOWN_FILETYPES = {ext: filetype for filetype, extensions in TYPE_MAP.iteritems() for ext in extensions} def guess_type_from_filename(filename): - particles = filename.split('.')[1:] - extentions = ['.' + '.'.join(particles[i:]) for i in range(len(particles))] - for ext in extentions: - filetype = KNOWN_FILETYPES.get(ext.lower()) - if filetype: - break - else: - filetype = None + filetype = None + result = config.db.filetypes.find_one({'$where': 'function() {return RegExp(this.regex).test(\'%s\');}' % filename}) + if result: + filetype = result['_id'] + return filetype diff --git a/api/filetypes.json b/api/filetypes.json index aeef59564..f8f003c0e 100644 --- a/api/filetypes.json +++ b/api/filetypes.json @@ -1,29 +1,29 @@ -{ - "bval": [ ".bval", ".bvals" ], - "bvec": [ ".bvec", ".bvecs" ], - "dicom": [ ".dcm", ".dcm.zip", ".dicom.zip" ], - "eeg": [ ".eeg.zip" ], - "gephysio": [ ".gephysio.zip" ], - "ismrmrd": [ ".h5", ".hdf5" ], - "MATLAB data": [ ".mat" ], - "MGH data": [ ".mgh", ".mgz", ".mgh.gz" ], - "nifti": [ ".nii.gz", ".nii" ], - "parrec": [ ".parrec.zip", ".par-rec.zip" ], - "pfile": [ ".7.gz", ".7", ".7.zip" ], - "PsychoPy data": [ ".psydat" ], - "qa": [ ".qa.png", ".qa.json", ".qa.html" ], +[ + { "_id": "bval", "regex": ".*\\.(bval$|bvals$)" }, + { "_id": "bvec", "regex": ".*\\.(bvec$|bvecs$)" }, + { "_id": "dicom", "regex": ".*\\.(dcm$|dcm\\.zip|dicom\\.zip$)" }, + { "_id": "eeg", "regex": ".*\\.eeg\\.zip$" }, + { "_id": "gephysio", "regex": ".*\\.gephysio\\.zip$" }, + { "_id": "ismrmrd", "regex": ".*\\.(h5$|hdf5$)" }, + { "_id": "MATLAB data", "regex": ".*\\.mat$" }, + { "_id": "MGH data", "regex": ".*\\.(mgh$|mgz$|mgh\\.gz$)" }, + { "_id": "nifti", "regex": ".*\\.(nii\\.gz$|nii$)" }, + { "_id": "parrec", "regex": ".*\\.(parrec\\.zip$|par-rec\\.zip$)" }, + { "_id": "pfile", "regex": ".*\\.(7\\.gz$|7$|7\\.zip)" }, + { "_id": "PsychoPy data", "regex": 
".*\\.psydat$" }, + { "_id": "qa", "regex": ".*\\.(qa\\.png$|qa\\.json|qa\\.html$)" }, - "archive": [ ".zip", ".tbz2", ".tar.gz", ".tbz", ".tar.bz2", ".tgz", ".tar", ".txz", ".tar.xz" ], - "document": [ ".docx", ".doc" ], - "image": [ ".jpg", ".tif", ".jpeg", ".gif", ".bmp", ".png", ".tiff" ], - "markup": [ ".html", ".htm", ".xml" ], - "markdown": [ ".md", ".markdown" ], - "log": [ ".log" ], - "pdf": [ ".pdf" ], - "presentation": [ ".ppt", ".pptx" ], - "source code": [ ".c", ".py", ".cpp", ".js", ".m", ".json", ".java", ".php", ".css", ".toml", ".yaml", ".yml" ], - "spreadsheet": [ ".xls", ".xlsx" ], - "tabular data": [ ".csv.gz", ".csv" ], - "text": [ ".txt" ], - "video": [ ".mpeg", ".mpg", ".mov", ".mp4", ".m4v", ".mts" ] -} + { "_id": "archive", "regex": ".*\\.(zip$|tbz2$|tar\\.gz$|tbz$|tar\\.bz2$|tgz$|tar$|txz$|tar\\.xz$)" }, + { "_id": "document", "regex": ".*\\.(docx$|doc$)" }, + { "_id": "image", "regex": ".*\\.(jpg$|tif$|jpeg$|gif$|bmp$|png$|tiff$)" }, + { "_id": "markup", "regex": ".*\\.(html$|htm$|xml$)" }, + { "_id": "markdown", "regex": ".*\\.(md$|markdown$)" }, + { "_id": "log", "regex": ".*\\.log$" }, + { "_id": "pdf", "regex": ".*\\.pdf$" }, + { "_id": "presentation", "regex": ".*\\.(ppt$|pptx$)" }, + { "_id": "source code", "regex": ".*\\.(c$|py$|cpp$|js$|m$|json$|java$|php$|css$|toml$|yaml$|yml$)" }, + { "_id": "spreadsheet", "regex": ".*\\.(xls$|xlsx$)" }, + { "_id": "tabular data", "regex": ".*\\.(csv\\.gz$|csv$)" }, + { "_id": "text", "regex": ".*\\.txt$" }, + { "_id": "video", "regex": ".*\\.(mpeg$|mpg$|mov$|mp4$|m4v$|mts$)" } +] \ No newline at end of file diff --git a/api/filetypes_old.json b/api/filetypes_old.json new file mode 100644 index 000000000..aeef59564 --- /dev/null +++ b/api/filetypes_old.json @@ -0,0 +1,29 @@ +{ + "bval": [ ".bval", ".bvals" ], + "bvec": [ ".bvec", ".bvecs" ], + "dicom": [ ".dcm", ".dcm.zip", ".dicom.zip" ], + "eeg": [ ".eeg.zip" ], + "gephysio": [ ".gephysio.zip" ], + "ismrmrd": [ ".h5", ".hdf5" ], + "MATLAB 
data": [ ".mat" ], + "MGH data": [ ".mgh", ".mgz", ".mgh.gz" ], + "nifti": [ ".nii.gz", ".nii" ], + "parrec": [ ".parrec.zip", ".par-rec.zip" ], + "pfile": [ ".7.gz", ".7", ".7.zip" ], + "PsychoPy data": [ ".psydat" ], + "qa": [ ".qa.png", ".qa.json", ".qa.html" ], + + "archive": [ ".zip", ".tbz2", ".tar.gz", ".tbz", ".tar.bz2", ".tgz", ".tar", ".txz", ".tar.xz" ], + "document": [ ".docx", ".doc" ], + "image": [ ".jpg", ".tif", ".jpeg", ".gif", ".bmp", ".png", ".tiff" ], + "markup": [ ".html", ".htm", ".xml" ], + "markdown": [ ".md", ".markdown" ], + "log": [ ".log" ], + "pdf": [ ".pdf" ], + "presentation": [ ".ppt", ".pptx" ], + "source code": [ ".c", ".py", ".cpp", ".js", ".m", ".json", ".java", ".php", ".css", ".toml", ".yaml", ".yml" ], + "spreadsheet": [ ".xls", ".xlsx" ], + "tabular data": [ ".csv.gz", ".csv" ], + "text": [ ".txt" ], + "video": [ ".mpeg", ".mpg", ".mov", ".mp4", ".m4v", ".mts" ] +} From f2af51aa0da1d9fbb429c49a8e9035ea30f586b1 Mon Sep 17 00:00:00 2001 From: David Farkas Date: Fri, 20 Oct 2017 17:44:09 +0200 Subject: [PATCH 02/16] Remove unused import --- api/files.py | 1 - 1 file changed, 1 deletion(-) diff --git a/api/files.py b/api/files.py index 169266648..89f871009 100644 --- a/api/files.py +++ b/api/files.py @@ -1,6 +1,5 @@ import os import cgi -import json import shutil import hashlib import collections From 5cdd129424c598f0ed23cf5524eb877334af142b Mon Sep 17 00:00:00 2001 From: David Farkas Date: Wed, 25 Oct 2017 21:11:33 +0200 Subject: [PATCH 03/16] Add file type handler to manage file type through the api (add/replace, delete) --- api/api.py | 12 ++++-- api/config.py | 1 + api/handlers/filetypehandler.py | 37 +++++++++++++++++++ raml/schemas/mongo/filetype.json | 17 +++++++++ .../python/test_files.py | 2 - 5 files changed, 64 insertions(+), 5 deletions(-) create mode 100644 api/handlers/filetypehandler.py create mode 100644 raml/schemas/mongo/filetype.json rename tests/{unit_tests => integration_tests}/python/test_files.py (98%) diff 
--git a/api/api.py b/api/api.py index f0506ad45..e71605ad6 100644 --- a/api/api.py +++ b/api/api.py @@ -7,6 +7,7 @@ from .handlers.containerhandler import ContainerHandler from .handlers.dataexplorerhandler import DataExplorerHandler from .handlers.devicehandler import DeviceHandler +from .handlers.filetypehandler import FileType from .handlers.grouphandler import GroupHandler from .handlers.listhandler import FileListHandler, NotesListHandler, PermissionsListHandler, TagsListHandler from .handlers.refererhandler import AnalysesHandler @@ -43,6 +44,9 @@ # Filename 'fname': '[^/]+', + # File type name + 'ftypename': '[^/]+', + # Note ID 'nid': '[0-9a-f]{24}', @@ -77,9 +81,11 @@ def prefix(path, routes): # System configuration - route('/config', Config, m=['GET']), - route('/config.js', Config, h='get_js', m=['GET']), - route('/version', Version, m=['GET']), + route('/config', Config, m=['GET']), + route('/config.js', Config, h='get_js', m=['GET']), + route('/version', Version, m=['GET']), + route('/filetype', FileType, m=['GET', 'POST']), + route('/filetype/<_id:{ftypename}>', FileType, m=['DELETE']), # General-purpose upload & download diff --git a/api/config.py b/api/config.py index e30dfba27..b471758ab 100644 --- a/api/config.py +++ b/api/config.py @@ -139,6 +139,7 @@ def apply_env_variables(config): 'collection.json', 'container.json', 'file.json', + 'filetype.json', 'group.json', 'note.json', 'permission.json', diff --git a/api/handlers/filetypehandler.py b/api/handlers/filetypehandler.py new file mode 100644 index 000000000..7c56cc0b9 --- /dev/null +++ b/api/handlers/filetypehandler.py @@ -0,0 +1,37 @@ +from ..web import base +from .. import config +from .. 
import validators +from ..auth import userauth +from ..dao import noop + +class FileType(base.RequestHandler): + + def get(self): + """Get file types""" + resp = config.db.filetypes.find() + if resp != None: + return resp + else: + self.abort(404, "Version document does not exist") + + def post(self): + permchecker = userauth.default(self) + payload = self.request.json_body + mongo_schema_uri = validators.schema_uri('mongo', 'filetype.json') + mongo_validator = validators.decorator_from_schema_path(mongo_schema_uri) + mongo_validator(permchecker(noop))('PUT', payload=payload) + result = config.db.filetypes.replace_one({'_id': payload['_id']}, payload, upsert=True) + if result.acknowledged: + _id = result.upserted_id if result.upserted_id else payload['_id'] + return {'_id': _id} + else: + self.abort(404, 'File type {} not updated'.format(payload['_id'])) + + def delete(self, _id): + permchecker = userauth.default(self) + permchecker(noop)('DELETE', _id) + result = config.db.filetypes.delete_one({'_id': _id}) + if result.acknowledged: + return {'deleted': result.deleted_count} + else: + self.abort(404, 'File type {} not removed'.format(_id)) \ No newline at end of file diff --git a/raml/schemas/mongo/filetype.json b/raml/schemas/mongo/filetype.json new file mode 100644 index 000000000..258d9d926 --- /dev/null +++ b/raml/schemas/mongo/filetype.json @@ -0,0 +1,17 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "title": "File type", + "type": "object", + "properties": { + "_id": { + "title": "ID", + "type": "string" + }, + "regex": { + "title": "Regular Expression", + "type": "string" + } + }, + "additionalProperties": false, + "required":["_id", "regex"] +} diff --git a/tests/unit_tests/python/test_files.py b/tests/integration_tests/python/test_files.py similarity index 98% rename from tests/unit_tests/python/test_files.py rename to tests/integration_tests/python/test_files.py index 01b977787..b0a2409d5 100644 --- 
a/tests/unit_tests/python/test_files.py +++ b/tests/integration_tests/python/test_files.py @@ -1,5 +1,3 @@ - -import pytest from api import files From 466ca1b75c95e7938c92f6901bd0bc9f3e9200b1 Mon Sep 17 00:00:00 2001 From: David Farkas Date: Wed, 25 Oct 2017 22:21:05 +0200 Subject: [PATCH 04/16] New integration test for the file types handlers --- tests/integration_tests/python/test_files.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/integration_tests/python/test_files.py b/tests/integration_tests/python/test_files.py index b0a2409d5..ee005c97e 100644 --- a/tests/integration_tests/python/test_files.py +++ b/tests/integration_tests/python/test_files.py @@ -21,3 +21,10 @@ def test_qa(): def test_unknown(): assert files.guess_type_from_filename('example.unknown') == None + +def test_insert_delete(as_drone): + as_drone.post('/filetype', json={'_id': 'new', 'regex': '.*\.new$'}) + assert files.guess_type_from_filename('example.new') == 'new' + as_drone.post('/filetype', json={'_id': 'new', 'regex': '.*\.new2$'}) + assert files.guess_type_from_filename('example.new') == None + assert files.guess_type_from_filename('example.new2') == 'new' \ No newline at end of file From 0e1ffa0f6ed7b05ec46e230bc7b948d10d402d82 Mon Sep 17 00:00:00 2001 From: David Farkas Date: Thu, 26 Oct 2017 00:24:47 +0200 Subject: [PATCH 05/16] Add new try_update_one db util method, and fix the dup key exists error in initialize_db method --- api/config.py | 16 +++++++--------- api/dao/dbutil.py | 16 ++++++++++++++++ api/filetypes_old.json | 29 ----------------------------- 3 files changed, 23 insertions(+), 38 deletions(-) delete mode 100644 api/filetypes_old.json diff --git a/api/config.py b/api/config.py index b471758ab..ec94ead05 100644 --- a/api/config.py +++ b/api/config.py @@ -8,7 +8,7 @@ import elasticsearch from . 
import util -from .dao.dbutil import try_replace_one +from .dao.dbutil import try_replace_one, try_update_one logging.basicConfig( format='%(asctime)s %(name)16.16s %(filename)24.24s %(lineno)5d:%(levelname)4.4s %(message)s', @@ -228,12 +228,6 @@ def create_or_recreate_ttl_index(coll_name, index_name, ttl): def initialize_db(): log.info('Initializing database, creating indexes') - try: - db.singletons.insert({'_id': 'init_db'}) - except pymongo.errors.DuplicateKeyError: - log.info('Database is already initialized') - return - # TODO review all indexes db.users.create_index('api_key.key') db.projects.create_index([('gid', 1), ('name', 1)]) @@ -259,11 +253,15 @@ def initialize_db(): create_or_recreate_ttl_index('downloads', 'timestamp', 60) now = datetime.datetime.utcnow() - db.groups.update_one({'_id': 'unknown'}, {'$setOnInsert': { 'created': now, 'modified': now, 'label': 'Unknown', 'permissions': []}}, upsert=True) + try_update_one(db, + 'singletons', {'_id': 'unknown'}, + {'$setOnInsert': {'created': now, 'modified': now, 'label': 'Unknown', 'permissions': []}}, + upsert=True) with open(os.path.join(os.path.dirname(__file__), 'filetypes.json')) as fd: filetypes = json.load(fd) - db.filetypes.insert_many(filetypes) + for filetype in filetypes: + try_replace_one(db, 'filetypes', {'_id': filetype['_id']}, filetype, upsert=True) log.info('Initializing database, creating indexes ....DONE') diff --git a/api/dao/dbutil.py b/api/dao/dbutil.py index 9d3db91c0..266dc3e4f 100644 --- a/api/dao/dbutil.py +++ b/api/dao/dbutil.py @@ -4,6 +4,7 @@ from pymongo.errors import DuplicateKeyError from ..web.errors import APIStorageException + def try_replace_one(db, coll_name, query, update, upsert=False): """ Mongo does not see replace w/ upsert as an atomic action: @@ -39,3 +40,18 @@ def fault_tolerant_replace_one(db, coll_name, query, update, upsert=False): time.sleep(random.uniform(0.01,0.05)) raise APIStorageException('Unable to replace object.') + + +def try_update_one(db, 
coll_name, query, update, upsert=False): + """ + Mongo does not see replace w/ upsert as an atomic action: + https://jira.mongodb.org/browse/SERVER-14322 + + This function will try a replace_one operation, returning the result and if the operation succeeded. + """ + try: + result = db[coll_name].update_one(query, update, upsert=upsert) + except DuplicateKeyError: + return result, False + else: + return result, True diff --git a/api/filetypes_old.json b/api/filetypes_old.json deleted file mode 100644 index aeef59564..000000000 --- a/api/filetypes_old.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "bval": [ ".bval", ".bvals" ], - "bvec": [ ".bvec", ".bvecs" ], - "dicom": [ ".dcm", ".dcm.zip", ".dicom.zip" ], - "eeg": [ ".eeg.zip" ], - "gephysio": [ ".gephysio.zip" ], - "ismrmrd": [ ".h5", ".hdf5" ], - "MATLAB data": [ ".mat" ], - "MGH data": [ ".mgh", ".mgz", ".mgh.gz" ], - "nifti": [ ".nii.gz", ".nii" ], - "parrec": [ ".parrec.zip", ".par-rec.zip" ], - "pfile": [ ".7.gz", ".7", ".7.zip" ], - "PsychoPy data": [ ".psydat" ], - "qa": [ ".qa.png", ".qa.json", ".qa.html" ], - - "archive": [ ".zip", ".tbz2", ".tar.gz", ".tbz", ".tar.bz2", ".tgz", ".tar", ".txz", ".tar.xz" ], - "document": [ ".docx", ".doc" ], - "image": [ ".jpg", ".tif", ".jpeg", ".gif", ".bmp", ".png", ".tiff" ], - "markup": [ ".html", ".htm", ".xml" ], - "markdown": [ ".md", ".markdown" ], - "log": [ ".log" ], - "pdf": [ ".pdf" ], - "presentation": [ ".ppt", ".pptx" ], - "source code": [ ".c", ".py", ".cpp", ".js", ".m", ".json", ".java", ".php", ".css", ".toml", ".yaml", ".yml" ], - "spreadsheet": [ ".xls", ".xlsx" ], - "tabular data": [ ".csv.gz", ".csv" ], - "text": [ ".txt" ], - "video": [ ".mpeg", ".mpg", ".mov", ".mp4", ".m4v", ".mts" ] -} From 206d69e93a1a52163435f48730c21b85d55cbd72 Mon Sep 17 00:00:00 2001 From: David Farkas Date: Thu, 26 Oct 2017 02:37:08 +0200 Subject: [PATCH 06/16] Fix typo in config.py --- api/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/api/config.py b/api/config.py index ec94ead05..225e3d4e4 100644 --- a/api/config.py +++ b/api/config.py @@ -254,7 +254,7 @@ def initialize_db(): now = datetime.datetime.utcnow() try_update_one(db, - 'singletons', {'_id': 'unknown'}, + 'groups', {'_id': 'unknown'}, {'$setOnInsert': {'created': now, 'modified': now, 'label': 'Unknown', 'permissions': []}}, upsert=True) From b043b861d8d36f6f85a17240f4d0bbe80c5d5ad7 Mon Sep 17 00:00:00 2001 From: David Farkas Date: Thu, 26 Oct 2017 03:52:01 +0200 Subject: [PATCH 07/16] Add some documentation, and increase code coverage --- api/handlers/filetypehandler.py | 13 +++++++------ tests/integration_tests/python/test_files.py | 19 ++++++++++++++----- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/api/handlers/filetypehandler.py b/api/handlers/filetypehandler.py index 7c56cc0b9..95736afea 100644 --- a/api/handlers/filetypehandler.py +++ b/api/handlers/filetypehandler.py @@ -8,13 +8,13 @@ class FileType(base.RequestHandler): def get(self): """Get file types""" - resp = config.db.filetypes.find() - if resp != None: - return resp - else: - self.abort(404, "Version document does not exist") + return config.db.filetypes.find() def post(self): + """ + Insert or replace a file type. Required fields: '_id' and 'regex' where the '_id' is the unique name of + the file type and 'regex' is a regular expression which is used to figure out the file type from the file name. 
+ """ permchecker = userauth.default(self) payload = self.request.json_body mongo_schema_uri = validators.schema_uri('mongo', 'filetype.json') @@ -28,10 +28,11 @@ def post(self): self.abort(404, 'File type {} not updated'.format(payload['_id'])) def delete(self, _id): + """Delete a file type""" permchecker = userauth.default(self) permchecker(noop)('DELETE', _id) result = config.db.filetypes.delete_one({'_id': _id}) - if result.acknowledged: + if result.deleted_count: return {'deleted': result.deleted_count} else: self.abort(404, 'File type {} not removed'.format(_id)) \ No newline at end of file diff --git a/tests/integration_tests/python/test_files.py b/tests/integration_tests/python/test_files.py index ee005c97e..f1dd94b31 100644 --- a/tests/integration_tests/python/test_files.py +++ b/tests/integration_tests/python/test_files.py @@ -1,6 +1,5 @@ from api import files - def test_extension(): assert files.guess_type_from_filename('example.pdf') == 'pdf' @@ -22,9 +21,19 @@ def test_qa(): def test_unknown(): assert files.guess_type_from_filename('example.unknown') == None -def test_insert_delete(as_drone): - as_drone.post('/filetype', json={'_id': 'new', 'regex': '.*\.new$'}) +def test_get_insert_delete(as_drone): + r = as_drone.get('/filetype') + assert r.ok + r = as_drone.post('/filetype', json={'_id': 'new', 'regex': '.*\.new$'}) + assert r.ok assert files.guess_type_from_filename('example.new') == 'new' - as_drone.post('/filetype', json={'_id': 'new', 'regex': '.*\.new2$'}) + r = as_drone.post('/filetype', json={'_id': 'new', 'regex': '.*\.new2$'}) + assert r.ok assert files.guess_type_from_filename('example.new') == None - assert files.guess_type_from_filename('example.new2') == 'new' \ No newline at end of file + assert files.guess_type_from_filename('example.new2') == 'new' + r = as_drone.delete('/filetype/new') + assert r.ok + +def test_insert_delete_abort(as_drone): + r = as_drone.delete('/filetype/notexists') + assert r.status_code == 404 \ No newline at 
end of file From ad9a703c423cc3ff6e672f77a56b278c069d1f21 Mon Sep 17 00:00:00 2001 From: David Farkas Date: Fri, 10 Nov 2017 17:33:38 +0100 Subject: [PATCH 08/16] Update permission checking, schema validating, remove JS query --- api/config.py | 2 +- api/files.py | 15 +++++++++------ api/handlers/filetypehandler.py | 20 +++++++++----------- raml/schemas/{mongo => input}/filetype.json | 0 4 files changed, 19 insertions(+), 18 deletions(-) rename raml/schemas/{mongo => input}/filetype.json (100%) diff --git a/api/config.py b/api/config.py index 225e3d4e4..e2fa37079 100644 --- a/api/config.py +++ b/api/config.py @@ -139,7 +139,6 @@ def apply_env_variables(config): 'collection.json', 'container.json', 'file.json', - 'filetype.json', 'group.json', 'note.json', 'permission.json', @@ -162,6 +161,7 @@ def apply_env_variables(config): 'container.json', 'device.json', 'file.json', + 'filetype.json', 'file-update.json', 'group-new.json', 'group-update.json', diff --git a/api/files.py b/api/files.py index 89f871009..1e007c027 100644 --- a/api/files.py +++ b/api/files.py @@ -1,8 +1,9 @@ -import os import cgi -import shutil -import hashlib import collections +import hashlib +import os +import re +import shutil from . import util from . import config @@ -152,8 +153,10 @@ def get_hash(self): def guess_type_from_filename(filename): filetype = None - result = config.db.filetypes.find_one({'$where': 'function() {return RegExp(this.regex).test(\'%s\');}' % filename}) - if result: - filetype = result['_id'] + cursor = config.db.filetypes.find({}) + for document in cursor: + if re.match(document['regex'], filename): + filetype = document['_id'] + break return filetype diff --git a/api/handlers/filetypehandler.py b/api/handlers/filetypehandler.py index 95736afea..7b9ef0071 100644 --- a/api/handlers/filetypehandler.py +++ b/api/handlers/filetypehandler.py @@ -1,25 +1,24 @@ -from ..web import base from .. import config -from .. 
import validators -from ..auth import userauth -from ..dao import noop +from ..auth import require_admin, require_login +from ..validators import validate_data +from ..web import base + class FileType(base.RequestHandler): + @require_login def get(self): """Get file types""" return config.db.filetypes.find() + @require_admin def post(self): """ Insert or replace a file type. Required fields: '_id' and 'regex' where the '_id' is the unique name of the file type and 'regex' is a regular expression which is used to figure out the file type from the file name. """ - permchecker = userauth.default(self) payload = self.request.json_body - mongo_schema_uri = validators.schema_uri('mongo', 'filetype.json') - mongo_validator = validators.decorator_from_schema_path(mongo_schema_uri) - mongo_validator(permchecker(noop))('PUT', payload=payload) + validate_data(payload, 'filetype.json', 'input', 'POST') result = config.db.filetypes.replace_one({'_id': payload['_id']}, payload, upsert=True) if result.acknowledged: _id = result.upserted_id if result.upserted_id else payload['_id'] @@ -27,12 +26,11 @@ def post(self): else: self.abort(404, 'File type {} not updated'.format(payload['_id'])) + @require_admin def delete(self, _id): """Delete a file type""" - permchecker = userauth.default(self) - permchecker(noop)('DELETE', _id) result = config.db.filetypes.delete_one({'_id': _id}) if result.deleted_count: return {'deleted': result.deleted_count} else: - self.abort(404, 'File type {} not removed'.format(_id)) \ No newline at end of file + self.abort(404, 'File type {} not removed'.format(_id)) diff --git a/raml/schemas/mongo/filetype.json b/raml/schemas/input/filetype.json similarity index 100% rename from raml/schemas/mongo/filetype.json rename to raml/schemas/input/filetype.json From dd7e4ca94fc1f3bcb12d9ee73aaafcdada44955b Mon Sep 17 00:00:00 2001 From: David Farkas Date: Mon, 13 Nov 2017 16:38:59 +0100 Subject: [PATCH 09/16] Load mongo filetypes via the bootstrap script --- 
api/config.py | 5 ---- api/filetypes.json | 29 ------------------- ...s_drone_secret.py => load_drone_secret.py} | 13 ++++++--- bootstrap.sample.json | 29 +++++++++++++++++++ docker/README.md | 2 +- docker/bootstrap-accounts.sh | 28 ------------------ docker/bootstrap-defaults.sh | 28 ++++++++++++++++++ 7 files changed, 67 insertions(+), 67 deletions(-) delete mode 100644 api/filetypes.json rename bin/{load_users_drone_secret.py => load_drone_secret.py} (91%) delete mode 100755 docker/bootstrap-accounts.sh create mode 100755 docker/bootstrap-defaults.sh diff --git a/api/config.py b/api/config.py index e2fa37079..2306cbc94 100644 --- a/api/config.py +++ b/api/config.py @@ -258,11 +258,6 @@ def initialize_db(): {'$setOnInsert': {'created': now, 'modified': now, 'label': 'Unknown', 'permissions': []}}, upsert=True) - with open(os.path.join(os.path.dirname(__file__), 'filetypes.json')) as fd: - filetypes = json.load(fd) - for filetype in filetypes: - try_replace_one(db, 'filetypes', {'_id': filetype['_id']}, filetype, upsert=True) - log.info('Initializing database, creating indexes ....DONE') def get_config(): diff --git a/api/filetypes.json b/api/filetypes.json deleted file mode 100644 index f8f003c0e..000000000 --- a/api/filetypes.json +++ /dev/null @@ -1,29 +0,0 @@ -[ - { "_id": "bval", "regex": ".*\\.(bval$|bvals$)" }, - { "_id": "bvec", "regex": ".*\\.(bvec$|bvecs$)" }, - { "_id": "dicom", "regex": ".*\\.(dcm$|dcm\\.zip|dicom\\.zip$)" }, - { "_id": "eeg", "regex": ".*\\.eeg\\.zip$" }, - { "_id": "gephysio", "regex": ".*\\.gephysio\\.zip$" }, - { "_id": "ismrmrd", "regex": ".*\\.(h5$|hdf5$)" }, - { "_id": "MATLAB data", "regex": ".*\\.mat$" }, - { "_id": "MGH data", "regex": ".*\\.(mgh$|mgz$|mgh\\.gz$)" }, - { "_id": "nifti", "regex": ".*\\.(nii\\.gz$|nii$)" }, - { "_id": "parrec", "regex": ".*\\.(parrec\\.zip$|par-rec\\.zip$)" }, - { "_id": "pfile", "regex": ".*\\.(7\\.gz$|7$|7\\.zip)" }, - { "_id": "PsychoPy data", "regex": ".*\\.psydat$" }, - { "_id": "qa", 
"regex": ".*\\.(qa\\.png$|qa\\.json|qa\\.html$)" }, - - { "_id": "archive", "regex": ".*\\.(zip$|tbz2$|tar\\.gz$|tbz$|tar\\.bz2$|tgz$|tar$|txz$|tar\\.xz$)" }, - { "_id": "document", "regex": ".*\\.(docx$|doc$)" }, - { "_id": "image", "regex": ".*\\.(jpg$|tif$|jpeg$|gif$|bmp$|png$|tiff$)" }, - { "_id": "markup", "regex": ".*\\.(html$|htm$|xml$)" }, - { "_id": "markdown", "regex": ".*\\.(md$|markdown$)" }, - { "_id": "log", "regex": ".*\\.log$" }, - { "_id": "pdf", "regex": ".*\\.pdf$" }, - { "_id": "presentation", "regex": ".*\\.(ppt$|pptx$)" }, - { "_id": "source code", "regex": ".*\\.(c$|py$|cpp$|js$|m$|json$|java$|php$|css$|toml$|yaml$|yml$)" }, - { "_id": "spreadsheet", "regex": ".*\\.(xls$|xlsx$)" }, - { "_id": "tabular data", "regex": ".*\\.(csv\\.gz$|csv$)" }, - { "_id": "text", "regex": ".*\\.txt$" }, - { "_id": "video", "regex": ".*\\.(mpeg$|mpg$|mov$|mp4$|m4v$|mts$)" } -] \ No newline at end of file diff --git a/bin/load_users_drone_secret.py b/bin/load_drone_secret.py similarity index 91% rename from bin/load_users_drone_secret.py rename to bin/load_drone_secret.py index 471a73669..9f170e1fd 100755 --- a/bin/load_users_drone_secret.py +++ b/bin/load_drone_secret.py @@ -62,9 +62,9 @@ def _upsert_permission(request_session, api_url, permission_doc, group_id): full_permission_url = "{0}/{1}".format(base_permission_url, permission_doc['_id']) return request_session.put(full_permission_url, json=permission_doc) -def users(filepath, api_url, http_headers, insecure): +def bootstrap(filepath, api_url, http_headers, insecure): """ - Upserts the users/groups/permissions defined in filepath parameter. + Upserts the users/groups/permissions/file types defined in filepath parameter. Raises: requests.HTTPError: Upsert failed. 
@@ -95,7 +95,7 @@ def users(filepath, api_url, http_headers, insecure): log.info('bootstrapping projects...') for p in input_data.get('projects', []): - r = rs.post(api_url + '/projects?inherit=true' , json=p) + r = rs.post(api_url + '/projects?inherit=true', json=p) r.raise_for_status() project_id = r.json()['_id'] @@ -111,6 +111,11 @@ def users(filepath, api_url, http_headers, insecure): r = rs.post(api_url + '/projects/' + project_id + '/rules', json=rule) r.raise_for_status() + log.info('bootstrapping file types...') + for f in input_data.get('filetypes', []): + r = rs.post(api_url + '/filetype', json=f) + r.raise_for_status() + log.info('bootstrapping complete') @@ -134,7 +139,7 @@ def users(filepath, api_url, http_headers, insecure): # TODO: extend this to support oauth tokens try: - users(args.json, args.url, http_headers, args.insecure) + bootstrap(args.json, args.url, http_headers, args.insecure) except requests.HTTPError as ex: log.error(ex) log.error("request_body={0}".format(ex.response.request.body)) diff --git a/bootstrap.sample.json b/bootstrap.sample.json index e85943b67..35c067f96 100644 --- a/bootstrap.sample.json +++ b/bootstrap.sample.json @@ -25,5 +25,34 @@ "_id": "local", "type": "engine" } + ], + "filetypes": [ + { "_id": "bval", "regex": ".*\\.(bval$|bvals$)" }, + { "_id": "bvec", "regex": ".*\\.(bvec$|bvecs$)" }, + { "_id": "dicom", "regex": ".*\\.(dcm$|dcm\\.zip|dicom\\.zip$)" }, + { "_id": "eeg", "regex": ".*\\.eeg\\.zip$" }, + { "_id": "gephysio", "regex": ".*\\.gephysio\\.zip$" }, + { "_id": "ismrmrd", "regex": ".*\\.(h5$|hdf5$)" }, + { "_id": "MATLAB data", "regex": ".*\\.mat$" }, + { "_id": "MGH data", "regex": ".*\\.(mgh$|mgz$|mgh\\.gz$)" }, + { "_id": "nifti", "regex": ".*\\.(nii\\.gz$|nii$)" }, + { "_id": "parrec", "regex": ".*\\.(parrec\\.zip$|par-rec\\.zip$)" }, + { "_id": "pfile", "regex": ".*\\.(7\\.gz$|7$|7\\.zip)" }, + { "_id": "PsychoPy data", "regex": ".*\\.psydat$" }, + { "_id": "qa", "regex": 
".*\\.(qa\\.png$|qa\\.json|qa\\.html$)" }, + + { "_id": "archive", "regex": ".*\\.(zip$|tbz2$|tar\\.gz$|tbz$|tar\\.bz2$|tgz$|tar$|txz$|tar\\.xz$)" }, + { "_id": "document", "regex": ".*\\.(docx$|doc$)" }, + { "_id": "image", "regex": ".*\\.(jpg$|tif$|jpeg$|gif$|bmp$|png$|tiff$)" }, + { "_id": "markup", "regex": ".*\\.(html$|htm$|xml$)" }, + { "_id": "markdown", "regex": ".*\\.(md$|markdown$)" }, + { "_id": "log", "regex": ".*\\.log$" }, + { "_id": "pdf", "regex": ".*\\.pdf$" }, + { "_id": "presentation", "regex": ".*\\.(ppt$|pptx$)" }, + { "_id": "source code", "regex": ".*\\.(c$|py$|cpp$|js$|m$|json$|java$|php$|css$|toml$|yaml$|yml$)" }, + { "_id": "spreadsheet", "regex": ".*\\.(xls$|xlsx$)" }, + { "_id": "tabular data", "regex": ".*\\.(csv\\.gz$|csv$)" }, + { "_id": "text", "regex": ".*\\.txt$" }, + { "_id": "video", "regex": ".*\\.(mpeg$|mpg$|mov$|mp4$|m4v$|mts$)" } ] } diff --git a/docker/README.md b/docker/README.md index cd98f3cf9..18bf1dc9f 100644 --- a/docker/README.md +++ b/docker/README.md @@ -44,7 +44,7 @@ preserving their contents across container instances. --rm \ -v /dev/bali.prod/docker/uwsgi/bootstrap-dev.json:/accounts.json \ scitran-core \ - /var/scitran/code/api/docker/bootstrap-accounts.sh \ + /var/scitran/code/api/docker/bootstrap-defaults.sh \ /accounts.json diff --git a/docker/bootstrap-accounts.sh b/docker/bootstrap-accounts.sh deleted file mode 100755 index e8aab4c1b..000000000 --- a/docker/bootstrap-accounts.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash -set -e -set -x - -echo "IN BOOTSTRAP ACCOUNTS" - -( - -# Parse input parameters... -# -# bootstrap account file -bootstrap_user_file=${1:-'/var/scitran/code/api/bootstrap.json.sample'} - - -# Move to API folder for relative path assumptions later on -# -cd /var/scitran/code/api - -# Export PYTHONPATH for python script later on. -# -export PYTHONPATH=. 
- - -# Bootstrap Users -./bin/load_users_drone_secret.py --insecure --secret "${SCITRAN_CORE_DRONE_SECRET}" "${SCITRAN_SITE_API_URL}" "${bootstrap_user_file}" - - -) diff --git a/docker/bootstrap-defaults.sh b/docker/bootstrap-defaults.sh new file mode 100755 index 000000000..9d1823bac --- /dev/null +++ b/docker/bootstrap-defaults.sh @@ -0,0 +1,28 @@ +#!/bin/bash +set -e +set -x + +echo "IN BOOTSTRAP DEFAULTS" + +( + +# Parse input parameters... +# +# bootstrap file +bootstrap_file=${1:-'/var/scitran/code/api/bootstrap.sample.json'} + + +# Move to API folder for relative path assumptions later on +# +cd /var/scitran/code/api + +# Export PYTHONPATH for python script later on. +# +export PYTHONPATH=. + + +# Bootstrap users and file types +./bin/load_drone_secret.py --insecure --secret "${SCITRAN_CORE_DRONE_SECRET}" "${SCITRAN_SITE_API_URL}" "${bootstrap_file}" + + +) From 0cdbf5def5843134ebe1b7cdc2c71dbd7f3d2f89 Mon Sep 17 00:00:00 2001 From: David Farkas Date: Mon, 13 Nov 2017 17:23:00 +0100 Subject: [PATCH 10/16] Update file type tests --- api/files.py | 1 - tests/integration_tests/python/test_files.py | 22 ++++++++++++++++---- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/api/files.py b/api/files.py index 1e007c027..ee73c15ed 100644 --- a/api/files.py +++ b/api/files.py @@ -157,6 +157,5 @@ def guess_type_from_filename(filename): for document in cursor: if re.match(document['regex'], filename): filetype = document['_id'] - break return filetype diff --git a/tests/integration_tests/python/test_files.py b/tests/integration_tests/python/test_files.py index f1dd94b31..c9fc60cf2 100644 --- a/tests/integration_tests/python/test_files.py +++ b/tests/integration_tests/python/test_files.py @@ -1,18 +1,32 @@ from api import files -def test_extension(): +def test_extension(as_drone): + r = as_drone.post('/filetype', json={'_id': 'pdf', 'regex': '.*\.pdf$'}) + assert r.ok assert files.guess_type_from_filename('example.pdf') == 'pdf' -def 
test_multi_extension(): +def test_multi_extension(as_drone): + r = as_drone.post('/filetype', + json={'_id': 'archive', + 'regex': '.*\.(zip$|tbz2$|tar\.gz$|tbz$|tar\.bz2$|tgz$|tar$|txz$|tar\.xz$)'}) + assert r.ok + r = as_drone.post('/filetype', json={'_id': 'gephysio', 'regex': '.*\.gephysio\.zip$'}) + assert r.ok assert files.guess_type_from_filename('example.zip') == 'archive' assert files.guess_type_from_filename('example.gephysio.zip') == 'gephysio' -def test_nifti(): +def test_nifti(as_drone): + r = as_drone.post('/filetype', json={'_id': 'nifti', 'regex': '.*\.(nii\.gz$|nii$)'}) + assert r.ok assert files.guess_type_from_filename('example.nii') == 'nifti' assert files.guess_type_from_filename('example.nii.gz') == 'nifti' assert files.guess_type_from_filename('example.nii.x.gz') == None -def test_qa(): +def test_qa(as_drone): + r = as_drone.post('/filetype', json={'_id': 'image', 'regex': '.*\.(jpg$|tif$|jpeg$|gif$|bmp$|png$|tiff$)'}) + assert r.ok + r = as_drone.post('/filetype', json={'_id': 'qa', 'regex': '.*\.(qa\.png$|qa\.json|qa\.html$)'}) + assert r.ok assert files.guess_type_from_filename('example.png') == 'image' assert files.guess_type_from_filename('example.qa.png') == 'qa' assert files.guess_type_from_filename('example.qa') == None From 6cf6e2ceefc90eb88799f3999c47c107629e821d Mon Sep 17 00:00:00 2001 From: David Farkas Date: Thu, 16 Nov 2017 13:07:46 +0100 Subject: [PATCH 11/16] Refine the file type regular expressions, use re.search instead of match, update the integration tests and validate the regex in the file type handler --- api/files.py | 5 +- api/handlers/filetypehandler.py | 8 ++- bootstrap.sample.json | 52 ++++++++++---------- tests/integration_tests/python/conftest.py | 7 +++ tests/integration_tests/python/test_files.py | 50 ++++++++++++------- 5 files changed, 75 insertions(+), 47 deletions(-) diff --git a/api/files.py b/api/files.py index ee73c15ed..b5596e2bd 100644 --- a/api/files.py +++ b/api/files.py @@ -153,9 +153,12 @@ def 
get_hash(self): def guess_type_from_filename(filename): filetype = None + m_length = 0 cursor = config.db.filetypes.find({}) + for document in cursor: - if re.match(document['regex'], filename): + m = re.search(document['regex'], filename) + if m and m_length < len(m.group(0)): filetype = document['_id'] return filetype diff --git a/api/handlers/filetypehandler.py b/api/handlers/filetypehandler.py index 7b9ef0071..257eb8d4c 100644 --- a/api/handlers/filetypehandler.py +++ b/api/handlers/filetypehandler.py @@ -1,6 +1,8 @@ +import re + from .. import config from ..auth import require_admin, require_login -from ..validators import validate_data +from ..validators import validate_data, InputValidationException from ..web import base @@ -19,6 +21,10 @@ def post(self): """ payload = self.request.json_body validate_data(payload, 'filetype.json', 'input', 'POST') + try: + re.compile(payload['regex']) + except re.error: + raise InputValidationException('Invalid regular expression') result = config.db.filetypes.replace_one({'_id': payload['_id']}, payload, upsert=True) if result.acknowledged: _id = result.upserted_id if result.upserted_id else payload['_id'] diff --git a/bootstrap.sample.json b/bootstrap.sample.json index 35c067f96..86f8d40a5 100644 --- a/bootstrap.sample.json +++ b/bootstrap.sample.json @@ -27,32 +27,32 @@ } ], "filetypes": [ - { "_id": "bval", "regex": ".*\\.(bval$|bvals$)" }, - { "_id": "bvec", "regex": ".*\\.(bvec$|bvecs$)" }, - { "_id": "dicom", "regex": ".*\\.(dcm$|dcm\\.zip|dicom\\.zip$)" }, - { "_id": "eeg", "regex": ".*\\.eeg\\.zip$" }, - { "_id": "gephysio", "regex": ".*\\.gephysio\\.zip$" }, - { "_id": "ismrmrd", "regex": ".*\\.(h5$|hdf5$)" }, - { "_id": "MATLAB data", "regex": ".*\\.mat$" }, - { "_id": "MGH data", "regex": ".*\\.(mgh$|mgz$|mgh\\.gz$)" }, - { "_id": "nifti", "regex": ".*\\.(nii\\.gz$|nii$)" }, - { "_id": "parrec", "regex": ".*\\.(parrec\\.zip$|par-rec\\.zip$)" }, - { "_id": "pfile", "regex": ".*\\.(7\\.gz$|7$|7\\.zip)" }, - { 
"_id": "PsychoPy data", "regex": ".*\\.psydat$" }, - { "_id": "qa", "regex": ".*\\.(qa\\.png$|qa\\.json|qa\\.html$)" }, + { "_id": "bval", "regex": "\\.(bval|bvals)$" }, + { "_id": "bvec", "regex": "\\.(bvec|bvecs)$" }, + { "_id": "dicom", "regex": "\\.(dcm|dcm\\.zip|dicom\\.zip)$" }, + { "_id": "eeg", "regex": "\\.eeg\\.zip$" }, + { "_id": "gephysio", "regex": "\\.gephysio\\.zip$" }, + { "_id": "ismrmrd", "regex": "\\.(h5|hdf5)$" }, + { "_id": "MATLAB data", "regex": "\\.mat$" }, + { "_id": "MGH data", "regex": "\\.(mgh|mgz|mgh\\.gz)$" }, + { "_id": "nifti", "regex": "\\.(nii\\.gz|nii)$" }, + { "_id": "parrec", "regex": "\\.(parrec\\.zip|par-rec\\.zip)$" }, + { "_id": "pfile", "regex": "\\.(7\\.gz|7|7\\.zip)$" }, + { "_id": "PsychoPy data", "regex": "\\.psydat$" }, + { "_id": "qa", "regex": "\\.(qa\\.png|qa\\.json|qa\\.html)$" }, - { "_id": "archive", "regex": ".*\\.(zip$|tbz2$|tar\\.gz$|tbz$|tar\\.bz2$|tgz$|tar$|txz$|tar\\.xz$)" }, - { "_id": "document", "regex": ".*\\.(docx$|doc$)" }, - { "_id": "image", "regex": ".*\\.(jpg$|tif$|jpeg$|gif$|bmp$|png$|tiff$)" }, - { "_id": "markup", "regex": ".*\\.(html$|htm$|xml$)" }, - { "_id": "markdown", "regex": ".*\\.(md$|markdown$)" }, - { "_id": "log", "regex": ".*\\.log$" }, - { "_id": "pdf", "regex": ".*\\.pdf$" }, - { "_id": "presentation", "regex": ".*\\.(ppt$|pptx$)" }, - { "_id": "source code", "regex": ".*\\.(c$|py$|cpp$|js$|m$|json$|java$|php$|css$|toml$|yaml$|yml$)" }, - { "_id": "spreadsheet", "regex": ".*\\.(xls$|xlsx$)" }, - { "_id": "tabular data", "regex": ".*\\.(csv\\.gz$|csv$)" }, - { "_id": "text", "regex": ".*\\.txt$" }, - { "_id": "video", "regex": ".*\\.(mpeg$|mpg$|mov$|mp4$|m4v$|mts$)" } + { "_id": "archive", "regex": "\\.(zip|tbz2|tar\\.gz|tbz|tar\\.bz2|tgz|tar|txz|tar\\.xz)$" }, + { "_id": "document", "regex": "\\.(docx|doc)$" }, + { "_id": "image", "regex": "\\.(jpg|tif|jpeg|gif|bmp|png|tiff)$" }, + { "_id": "markup", "regex": "\\.(html|htm|xml)$" }, + { "_id": "markdown", "regex": 
"\\.(md|markdown)$" }, + { "_id": "log", "regex": "\\.log$" }, + { "_id": "pdf", "regex": "\\.pdf$" }, + { "_id": "presentation", "regex": "\\.(ppt|pptx)$" }, + { "_id": "source code", "regex": "\\.(c|py|cpp|js|m|json|java|php|css|toml|yaml|yml)$" }, + { "_id": "spreadsheet", "regex": "\\.(xls|xlsx)$" }, + { "_id": "tabular data", "regex": "\\.(csv\\.gz|csv)$" }, + { "_id": "text", "regex": "\\.txt$" }, + { "_id": "video", "regex": "\\.(mpeg|mpg|mov|mp4|m4v|mts)$" } ] } diff --git a/tests/integration_tests/python/conftest.py b/tests/integration_tests/python/conftest.py index 7cd2593d5..509303c4f 100644 --- a/tests/integration_tests/python/conftest.py +++ b/tests/integration_tests/python/conftest.py @@ -32,6 +32,13 @@ def bootstrap_users(as_drone): return data_builder +@pytest.fixture(scope='session', autouse=True) +def bootstrap_filetypes(as_admin): + """Create file types""" + as_admin.post('/filetype', json={'_id': 'tabular data', 'regex': '\.(csv\.gz|csv)$'}) + as_admin.post('/filetype', json={'_id': 'text', 'regex': '\.txt$'}) + + @pytest.fixture(scope='session') def as_drone(): """Return requests session with drone access""" diff --git a/tests/integration_tests/python/test_files.py b/tests/integration_tests/python/test_files.py index c9fc60cf2..c382fcdcd 100644 --- a/tests/integration_tests/python/test_files.py +++ b/tests/integration_tests/python/test_files.py @@ -1,53 +1,65 @@ from api import files -def test_extension(as_drone): - r = as_drone.post('/filetype', json={'_id': 'pdf', 'regex': '.*\.pdf$'}) + +def test_extension(as_admin): + r = as_admin.post('/filetype', json={'_id': 'pdf', 'regex': '\.pdf$'}) assert r.ok assert files.guess_type_from_filename('example.pdf') == 'pdf' -def test_multi_extension(as_drone): - r = as_drone.post('/filetype', + +def test_multi_extension(as_admin): + r = as_admin.post('/filetype', json={'_id': 'archive', - 'regex': '.*\.(zip$|tbz2$|tar\.gz$|tbz$|tar\.bz2$|tgz$|tar$|txz$|tar\.xz$)'}) + 'regex': '\.zip$'}) assert r.ok - r = 
as_drone.post('/filetype', json={'_id': 'gephysio', 'regex': '.*\.gephysio\.zip$'}) + r = as_admin.post('/filetype', json={'_id': 'gephysio', 'regex': '\.gephysio\.zip$'}) assert r.ok assert files.guess_type_from_filename('example.zip') == 'archive' assert files.guess_type_from_filename('example.gephysio.zip') == 'gephysio' -def test_nifti(as_drone): - r = as_drone.post('/filetype', json={'_id': 'nifti', 'regex': '.*\.(nii\.gz$|nii$)'}) + +def test_nifti(as_admin): + r = as_admin.post('/filetype', json={'_id': 'nifti', 'regex': '\.(nii\.gz|nii)$'}) assert r.ok assert files.guess_type_from_filename('example.nii') == 'nifti' assert files.guess_type_from_filename('example.nii.gz') == 'nifti' assert files.guess_type_from_filename('example.nii.x.gz') == None -def test_qa(as_drone): - r = as_drone.post('/filetype', json={'_id': 'image', 'regex': '.*\.(jpg$|tif$|jpeg$|gif$|bmp$|png$|tiff$)'}) + +def test_qa(as_admin): + r = as_admin.post('/filetype', json={'_id': 'image', 'regex': '\.png$'}) assert r.ok - r = as_drone.post('/filetype', json={'_id': 'qa', 'regex': '.*\.(qa\.png$|qa\.json|qa\.html$)'}) + r = as_admin.post('/filetype', json={'_id': 'qa', 'regex': '\.qa\.png$'}) assert r.ok assert files.guess_type_from_filename('example.png') == 'image' assert files.guess_type_from_filename('example.qa.png') == 'qa' assert files.guess_type_from_filename('example.qa') == None assert files.guess_type_from_filename('example.qa.png.unknown') == None + def test_unknown(): assert files.guess_type_from_filename('example.unknown') == None -def test_get_insert_delete(as_drone): - r = as_drone.get('/filetype') + +def test_get_insert_delete(as_admin): + r = as_admin.get('/filetype') assert r.ok - r = as_drone.post('/filetype', json={'_id': 'new', 'regex': '.*\.new$'}) + r = as_admin.post('/filetype', json={'_id': 'new', 'regex': '\.new$'}) assert r.ok assert files.guess_type_from_filename('example.new') == 'new' - r = as_drone.post('/filetype', json={'_id': 'new', 'regex': '.*\.new2$'}) 
+ r = as_admin.post('/filetype', json={'_id': 'new', 'regex': '\.new2$'}) assert r.ok assert files.guess_type_from_filename('example.new') == None assert files.guess_type_from_filename('example.new2') == 'new' - r = as_drone.delete('/filetype/new') + r = as_admin.delete('/filetype/new') assert r.ok -def test_insert_delete_abort(as_drone): - r = as_drone.delete('/filetype/notexists') - assert r.status_code == 404 \ No newline at end of file + +def test_insert_delete_abort(as_admin): + r = as_admin.delete('/filetype/notexists') + assert r.status_code == 404 + + +def test_invalid_regex(as_admin): + r = as_admin.post('/filetype', json={'_id': 'invalid', 'regex': '\\'}) + assert r.status_code == 400 From c5c6b4216992040639bc657fba20ae6c963f6cef Mon Sep 17 00:00:00 2001 From: David Farkas Date: Thu, 16 Nov 2017 13:24:54 +0100 Subject: [PATCH 12/16] Update abao load fixture script to load the necessary file types too --- tests/integration_tests/abao/load_fixture.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/integration_tests/abao/load_fixture.py b/tests/integration_tests/abao/load_fixture.py index 32c036bdc..1d4113a8a 100644 --- a/tests/integration_tests/abao/load_fixture.py +++ b/tests/integration_tests/abao/load_fixture.py @@ -56,6 +56,12 @@ def main(): r = as_root.post('/groups', json={'_id': 'test-group'}) assert r.ok + # create file types + r = as_root.post('/filetype', json={'_id': 'dicom', 'regex': '\.(dcm|dcm\.zip|dicom\.zip)$'}) + assert r.ok + r = as_root.post('/filetype', json={'_id': 'text', 'regex': '\.txt$'}) + assert r.ok + # upload file to test-project-1/test-session-1/test-acquisition-1 # depends on 'create test-group' r = as_root.post('/upload/label', files={ From 82684f2a5c3b2750cb0f1edb4ac09d16583617df Mon Sep 17 00:00:00 2001 From: Gunnar Schaefer Date: Tue, 5 Dec 2017 22:44:39 -0600 Subject: [PATCH 13/16] Add stylized file types --- bootstrap.sample.json | 62 +++++++++++++++++++++++++------------------ 1 file changed, 36 
insertions(+), 26 deletions(-) diff --git a/bootstrap.sample.json b/bootstrap.sample.json index 86f8d40a5..60350b6c0 100644 --- a/bootstrap.sample.json +++ b/bootstrap.sample.json @@ -27,32 +27,42 @@ } ], "filetypes": [ - { "_id": "bval", "regex": "\\.(bval|bvals)$" }, - { "_id": "bvec", "regex": "\\.(bvec|bvecs)$" }, - { "_id": "dicom", "regex": "\\.(dcm|dcm\\.zip|dicom\\.zip)$" }, - { "_id": "eeg", "regex": "\\.eeg\\.zip$" }, - { "_id": "gephysio", "regex": "\\.gephysio\\.zip$" }, - { "_id": "ismrmrd", "regex": "\\.(h5|hdf5)$" }, - { "_id": "MATLAB data", "regex": "\\.mat$" }, - { "_id": "MGH data", "regex": "\\.(mgh|mgz|mgh\\.gz)$" }, - { "_id": "nifti", "regex": "\\.(nii\\.gz|nii)$" }, - { "_id": "parrec", "regex": "\\.(parrec\\.zip|par-rec\\.zip)$" }, - { "_id": "pfile", "regex": "\\.(7\\.gz|7|7\\.zip)$" }, - { "_id": "PsychoPy data", "regex": "\\.psydat$" }, - { "_id": "qa", "regex": "\\.(qa\\.png|qa\\.json|qa\\.html)$" }, + { "_id": "BVAL", "regex": "\\.(bval|bvals)$" }, + { "_id": "BVEC", "regex": "\\.(bvec|bvecs)$" }, + { "_id": "DICOM", "regex": "\\.(dcm|dcm\\.zip|dicom\\.zip)$" }, + { "_id": "EFile", "regex": "^E.*P.*\\.7$" }, + { "_id": "GE Physio", "regex": "\\.gephysio\\.zip$" }, + { "_id": "MGH Data", "regex": "\\.(mgh|mgz|mgh\\.gz)$" }, + { "_id": "NIfTI", "regex": "\\.(nii\\.gz|nii)$" }, + { "_id": "PAR/REC", "regex": "\\.(parrec\\.zip|par-rec\\.zip)$" }, + { "_id": "PFile Header", "regex": "\\.(7\\.hdr)$" }, + { "_id": "PFile", "regex": "\\.(7\\.gz|7|7\\.zip)$" }, - { "_id": "archive", "regex": "\\.(zip|tbz2|tar\\.gz|tbz|tar\\.bz2|tgz|tar|txz|tar\\.xz)$" }, - { "_id": "document", "regex": "\\.(docx|doc)$" }, - { "_id": "image", "regex": "\\.(jpg|tif|jpeg|gif|bmp|png|tiff)$" }, - { "_id": "markup", "regex": "\\.(html|htm|xml)$" }, - { "_id": "markdown", "regex": "\\.(md|markdown)$" }, - { "_id": "log", "regex": "\\.log$" }, - { "_id": "pdf", "regex": "\\.pdf$" }, - { "_id": "presentation", "regex": "\\.(ppt|pptx)$" }, - { "_id": "source code", 
"regex": "\\.(c|py|cpp|js|m|json|java|php|css|toml|yaml|yml)$" }, - { "_id": "spreadsheet", "regex": "\\.(xls|xlsx)$" }, - { "_id": "tabular data", "regex": "\\.(csv\\.gz|csv)$" }, - { "_id": "text", "regex": "\\.txt$" }, - { "_id": "video", "regex": "\\.(mpeg|mpg|mov|mp4|m4v|mts)$" } + { "_id": "EEG", "regex": "\\.eeg\\.zip$" }, + + { "_id": "QC", "regex": "\\.(q[ac]\\.png|q[ac]\\.json|q[ac]\\.html)$" }, + + { "_id": "MATLAB Data", "regex": "\\.mat$" }, + { "_id": "PsychoPy Data", "regex": "\\.psydat$" }, + + { "_id": "Archive", "regex": "\\.(zip|tbz2|tar\\.gz|tbz|tar\\.bz2|tgz|tar|txz|tar\\.xz)$" }, + { "_id": "Audio", "regex": "\\.(mp3|wav|wave)$" }, + { "_id": "Document", "regex": "\\.(docx|doc)$" }, + { "_id": "HDF5", "regex": "\\.(h5|hdf5)$" }, + { "_id": "Image", "regex": "\\.(jpg|tif|jpeg|gif|bmp|png|tiff)$" }, + { "_id": "JSON", "regex": "\\.json$" }, + { "_id": "Log", "regex": "\\.log$" }, + { "_id": "Markdown", "regex": "\\.(md|markdown)$" }, + { "_id": "Markup", "regex": "\\.(html|htm)$" }, + { "_id": "PDF", "regex": "\\.pdf$" }, + { "_id": "Plain Text", "regex": "\\.txt$" }, + { "_id": "Presentation", "regex": "\\.(ppt|pptx)$" }, + { "_id": "Source code", "regex": "\\.(c|py|cpp|js|m|json|java|php|css|toml|yaml|yml)$" }, + { "_id": "Spreadsheet", "regex": "\\.(xls|xlsx)$" }, + { "_id": "TOML", "regex": "\\.toml$" }, + { "_id": "Tabular data", "regex": "\\.([ct]sv\\.gz|[ct]sv)$" }, + { "_id": "Video", "regex": "\\.(mpeg|mpg|mov|mp4|m4v|mts)$" }, + { "_id": "XML", "regex": "\\.xml$" }, + { "_id": "YAML", "regex": "\\.(yaml|yml)$" } ] } From d79658af70c65ddd31841a21e360451660601467 Mon Sep 17 00:00:00 2001 From: Gunnar Schaefer Date: Wed, 6 Dec 2017 11:12:01 -0600 Subject: [PATCH 14/16] Refine file types --- bootstrap.sample.json | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/bootstrap.sample.json b/bootstrap.sample.json index 60350b6c0..a945dfa9f 100644 --- a/bootstrap.sample.json +++ 
b/bootstrap.sample.json @@ -45,24 +45,32 @@ { "_id": "MATLAB Data", "regex": "\\.mat$" }, { "_id": "PsychoPy Data", "regex": "\\.psydat$" }, + { "_id": "C/C++", "regex": "\\.(c|cpp)$" }, + { "_id": "CSS", "regex": "\\.css$" }, + { "_id": "HDF5", "regex": "\\.(h5|hdf5)$" }, + { "_id": "HTML", "regex": "\\.(html|htm)$" }, + { "_id": "JSON", "regex": "\\.json$" }, + { "_id": "Java", "regex": "\\.java$" }, + { "_id": "JavaScript", "regex": "\\.js$" }, + { "_id": "Jupyter", "regex": "\\.ipynb$" }, + { "_id": "MATLAB", "regex": "\\.(m|mex|mlx)$" }, + { "_id": "Markdown", "regex": "\\.(md|markdown)$" }, + { "_id": "PHP", "regex": "\\.php$" }, + { "_id": "Plain Text", "regex": "\\.txt$" }, + { "_id": "Python", "regex": "\\.py$" }, + { "_id": "TOML", "regex": "\\.toml$" }, + { "_id": "XML", "regex": "\\.xml$" }, + { "_id": "YAML", "regex": "\\.(yaml|yml)$" }, + { "_id": "Archive", "regex": "\\.(zip|tbz2|tar\\.gz|tbz|tar\\.bz2|tgz|tar|txz|tar\\.xz)$" }, { "_id": "Audio", "regex": "\\.(mp3|wav|wave)$" }, { "_id": "Document", "regex": "\\.(docx|doc)$" }, - { "_id": "HDF5", "regex": "\\.(h5|hdf5)$" }, { "_id": "Image", "regex": "\\.(jpg|tif|jpeg|gif|bmp|png|tiff)$" }, - { "_id": "JSON", "regex": "\\.json$" }, { "_id": "Log", "regex": "\\.log$" }, - { "_id": "Markdown", "regex": "\\.(md|markdown)$" }, - { "_id": "Markup", "regex": "\\.(html|htm)$" }, { "_id": "PDF", "regex": "\\.pdf$" }, - { "_id": "Plain Text", "regex": "\\.txt$" }, { "_id": "Presentation", "regex": "\\.(ppt|pptx)$" }, - { "_id": "Source code", "regex": "\\.(c|py|cpp|js|m|json|java|php|css|toml|yaml|yml)$" }, { "_id": "Spreadsheet", "regex": "\\.(xls|xlsx)$" }, - { "_id": "TOML", "regex": "\\.toml$" }, { "_id": "Tabular data", "regex": "\\.([ct]sv\\.gz|[ct]sv)$" }, - { "_id": "Video", "regex": "\\.(mpeg|mpg|mov|mp4|m4v|mts)$" }, - { "_id": "XML", "regex": "\\.xml$" }, - { "_id": "YAML", "regex": "\\.(yaml|yml)$" } + { "_id": "Video", "regex": "\\.(mpeg|mpg|mov|mp4|m4v|mts)$" } ] } From 
3af4e5854bb3bd7889ac28314cb768bf08f68014 Mon Sep 17 00:00:00 2001 From: Megan Henning Date: Wed, 13 Dec 2017 16:21:16 -0600 Subject: [PATCH 15/16] Add db upgrade for filetypes --- bin/database.py | 97 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 96 insertions(+), 1 deletion(-) diff --git a/bin/database.py b/bin/database.py index 57672b556..c6a79271c 100755 --- a/bin/database.py +++ b/bin/database.py @@ -15,6 +15,7 @@ from api import config from api import util +from api import files as files_module from api.dao import containerutil from api.dao.containerstorage import ProjectStorage from api.jobs.jobs import Job @@ -22,7 +23,7 @@ from api.types import Origin from api.jobs import batch -CURRENT_DATABASE_VERSION = 40 # An int that is bumped when a new schema change is made +CURRENT_DATABASE_VERSION = 41 # An int that is bumped when a new schema change is made def get_db_version(): @@ -1301,6 +1302,100 @@ def upgrade_to_40(): cursor = config.db.acquisitions.find({'timestamp':{'$type':'string'}}) process_cursor(cursor, upgrade_to_40_closure) + +def upgrade_to_41_closure(cont, context): + """ + Re-type files based on new filetypes stored in mongo collection + """ + + # passing filetypes rather than using util function to speed upgrade and skip db lookup + filetypes = context['filetypes'] + cont_name = context['cont_name'] + + files = cont.get('files', []) + + for f in files: + + new_type = None + m_length = 0 + + for document in filetypes: + m = re.search(document['regex'], f['name']) + if m and m_length < len(m.group(0)): + new_type, m_length = document['_id'], len(m.group(0)) + if new_type is not None: + f['type'] = new_type + + config.db[cont_name].update_one({'_id': cont['_id']}, {'$set': {'files': files}}) + + return True + + +def upgrade_to_41(): + """ + Load initial filetypes into mongo, retype existing files + """ + + # It was decided an initial load of filetypes here for existing users was + # easiest way to move those users forward.
Future changes a site's + # filetypes will happen through the API endpoints as expected + filetypes = [ + { "_id": "BVAL", "regex": "\\.(bval|bvals)$" }, + { "_id": "BVEC", "regex": "\\.(bvec|bvecs)$" }, + { "_id": "DICOM", "regex": "\\.(dcm|dcm\\.zip|dicom\\.zip)$" }, + { "_id": "EFile", "regex": "^E.*P.*\\.7$" }, + { "_id": "GE Physio", "regex": "\\.gephysio\\.zip$" }, + { "_id": "MGH Data", "regex": "\\.(mgh|mgz|mgh\\.gz)$" }, + { "_id": "NIfTI", "regex": "\\.(nii\\.gz|nii)$" }, + { "_id": "PAR/REC", "regex": "\\.(parrec\\.zip|par-rec\\.zip)$" }, + { "_id": "PFile Header", "regex": "\\.(7\\.hdr)$" }, + { "_id": "PFile", "regex": "\\.(7\\.gz|7|7\\.zip)$" }, + + { "_id": "EEG", "regex": "\\.eeg\\.zip$" }, + + { "_id": "QC", "regex": "\\.(q[ac]\\.png|q[ac]\\.json|q[ac]\\.html)$" }, + + { "_id": "MATLAB Data", "regex": "\\.mat$" }, + { "_id": "PsychoPy Data", "regex": "\\.psydat$" }, + + { "_id": "C/C++", "regex": "\\.(c|cpp)$" }, + { "_id": "CSS", "regex": "\\.css$" }, + { "_id": "HDF5", "regex": "\\.(h5|hdf5)$" }, + { "_id": "HTML", "regex": "\\.(html|htm)$" }, + { "_id": "JSON", "regex": "\\.json$" }, + { "_id": "Java", "regex": "\\.java$" }, + { "_id": "JavaScript", "regex": "\\.js$" }, + { "_id": "Jupyter", "regex": "\\.ipynb$" }, + { "_id": "MATLAB", "regex": "\\.(m|mex|mlx)$" }, + { "_id": "Markdown", "regex": "\\.(md|markdown)$" }, + { "_id": "PHP", "regex": "\\.php$" }, + { "_id": "Plain Text", "regex": "\\.txt$" }, + { "_id": "Python", "regex": "\\.py$" }, + { "_id": "TOML", "regex": "\\.toml$" }, + { "_id": "XML", "regex": "\\.xml$" }, + { "_id": "YAML", "regex": "\\.(yaml|yml)$" }, + + { "_id": "Archive", "regex": "\\.(zip|tbz2|tar\\.gz|tbz|tar\\.bz2|tgz|tar|txz|tar\\.xz)$" }, + { "_id": "Audio", "regex": "\\.(mp3|wav|wave)$" }, + { "_id": "Document", "regex": "\\.(docx|doc)$" }, + { "_id": "Image", "regex": "\\.(jpg|tif|jpeg|gif|bmp|png|tiff)$" }, + { "_id": "Log", "regex": "\\.log$" }, + { "_id": "PDF", "regex": "\\.pdf$" }, + { "_id": "Presentation", 
"regex": "\\.(ppt|pptx)$" }, + { "_id": "Spreadsheet", "regex": "\\.(xls|xlsx)$" }, + { "_id": "Tabular data", "regex": "\\.([ct]sv\\.gz|[ct]sv)$" }, + { "_id": "Video", "regex": "\\.(mpeg|mpg|mov|mp4|m4v|mts)$" } + ] + + for ft in filetypes: + config.db.filetypes.replace_one({'_id': ft['_id']}, ft, upsert=True) + + for cont_name in ['projects', 'sessions', 'acquisitions', 'analyses', 'collections']: + + # Find all containers that have at least one file + cursor = config.db[cont_name].find({'files': { '$gt': [] }}) + process_cursor(cursor, upgrade_to_41_closure, context={'filetypes': filetypes, 'cont_name': cont_name}) + ### ### BEGIN RESERVED UPGRADE SECTION ### From 0cc3c282524e4aecf15c6bd67c2f67d1131fdf54 Mon Sep 17 00:00:00 2001 From: Gunnar Schaefer Date: Thu, 14 Dec 2017 10:57:14 -0800 Subject: [PATCH 16/16] Add one more case change --- bin/database.py | 2 +- bootstrap.sample.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/database.py b/bin/database.py index c6a79271c..9dcd148c0 100755 --- a/bin/database.py +++ b/bin/database.py @@ -1383,7 +1383,7 @@ def upgrade_to_41(): { "_id": "PDF", "regex": "\\.pdf$" }, { "_id": "Presentation", "regex": "\\.(ppt|pptx)$" }, { "_id": "Spreadsheet", "regex": "\\.(xls|xlsx)$" }, - { "_id": "Tabular data", "regex": "\\.([ct]sv\\.gz|[ct]sv)$" }, + { "_id": "Tabular Data", "regex": "\\.([ct]sv\\.gz|[ct]sv)$" }, { "_id": "Video", "regex": "\\.(mpeg|mpg|mov|mp4|m4v|mts)$" } ] diff --git a/bootstrap.sample.json b/bootstrap.sample.json index a945dfa9f..85709b432 100644 --- a/bootstrap.sample.json +++ b/bootstrap.sample.json @@ -70,7 +70,7 @@ { "_id": "PDF", "regex": "\\.pdf$" }, { "_id": "Presentation", "regex": "\\.(ppt|pptx)$" }, { "_id": "Spreadsheet", "regex": "\\.(xls|xlsx)$" }, - { "_id": "Tabular data", "regex": "\\.([ct]sv\\.gz|[ct]sv)$" }, + { "_id": "Tabular Data", "regex": "\\.([ct]sv\\.gz|[ct]sv)$" }, { "_id": "Video", "regex": "\\.(mpeg|mpg|mov|mp4|m4v|mts)$" } ] }