From 0e68c718081a290129736a54cb25eecc7c6c5d4f Mon Sep 17 00:00:00 2001 From: David Farkas Date: Fri, 20 Oct 2017 17:36:38 +0200 Subject: [PATCH 01/16] Add regexs to indentify file types, store these in mongodb, load them at app startup --- api/config.py | 13 ++++++++++ api/files.py | 17 ++++--------- api/filetypes.json | 56 +++++++++++++++++++++--------------------- api/filetypes_old.json | 29 ++++++++++++++++++++++ 4 files changed, 75 insertions(+), 40 deletions(-) create mode 100644 api/filetypes_old.json diff --git a/api/config.py b/api/config.py index 8bc5375bc..e30dfba27 100644 --- a/api/config.py +++ b/api/config.py @@ -226,6 +226,13 @@ def create_or_recreate_ttl_index(coll_name, index_name, ttl): def initialize_db(): log.info('Initializing database, creating indexes') + + try: + db.singletons.insert({'_id': 'init_db'}) + except pymongo.errors.DuplicateKeyError: + log.info('Database is already initialized') + return + # TODO review all indexes db.users.create_index('api_key.key') db.projects.create_index([('gid', 1), ('name', 1)]) @@ -253,6 +260,12 @@ def initialize_db(): now = datetime.datetime.utcnow() db.groups.update_one({'_id': 'unknown'}, {'$setOnInsert': { 'created': now, 'modified': now, 'label': 'Unknown', 'permissions': []}}, upsert=True) + with open(os.path.join(os.path.dirname(__file__), 'filetypes.json')) as fd: + filetypes = json.load(fd) + db.filetypes.insert_many(filetypes) + + log.info('Initializing database, creating indexes ....DONE') + def get_config(): global __last_update, __config, __config_persisted #pylint: disable=global-statement now = datetime.datetime.utcnow() diff --git a/api/files.py b/api/files.py index 0585b5993..169266648 100644 --- a/api/files.py +++ b/api/files.py @@ -150,18 +150,11 @@ def get_hash(self): # File extension --> scitran file type detection hueristics. # Listed in precendence order. 
-with open(os.path.join(os.path.dirname(__file__), 'filetypes.json')) as fd: - TYPE_MAP = json.load(fd) - -KNOWN_FILETYPES = {ext: filetype for filetype, extensions in TYPE_MAP.iteritems() for ext in extensions} def guess_type_from_filename(filename): - particles = filename.split('.')[1:] - extentions = ['.' + '.'.join(particles[i:]) for i in range(len(particles))] - for ext in extentions: - filetype = KNOWN_FILETYPES.get(ext.lower()) - if filetype: - break - else: - filetype = None + filetype = None + result = config.db.filetypes.find_one({'$where': 'function() {return RegExp(this.regex).test(\'%s\');}' % filename}) + if result: + filetype = result['_id'] + return filetype diff --git a/api/filetypes.json b/api/filetypes.json index aeef59564..f8f003c0e 100644 --- a/api/filetypes.json +++ b/api/filetypes.json @@ -1,29 +1,29 @@ -{ - "bval": [ ".bval", ".bvals" ], - "bvec": [ ".bvec", ".bvecs" ], - "dicom": [ ".dcm", ".dcm.zip", ".dicom.zip" ], - "eeg": [ ".eeg.zip" ], - "gephysio": [ ".gephysio.zip" ], - "ismrmrd": [ ".h5", ".hdf5" ], - "MATLAB data": [ ".mat" ], - "MGH data": [ ".mgh", ".mgz", ".mgh.gz" ], - "nifti": [ ".nii.gz", ".nii" ], - "parrec": [ ".parrec.zip", ".par-rec.zip" ], - "pfile": [ ".7.gz", ".7", ".7.zip" ], - "PsychoPy data": [ ".psydat" ], - "qa": [ ".qa.png", ".qa.json", ".qa.html" ], +[ + { "_id": "bval", "regex": ".*\\.(bval$|bvals$)" }, + { "_id": "bvec", "regex": ".*\\.(bvec$|bvecs$)" }, + { "_id": "dicom", "regex": ".*\\.(dcm$|dcm\\.zip|dicom\\.zip$)" }, + { "_id": "eeg", "regex": ".*\\.eeg\\.zip$" }, + { "_id": "gephysio", "regex": ".*\\.gephysio\\.zip$" }, + { "_id": "ismrmrd", "regex": ".*\\.(h5$|hdf5$)" }, + { "_id": "MATLAB data", "regex": ".*\\.mat$" }, + { "_id": "MGH data", "regex": ".*\\.(mgh$|mgz$|mgh\\.gz$)" }, + { "_id": "nifti", "regex": ".*\\.(nii\\.gz$|nii$)" }, + { "_id": "parrec", "regex": ".*\\.(parrec\\.zip$|par-rec\\.zip$)" }, + { "_id": "pfile", "regex": ".*\\.(7\\.gz$|7$|7\\.zip)" }, + { "_id": "PsychoPy data", "regex": 
".*\\.psydat$" }, + { "_id": "qa", "regex": ".*\\.(qa\\.png$|qa\\.json|qa\\.html$)" }, - "archive": [ ".zip", ".tbz2", ".tar.gz", ".tbz", ".tar.bz2", ".tgz", ".tar", ".txz", ".tar.xz" ], - "document": [ ".docx", ".doc" ], - "image": [ ".jpg", ".tif", ".jpeg", ".gif", ".bmp", ".png", ".tiff" ], - "markup": [ ".html", ".htm", ".xml" ], - "markdown": [ ".md", ".markdown" ], - "log": [ ".log" ], - "pdf": [ ".pdf" ], - "presentation": [ ".ppt", ".pptx" ], - "source code": [ ".c", ".py", ".cpp", ".js", ".m", ".json", ".java", ".php", ".css", ".toml", ".yaml", ".yml" ], - "spreadsheet": [ ".xls", ".xlsx" ], - "tabular data": [ ".csv.gz", ".csv" ], - "text": [ ".txt" ], - "video": [ ".mpeg", ".mpg", ".mov", ".mp4", ".m4v", ".mts" ] -} + { "_id": "archive", "regex": ".*\\.(zip$|tbz2$|tar\\.gz$|tbz$|tar\\.bz2$|tgz$|tar$|txz$|tar\\.xz$)" }, + { "_id": "document", "regex": ".*\\.(docx$|doc$)" }, + { "_id": "image", "regex": ".*\\.(jpg$|tif$|jpeg$|gif$|bmp$|png$|tiff$)" }, + { "_id": "markup", "regex": ".*\\.(html$|htm$|xml$)" }, + { "_id": "markdown", "regex": ".*\\.(md$|markdown$)" }, + { "_id": "log", "regex": ".*\\.log$" }, + { "_id": "pdf", "regex": ".*\\.pdf$" }, + { "_id": "presentation", "regex": ".*\\.(ppt$|pptx$)" }, + { "_id": "source code", "regex": ".*\\.(c$|py$|cpp$|js$|m$|json$|java$|php$|css$|toml$|yaml$|yml$)" }, + { "_id": "spreadsheet", "regex": ".*\\.(xls$|xlsx$)" }, + { "_id": "tabular data", "regex": ".*\\.(csv\\.gz$|csv$)" }, + { "_id": "text", "regex": ".*\\.txt$" }, + { "_id": "video", "regex": ".*\\.(mpeg$|mpg$|mov$|mp4$|m4v$|mts$)" } +] \ No newline at end of file diff --git a/api/filetypes_old.json b/api/filetypes_old.json new file mode 100644 index 000000000..aeef59564 --- /dev/null +++ b/api/filetypes_old.json @@ -0,0 +1,29 @@ +{ + "bval": [ ".bval", ".bvals" ], + "bvec": [ ".bvec", ".bvecs" ], + "dicom": [ ".dcm", ".dcm.zip", ".dicom.zip" ], + "eeg": [ ".eeg.zip" ], + "gephysio": [ ".gephysio.zip" ], + "ismrmrd": [ ".h5", ".hdf5" ], + "MATLAB 
data": [ ".mat" ], + "MGH data": [ ".mgh", ".mgz", ".mgh.gz" ], + "nifti": [ ".nii.gz", ".nii" ], + "parrec": [ ".parrec.zip", ".par-rec.zip" ], + "pfile": [ ".7.gz", ".7", ".7.zip" ], + "PsychoPy data": [ ".psydat" ], + "qa": [ ".qa.png", ".qa.json", ".qa.html" ], + + "archive": [ ".zip", ".tbz2", ".tar.gz", ".tbz", ".tar.bz2", ".tgz", ".tar", ".txz", ".tar.xz" ], + "document": [ ".docx", ".doc" ], + "image": [ ".jpg", ".tif", ".jpeg", ".gif", ".bmp", ".png", ".tiff" ], + "markup": [ ".html", ".htm", ".xml" ], + "markdown": [ ".md", ".markdown" ], + "log": [ ".log" ], + "pdf": [ ".pdf" ], + "presentation": [ ".ppt", ".pptx" ], + "source code": [ ".c", ".py", ".cpp", ".js", ".m", ".json", ".java", ".php", ".css", ".toml", ".yaml", ".yml" ], + "spreadsheet": [ ".xls", ".xlsx" ], + "tabular data": [ ".csv.gz", ".csv" ], + "text": [ ".txt" ], + "video": [ ".mpeg", ".mpg", ".mov", ".mp4", ".m4v", ".mts" ] +} From f2af51aa0da1d9fbb429c49a8e9035ea30f586b1 Mon Sep 17 00:00:00 2001 From: David Farkas Date: Fri, 20 Oct 2017 17:44:09 +0200 Subject: [PATCH 02/16] Remove unused import --- api/files.py | 1 - 1 file changed, 1 deletion(-) diff --git a/api/files.py b/api/files.py index 169266648..89f871009 100644 --- a/api/files.py +++ b/api/files.py @@ -1,6 +1,5 @@ import os import cgi -import json import shutil import hashlib import collections From 5cdd129424c598f0ed23cf5524eb877334af142b Mon Sep 17 00:00:00 2001 From: David Farkas Date: Wed, 25 Oct 2017 21:11:33 +0200 Subject: [PATCH 03/16] Add file type handler to manage file type through the api (add/replace, delete) --- api/api.py | 12 ++++-- api/config.py | 1 + api/handlers/filetypehandler.py | 37 +++++++++++++++++++ raml/schemas/mongo/filetype.json | 17 +++++++++ .../python/test_files.py | 2 - 5 files changed, 64 insertions(+), 5 deletions(-) create mode 100644 api/handlers/filetypehandler.py create mode 100644 raml/schemas/mongo/filetype.json rename tests/{unit_tests => integration_tests}/python/test_files.py (98%) diff 
--git a/api/api.py b/api/api.py index f0506ad45..e71605ad6 100644 --- a/api/api.py +++ b/api/api.py @@ -7,6 +7,7 @@ from .handlers.containerhandler import ContainerHandler from .handlers.dataexplorerhandler import DataExplorerHandler from .handlers.devicehandler import DeviceHandler +from .handlers.filetypehandler import FileType from .handlers.grouphandler import GroupHandler from .handlers.listhandler import FileListHandler, NotesListHandler, PermissionsListHandler, TagsListHandler from .handlers.refererhandler import AnalysesHandler @@ -43,6 +44,9 @@ # Filename 'fname': '[^/]+', + # File type name + 'ftypename': '[^/]+', + # Note ID 'nid': '[0-9a-f]{24}', @@ -77,9 +81,11 @@ def prefix(path, routes): # System configuration - route('/config', Config, m=['GET']), - route('/config.js', Config, h='get_js', m=['GET']), - route('/version', Version, m=['GET']), + route('/config', Config, m=['GET']), + route('/config.js', Config, h='get_js', m=['GET']), + route('/version', Version, m=['GET']), + route('/filetype', FileType, m=['GET', 'POST']), + route('/filetype/<_id:{ftypename}>', FileType, m=['DELETE']), # General-purpose upload & download diff --git a/api/config.py b/api/config.py index e30dfba27..b471758ab 100644 --- a/api/config.py +++ b/api/config.py @@ -139,6 +139,7 @@ def apply_env_variables(config): 'collection.json', 'container.json', 'file.json', + 'filetype.json', 'group.json', 'note.json', 'permission.json', diff --git a/api/handlers/filetypehandler.py b/api/handlers/filetypehandler.py new file mode 100644 index 000000000..7c56cc0b9 --- /dev/null +++ b/api/handlers/filetypehandler.py @@ -0,0 +1,37 @@ +from ..web import base +from .. import config +from .. 
import validators +from ..auth import userauth +from ..dao import noop + +class FileType(base.RequestHandler): + + def get(self): + """Get file types""" + resp = config.db.filetypes.find() + if resp != None: + return resp + else: + self.abort(404, "Version document does not exist") + + def post(self): + permchecker = userauth.default(self) + payload = self.request.json_body + mongo_schema_uri = validators.schema_uri('mongo', 'filetype.json') + mongo_validator = validators.decorator_from_schema_path(mongo_schema_uri) + mongo_validator(permchecker(noop))('PUT', payload=payload) + result = config.db.filetypes.replace_one({'_id': payload['_id']}, payload, upsert=True) + if result.acknowledged: + _id = result.upserted_id if result.upserted_id else payload['_id'] + return {'_id': _id} + else: + self.abort(404, 'File type {} not updated'.format(payload['_id'])) + + def delete(self, _id): + permchecker = userauth.default(self) + permchecker(noop)('DELETE', _id) + result = config.db.filetypes.delete_one({'_id': _id}) + if result.acknowledged: + return {'deleted': result.deleted_count} + else: + self.abort(404, 'File type {} not removed'.format(_id)) \ No newline at end of file diff --git a/raml/schemas/mongo/filetype.json b/raml/schemas/mongo/filetype.json new file mode 100644 index 000000000..258d9d926 --- /dev/null +++ b/raml/schemas/mongo/filetype.json @@ -0,0 +1,17 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "title": "File type", + "type": "object", + "properties": { + "_id": { + "title": "ID", + "type": "string" + }, + "regex": { + "title": "Regular Expression", + "type": "string" + } + }, + "additionalProperties": false, + "required":["_id", "regex"] +} diff --git a/tests/unit_tests/python/test_files.py b/tests/integration_tests/python/test_files.py similarity index 98% rename from tests/unit_tests/python/test_files.py rename to tests/integration_tests/python/test_files.py index 01b977787..b0a2409d5 100644 --- 
a/tests/unit_tests/python/test_files.py +++ b/tests/integration_tests/python/test_files.py @@ -1,5 +1,3 @@ - -import pytest from api import files From 466ca1b75c95e7938c92f6901bd0bc9f3e9200b1 Mon Sep 17 00:00:00 2001 From: David Farkas Date: Wed, 25 Oct 2017 22:21:05 +0200 Subject: [PATCH 04/16] New integration test for the file types handlers --- tests/integration_tests/python/test_files.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/integration_tests/python/test_files.py b/tests/integration_tests/python/test_files.py index b0a2409d5..ee005c97e 100644 --- a/tests/integration_tests/python/test_files.py +++ b/tests/integration_tests/python/test_files.py @@ -21,3 +21,10 @@ def test_qa(): def test_unknown(): assert files.guess_type_from_filename('example.unknown') == None + +def test_insert_delete(as_drone): + as_drone.post('/filetype', json={'_id': 'new', 'regex': '.*\.new$'}) + assert files.guess_type_from_filename('example.new') == 'new' + as_drone.post('/filetype', json={'_id': 'new', 'regex': '.*\.new2$'}) + assert files.guess_type_from_filename('example.new') == None + assert files.guess_type_from_filename('example.new2') == 'new' \ No newline at end of file From 0e1ffa0f6ed7b05ec46e230bc7b948d10d402d82 Mon Sep 17 00:00:00 2001 From: David Farkas Date: Thu, 26 Oct 2017 00:24:47 +0200 Subject: [PATCH 05/16] Add new try_update_one db util method, and fix the dup key exists error in initialize_db method --- api/config.py | 16 +++++++--------- api/dao/dbutil.py | 16 ++++++++++++++++ api/filetypes_old.json | 29 ----------------------------- 3 files changed, 23 insertions(+), 38 deletions(-) delete mode 100644 api/filetypes_old.json diff --git a/api/config.py b/api/config.py index b471758ab..ec94ead05 100644 --- a/api/config.py +++ b/api/config.py @@ -8,7 +8,7 @@ import elasticsearch from . 
import util -from .dao.dbutil import try_replace_one +from .dao.dbutil import try_replace_one, try_update_one logging.basicConfig( format='%(asctime)s %(name)16.16s %(filename)24.24s %(lineno)5d:%(levelname)4.4s %(message)s', @@ -228,12 +228,6 @@ def create_or_recreate_ttl_index(coll_name, index_name, ttl): def initialize_db(): log.info('Initializing database, creating indexes') - try: - db.singletons.insert({'_id': 'init_db'}) - except pymongo.errors.DuplicateKeyError: - log.info('Database is already initialized') - return - # TODO review all indexes db.users.create_index('api_key.key') db.projects.create_index([('gid', 1), ('name', 1)]) @@ -259,11 +253,15 @@ def initialize_db(): create_or_recreate_ttl_index('downloads', 'timestamp', 60) now = datetime.datetime.utcnow() - db.groups.update_one({'_id': 'unknown'}, {'$setOnInsert': { 'created': now, 'modified': now, 'label': 'Unknown', 'permissions': []}}, upsert=True) + try_update_one(db, + 'singletons', {'_id': 'unknown'}, + {'$setOnInsert': {'created': now, 'modified': now, 'label': 'Unknown', 'permissions': []}}, + upsert=True) with open(os.path.join(os.path.dirname(__file__), 'filetypes.json')) as fd: filetypes = json.load(fd) - db.filetypes.insert_many(filetypes) + for filetype in filetypes: + try_replace_one(db, 'filetypes', {'_id': filetype['_id']}, filetype, upsert=True) log.info('Initializing database, creating indexes ....DONE') diff --git a/api/dao/dbutil.py b/api/dao/dbutil.py index 9d3db91c0..266dc3e4f 100644 --- a/api/dao/dbutil.py +++ b/api/dao/dbutil.py @@ -4,6 +4,7 @@ from pymongo.errors import DuplicateKeyError from ..web.errors import APIStorageException + def try_replace_one(db, coll_name, query, update, upsert=False): """ Mongo does not see replace w/ upsert as an atomic action: @@ -39,3 +40,18 @@ def fault_tolerant_replace_one(db, coll_name, query, update, upsert=False): time.sleep(random.uniform(0.01,0.05)) raise APIStorageException('Unable to replace object.') + + +def try_update_one(db, 
coll_name, query, update, upsert=False): + """ + Mongo does not see replace w/ upsert as an atomic action: + https://jira.mongodb.org/browse/SERVER-14322 + + This function will try a replace_one operation, returning the result and if the operation succeeded. + """ + try: + result = db[coll_name].update_one(query, update, upsert=upsert) + except DuplicateKeyError: + return result, False + else: + return result, True diff --git a/api/filetypes_old.json b/api/filetypes_old.json deleted file mode 100644 index aeef59564..000000000 --- a/api/filetypes_old.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "bval": [ ".bval", ".bvals" ], - "bvec": [ ".bvec", ".bvecs" ], - "dicom": [ ".dcm", ".dcm.zip", ".dicom.zip" ], - "eeg": [ ".eeg.zip" ], - "gephysio": [ ".gephysio.zip" ], - "ismrmrd": [ ".h5", ".hdf5" ], - "MATLAB data": [ ".mat" ], - "MGH data": [ ".mgh", ".mgz", ".mgh.gz" ], - "nifti": [ ".nii.gz", ".nii" ], - "parrec": [ ".parrec.zip", ".par-rec.zip" ], - "pfile": [ ".7.gz", ".7", ".7.zip" ], - "PsychoPy data": [ ".psydat" ], - "qa": [ ".qa.png", ".qa.json", ".qa.html" ], - - "archive": [ ".zip", ".tbz2", ".tar.gz", ".tbz", ".tar.bz2", ".tgz", ".tar", ".txz", ".tar.xz" ], - "document": [ ".docx", ".doc" ], - "image": [ ".jpg", ".tif", ".jpeg", ".gif", ".bmp", ".png", ".tiff" ], - "markup": [ ".html", ".htm", ".xml" ], - "markdown": [ ".md", ".markdown" ], - "log": [ ".log" ], - "pdf": [ ".pdf" ], - "presentation": [ ".ppt", ".pptx" ], - "source code": [ ".c", ".py", ".cpp", ".js", ".m", ".json", ".java", ".php", ".css", ".toml", ".yaml", ".yml" ], - "spreadsheet": [ ".xls", ".xlsx" ], - "tabular data": [ ".csv.gz", ".csv" ], - "text": [ ".txt" ], - "video": [ ".mpeg", ".mpg", ".mov", ".mp4", ".m4v", ".mts" ] -} From 206d69e93a1a52163435f48730c21b85d55cbd72 Mon Sep 17 00:00:00 2001 From: David Farkas Date: Thu, 26 Oct 2017 02:37:08 +0200 Subject: [PATCH 06/16] Fix typo in config.py --- api/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/api/config.py b/api/config.py index ec94ead05..225e3d4e4 100644 --- a/api/config.py +++ b/api/config.py @@ -254,7 +254,7 @@ def initialize_db(): now = datetime.datetime.utcnow() try_update_one(db, - 'singletons', {'_id': 'unknown'}, + 'groups', {'_id': 'unknown'}, {'$setOnInsert': {'created': now, 'modified': now, 'label': 'Unknown', 'permissions': []}}, upsert=True) From b043b861d8d36f6f85a17240f4d0bbe80c5d5ad7 Mon Sep 17 00:00:00 2001 From: David Farkas Date: Thu, 26 Oct 2017 03:52:01 +0200 Subject: [PATCH 07/16] Add some documentation, and increase code coverage --- api/handlers/filetypehandler.py | 13 +++++++------ tests/integration_tests/python/test_files.py | 19 ++++++++++++++----- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/api/handlers/filetypehandler.py b/api/handlers/filetypehandler.py index 7c56cc0b9..95736afea 100644 --- a/api/handlers/filetypehandler.py +++ b/api/handlers/filetypehandler.py @@ -8,13 +8,13 @@ class FileType(base.RequestHandler): def get(self): """Get file types""" - resp = config.db.filetypes.find() - if resp != None: - return resp - else: - self.abort(404, "Version document does not exist") + return config.db.filetypes.find() def post(self): + """ + Insert or replace a file type. Required fields: '_id' and 'regex' where the '_id' is the unique name of + the file type and 'regex' is a regular expression which is used to figure out the file type from the file name. 
+ """ permchecker = userauth.default(self) payload = self.request.json_body mongo_schema_uri = validators.schema_uri('mongo', 'filetype.json') @@ -28,10 +28,11 @@ def post(self): self.abort(404, 'File type {} not updated'.format(payload['_id'])) def delete(self, _id): + """Delete a file type""" permchecker = userauth.default(self) permchecker(noop)('DELETE', _id) result = config.db.filetypes.delete_one({'_id': _id}) - if result.acknowledged: + if result.deleted_count: return {'deleted': result.deleted_count} else: self.abort(404, 'File type {} not removed'.format(_id)) \ No newline at end of file diff --git a/tests/integration_tests/python/test_files.py b/tests/integration_tests/python/test_files.py index ee005c97e..f1dd94b31 100644 --- a/tests/integration_tests/python/test_files.py +++ b/tests/integration_tests/python/test_files.py @@ -1,6 +1,5 @@ from api import files - def test_extension(): assert files.guess_type_from_filename('example.pdf') == 'pdf' @@ -22,9 +21,19 @@ def test_qa(): def test_unknown(): assert files.guess_type_from_filename('example.unknown') == None -def test_insert_delete(as_drone): - as_drone.post('/filetype', json={'_id': 'new', 'regex': '.*\.new$'}) +def test_get_insert_delete(as_drone): + r = as_drone.get('/filetype') + assert r.ok + r = as_drone.post('/filetype', json={'_id': 'new', 'regex': '.*\.new$'}) + assert r.ok assert files.guess_type_from_filename('example.new') == 'new' - as_drone.post('/filetype', json={'_id': 'new', 'regex': '.*\.new2$'}) + r = as_drone.post('/filetype', json={'_id': 'new', 'regex': '.*\.new2$'}) + assert r.ok assert files.guess_type_from_filename('example.new') == None - assert files.guess_type_from_filename('example.new2') == 'new' \ No newline at end of file + assert files.guess_type_from_filename('example.new2') == 'new' + r = as_drone.delete('/filetype/new') + assert r.ok + +def test_insert_delete_abort(as_drone): + r = as_drone.delete('/filetype/notexists') + assert r.status_code == 404 \ No newline at 
end of file From ad9a703c423cc3ff6e672f77a56b278c069d1f21 Mon Sep 17 00:00:00 2001 From: David Farkas Date: Fri, 10 Nov 2017 17:33:38 +0100 Subject: [PATCH 08/16] Update permission checking, schema validating, remove JS query --- api/config.py | 2 +- api/files.py | 15 +++++++++------ api/handlers/filetypehandler.py | 20 +++++++++----------- raml/schemas/{mongo => input}/filetype.json | 0 4 files changed, 19 insertions(+), 18 deletions(-) rename raml/schemas/{mongo => input}/filetype.json (100%) diff --git a/api/config.py b/api/config.py index 225e3d4e4..e2fa37079 100644 --- a/api/config.py +++ b/api/config.py @@ -139,7 +139,6 @@ def apply_env_variables(config): 'collection.json', 'container.json', 'file.json', - 'filetype.json', 'group.json', 'note.json', 'permission.json', @@ -162,6 +161,7 @@ def apply_env_variables(config): 'container.json', 'device.json', 'file.json', + 'filetype.json', 'file-update.json', 'group-new.json', 'group-update.json', diff --git a/api/files.py b/api/files.py index 89f871009..1e007c027 100644 --- a/api/files.py +++ b/api/files.py @@ -1,8 +1,9 @@ -import os import cgi -import shutil -import hashlib import collections +import hashlib +import os +import re +import shutil from . import util from . import config @@ -152,8 +153,10 @@ def get_hash(self): def guess_type_from_filename(filename): filetype = None - result = config.db.filetypes.find_one({'$where': 'function() {return RegExp(this.regex).test(\'%s\');}' % filename}) - if result: - filetype = result['_id'] + cursor = config.db.filetypes.find({}) + for document in cursor: + if re.match(document['regex'], filename): + filetype = document['_id'] + break return filetype diff --git a/api/handlers/filetypehandler.py b/api/handlers/filetypehandler.py index 95736afea..7b9ef0071 100644 --- a/api/handlers/filetypehandler.py +++ b/api/handlers/filetypehandler.py @@ -1,25 +1,24 @@ -from ..web import base from .. import config -from .. 
import validators -from ..auth import userauth -from ..dao import noop +from ..auth import require_admin, require_login +from ..validators import validate_data +from ..web import base + class FileType(base.RequestHandler): + @require_login def get(self): """Get file types""" return config.db.filetypes.find() + @require_admin def post(self): """ Insert or replace a file type. Required fields: '_id' and 'regex' where the '_id' is the unique name of the file type and 'regex' is a regular expression which is used to figure out the file type from the file name. """ - permchecker = userauth.default(self) payload = self.request.json_body - mongo_schema_uri = validators.schema_uri('mongo', 'filetype.json') - mongo_validator = validators.decorator_from_schema_path(mongo_schema_uri) - mongo_validator(permchecker(noop))('PUT', payload=payload) + validate_data(payload, 'filetype.json', 'input', 'POST') result = config.db.filetypes.replace_one({'_id': payload['_id']}, payload, upsert=True) if result.acknowledged: _id = result.upserted_id if result.upserted_id else payload['_id'] @@ -27,12 +26,11 @@ def post(self): else: self.abort(404, 'File type {} not updated'.format(payload['_id'])) + @require_admin def delete(self, _id): """Delete a file type""" - permchecker = userauth.default(self) - permchecker(noop)('DELETE', _id) result = config.db.filetypes.delete_one({'_id': _id}) if result.deleted_count: return {'deleted': result.deleted_count} else: - self.abort(404, 'File type {} not removed'.format(_id)) \ No newline at end of file + self.abort(404, 'File type {} not removed'.format(_id)) diff --git a/raml/schemas/mongo/filetype.json b/raml/schemas/input/filetype.json similarity index 100% rename from raml/schemas/mongo/filetype.json rename to raml/schemas/input/filetype.json From dd7e4ca94fc1f3bcb12d9ee73aaafcdada44955b Mon Sep 17 00:00:00 2001 From: David Farkas Date: Mon, 13 Nov 2017 16:38:59 +0100 Subject: [PATCH 09/16] Load mongo filetypes via the bootstrap script --- 
api/config.py | 5 ---- api/filetypes.json | 29 ------------------- ...s_drone_secret.py => load_drone_secret.py} | 13 ++++++--- bootstrap.sample.json | 29 +++++++++++++++++++ docker/README.md | 2 +- docker/bootstrap-accounts.sh | 28 ------------------ docker/bootstrap-defaults.sh | 28 ++++++++++++++++++ 7 files changed, 67 insertions(+), 67 deletions(-) delete mode 100644 api/filetypes.json rename bin/{load_users_drone_secret.py => load_drone_secret.py} (91%) delete mode 100755 docker/bootstrap-accounts.sh create mode 100755 docker/bootstrap-defaults.sh diff --git a/api/config.py b/api/config.py index e2fa37079..2306cbc94 100644 --- a/api/config.py +++ b/api/config.py @@ -258,11 +258,6 @@ def initialize_db(): {'$setOnInsert': {'created': now, 'modified': now, 'label': 'Unknown', 'permissions': []}}, upsert=True) - with open(os.path.join(os.path.dirname(__file__), 'filetypes.json')) as fd: - filetypes = json.load(fd) - for filetype in filetypes: - try_replace_one(db, 'filetypes', {'_id': filetype['_id']}, filetype, upsert=True) - log.info('Initializing database, creating indexes ....DONE') def get_config(): diff --git a/api/filetypes.json b/api/filetypes.json deleted file mode 100644 index f8f003c0e..000000000 --- a/api/filetypes.json +++ /dev/null @@ -1,29 +0,0 @@ -[ - { "_id": "bval", "regex": ".*\\.(bval$|bvals$)" }, - { "_id": "bvec", "regex": ".*\\.(bvec$|bvecs$)" }, - { "_id": "dicom", "regex": ".*\\.(dcm$|dcm\\.zip|dicom\\.zip$)" }, - { "_id": "eeg", "regex": ".*\\.eeg\\.zip$" }, - { "_id": "gephysio", "regex": ".*\\.gephysio\\.zip$" }, - { "_id": "ismrmrd", "regex": ".*\\.(h5$|hdf5$)" }, - { "_id": "MATLAB data", "regex": ".*\\.mat$" }, - { "_id": "MGH data", "regex": ".*\\.(mgh$|mgz$|mgh\\.gz$)" }, - { "_id": "nifti", "regex": ".*\\.(nii\\.gz$|nii$)" }, - { "_id": "parrec", "regex": ".*\\.(parrec\\.zip$|par-rec\\.zip$)" }, - { "_id": "pfile", "regex": ".*\\.(7\\.gz$|7$|7\\.zip)" }, - { "_id": "PsychoPy data", "regex": ".*\\.psydat$" }, - { "_id": "qa", 
"regex": ".*\\.(qa\\.png$|qa\\.json|qa\\.html$)" }, - - { "_id": "archive", "regex": ".*\\.(zip$|tbz2$|tar\\.gz$|tbz$|tar\\.bz2$|tgz$|tar$|txz$|tar\\.xz$)" }, - { "_id": "document", "regex": ".*\\.(docx$|doc$)" }, - { "_id": "image", "regex": ".*\\.(jpg$|tif$|jpeg$|gif$|bmp$|png$|tiff$)" }, - { "_id": "markup", "regex": ".*\\.(html$|htm$|xml$)" }, - { "_id": "markdown", "regex": ".*\\.(md$|markdown$)" }, - { "_id": "log", "regex": ".*\\.log$" }, - { "_id": "pdf", "regex": ".*\\.pdf$" }, - { "_id": "presentation", "regex": ".*\\.(ppt$|pptx$)" }, - { "_id": "source code", "regex": ".*\\.(c$|py$|cpp$|js$|m$|json$|java$|php$|css$|toml$|yaml$|yml$)" }, - { "_id": "spreadsheet", "regex": ".*\\.(xls$|xlsx$)" }, - { "_id": "tabular data", "regex": ".*\\.(csv\\.gz$|csv$)" }, - { "_id": "text", "regex": ".*\\.txt$" }, - { "_id": "video", "regex": ".*\\.(mpeg$|mpg$|mov$|mp4$|m4v$|mts$)" } -] \ No newline at end of file diff --git a/bin/load_users_drone_secret.py b/bin/load_drone_secret.py similarity index 91% rename from bin/load_users_drone_secret.py rename to bin/load_drone_secret.py index 471a73669..9f170e1fd 100755 --- a/bin/load_users_drone_secret.py +++ b/bin/load_drone_secret.py @@ -62,9 +62,9 @@ def _upsert_permission(request_session, api_url, permission_doc, group_id): full_permission_url = "{0}/{1}".format(base_permission_url, permission_doc['_id']) return request_session.put(full_permission_url, json=permission_doc) -def users(filepath, api_url, http_headers, insecure): +def bootstrap(filepath, api_url, http_headers, insecure): """ - Upserts the users/groups/permissions defined in filepath parameter. + Upserts the users/groups/permissions/file types defined in filepath parameter. Raises: requests.HTTPError: Upsert failed. 
@@ -95,7 +95,7 @@ def users(filepath, api_url, http_headers, insecure): log.info('bootstrapping projects...') for p in input_data.get('projects', []): - r = rs.post(api_url + '/projects?inherit=true' , json=p) + r = rs.post(api_url + '/projects?inherit=true', json=p) r.raise_for_status() project_id = r.json()['_id'] @@ -111,6 +111,11 @@ def users(filepath, api_url, http_headers, insecure): r = rs.post(api_url + '/projects/' + project_id + '/rules', json=rule) r.raise_for_status() + log.info('bootstrapping file types...') + for f in input_data.get('filetypes', []): + r = rs.post(api_url + '/filetype', json=f) + r.raise_for_status() + log.info('bootstrapping complete') @@ -134,7 +139,7 @@ def users(filepath, api_url, http_headers, insecure): # TODO: extend this to support oauth tokens try: - users(args.json, args.url, http_headers, args.insecure) + bootstrap(args.json, args.url, http_headers, args.insecure) except requests.HTTPError as ex: log.error(ex) log.error("request_body={0}".format(ex.response.request.body)) diff --git a/bootstrap.sample.json b/bootstrap.sample.json index e85943b67..35c067f96 100644 --- a/bootstrap.sample.json +++ b/bootstrap.sample.json @@ -25,5 +25,34 @@ "_id": "local", "type": "engine" } + ], + "filetypes": [ + { "_id": "bval", "regex": ".*\\.(bval$|bvals$)" }, + { "_id": "bvec", "regex": ".*\\.(bvec$|bvecs$)" }, + { "_id": "dicom", "regex": ".*\\.(dcm$|dcm\\.zip|dicom\\.zip$)" }, + { "_id": "eeg", "regex": ".*\\.eeg\\.zip$" }, + { "_id": "gephysio", "regex": ".*\\.gephysio\\.zip$" }, + { "_id": "ismrmrd", "regex": ".*\\.(h5$|hdf5$)" }, + { "_id": "MATLAB data", "regex": ".*\\.mat$" }, + { "_id": "MGH data", "regex": ".*\\.(mgh$|mgz$|mgh\\.gz$)" }, + { "_id": "nifti", "regex": ".*\\.(nii\\.gz$|nii$)" }, + { "_id": "parrec", "regex": ".*\\.(parrec\\.zip$|par-rec\\.zip$)" }, + { "_id": "pfile", "regex": ".*\\.(7\\.gz$|7$|7\\.zip)" }, + { "_id": "PsychoPy data", "regex": ".*\\.psydat$" }, + { "_id": "qa", "regex": 
".*\\.(qa\\.png$|qa\\.json|qa\\.html$)" }, + + { "_id": "archive", "regex": ".*\\.(zip$|tbz2$|tar\\.gz$|tbz$|tar\\.bz2$|tgz$|tar$|txz$|tar\\.xz$)" }, + { "_id": "document", "regex": ".*\\.(docx$|doc$)" }, + { "_id": "image", "regex": ".*\\.(jpg$|tif$|jpeg$|gif$|bmp$|png$|tiff$)" }, + { "_id": "markup", "regex": ".*\\.(html$|htm$|xml$)" }, + { "_id": "markdown", "regex": ".*\\.(md$|markdown$)" }, + { "_id": "log", "regex": ".*\\.log$" }, + { "_id": "pdf", "regex": ".*\\.pdf$" }, + { "_id": "presentation", "regex": ".*\\.(ppt$|pptx$)" }, + { "_id": "source code", "regex": ".*\\.(c$|py$|cpp$|js$|m$|json$|java$|php$|css$|toml$|yaml$|yml$)" }, + { "_id": "spreadsheet", "regex": ".*\\.(xls$|xlsx$)" }, + { "_id": "tabular data", "regex": ".*\\.(csv\\.gz$|csv$)" }, + { "_id": "text", "regex": ".*\\.txt$" }, + { "_id": "video", "regex": ".*\\.(mpeg$|mpg$|mov$|mp4$|m4v$|mts$)" } ] } diff --git a/docker/README.md b/docker/README.md index cd98f3cf9..18bf1dc9f 100644 --- a/docker/README.md +++ b/docker/README.md @@ -44,7 +44,7 @@ preserving their contents across container instances. --rm \ -v /dev/bali.prod/docker/uwsgi/bootstrap-dev.json:/accounts.json \ scitran-core \ - /var/scitran/code/api/docker/bootstrap-accounts.sh \ + /var/scitran/code/api/docker/bootstrap-defaults.sh \ /accounts.json diff --git a/docker/bootstrap-accounts.sh b/docker/bootstrap-accounts.sh deleted file mode 100755 index e8aab4c1b..000000000 --- a/docker/bootstrap-accounts.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash -set -e -set -x - -echo "IN BOOTSTRAP ACCOUNTS" - -( - -# Parse input parameters... -# -# bootstrap account file -bootstrap_user_file=${1:-'/var/scitran/code/api/bootstrap.json.sample'} - - -# Move to API folder for relative path assumptions later on -# -cd /var/scitran/code/api - -# Export PYTHONPATH for python script later on. -# -export PYTHONPATH=. 
- - -# Bootstrap Users -./bin/load_users_drone_secret.py --insecure --secret "${SCITRAN_CORE_DRONE_SECRET}" "${SCITRAN_SITE_API_URL}" "${bootstrap_user_file}" - - -) diff --git a/docker/bootstrap-defaults.sh b/docker/bootstrap-defaults.sh new file mode 100755 index 000000000..9d1823bac --- /dev/null +++ b/docker/bootstrap-defaults.sh @@ -0,0 +1,28 @@ +#!/bin/bash +set -e +set -x + +echo "IN BOOTSTRAP DEFAULTS" + +( + +# Parse input parameters... +# +# bootstrap file +bootstrap_file=${1:-'/var/scitran/code/api/bootstrap.sample.json'} + + +# Move to API folder for relative path assumptions later on +# +cd /var/scitran/code/api + +# Export PYTHONPATH for python script later on. +# +export PYTHONPATH=. + + +# Bootstrap users and file types +./bin/load_drone_secret.py --insecure --secret "${SCITRAN_CORE_DRONE_SECRET}" "${SCITRAN_SITE_API_URL}" "${bootstrap_file}" + + +) From 0cdbf5def5843134ebe1b7cdc2c71dbd7f3d2f89 Mon Sep 17 00:00:00 2001 From: David Farkas Date: Mon, 13 Nov 2017 17:23:00 +0100 Subject: [PATCH 10/16] Update file type tests --- api/files.py | 1 - tests/integration_tests/python/test_files.py | 22 ++++++++++++++++---- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/api/files.py b/api/files.py index 1e007c027..ee73c15ed 100644 --- a/api/files.py +++ b/api/files.py @@ -157,6 +157,5 @@ def guess_type_from_filename(filename): for document in cursor: if re.match(document['regex'], filename): filetype = document['_id'] - break return filetype diff --git a/tests/integration_tests/python/test_files.py b/tests/integration_tests/python/test_files.py index f1dd94b31..c9fc60cf2 100644 --- a/tests/integration_tests/python/test_files.py +++ b/tests/integration_tests/python/test_files.py @@ -1,18 +1,32 @@ from api import files -def test_extension(): +def test_extension(as_drone): + r = as_drone.post('/filetype', json={'_id': 'pdf', 'regex': '.*\.pdf$'}) + assert r.ok assert files.guess_type_from_filename('example.pdf') == 'pdf' -def 
test_multi_extension(): +def test_multi_extension(as_drone): + r = as_drone.post('/filetype', + json={'_id': 'archive', + 'regex': '.*\.(zip$|tbz2$|tar\.gz$|tbz$|tar\.bz2$|tgz$|tar$|txz$|tar\.xz$)'}) + assert r.ok + r = as_drone.post('/filetype', json={'_id': 'gephysio', 'regex': '.*\.gephysio\.zip$'}) + assert r.ok assert files.guess_type_from_filename('example.zip') == 'archive' assert files.guess_type_from_filename('example.gephysio.zip') == 'gephysio' -def test_nifti(): +def test_nifti(as_drone): + r = as_drone.post('/filetype', json={'_id': 'nifti', 'regex': '.*\.(nii\.gz$|nii$)'}) + assert r.ok assert files.guess_type_from_filename('example.nii') == 'nifti' assert files.guess_type_from_filename('example.nii.gz') == 'nifti' assert files.guess_type_from_filename('example.nii.x.gz') == None -def test_qa(): +def test_qa(as_drone): + r = as_drone.post('/filetype', json={'_id': 'image', 'regex': '.*\.(jpg$|tif$|jpeg$|gif$|bmp$|png$|tiff$)'}) + assert r.ok + r = as_drone.post('/filetype', json={'_id': 'qa', 'regex': '.*\.(qa\.png$|qa\.json|qa\.html$)'}) + assert r.ok assert files.guess_type_from_filename('example.png') == 'image' assert files.guess_type_from_filename('example.qa.png') == 'qa' assert files.guess_type_from_filename('example.qa') == None From 6cf6e2ceefc90eb88799f3999c47c107629e821d Mon Sep 17 00:00:00 2001 From: David Farkas Date: Thu, 16 Nov 2017 13:07:46 +0100 Subject: [PATCH 11/16] Refine the file type regular expressions, use re.search instead of match, update the integration tests and validate the regex in the file type handler --- api/files.py | 5 +- api/handlers/filetypehandler.py | 8 ++- bootstrap.sample.json | 52 ++++++++++---------- tests/integration_tests/python/conftest.py | 7 +++ tests/integration_tests/python/test_files.py | 50 ++++++++++++------- 5 files changed, 75 insertions(+), 47 deletions(-) diff --git a/api/files.py b/api/files.py index ee73c15ed..b5596e2bd 100644 --- a/api/files.py +++ b/api/files.py @@ -153,9 +153,12 @@ def 
get_hash(self): def guess_type_from_filename(filename): filetype = None + m_length = 0 cursor = config.db.filetypes.find({}) + for document in cursor: - if re.match(document['regex'], filename): + m = re.search(document['regex'], filename) + if m and m_length < len(m.group(0)): filetype = document['_id'] return filetype diff --git a/api/handlers/filetypehandler.py b/api/handlers/filetypehandler.py index 7b9ef0071..257eb8d4c 100644 --- a/api/handlers/filetypehandler.py +++ b/api/handlers/filetypehandler.py @@ -1,6 +1,8 @@ +import re + from .. import config from ..auth import require_admin, require_login -from ..validators import validate_data +from ..validators import validate_data, InputValidationException from ..web import base @@ -19,6 +21,10 @@ def post(self): """ payload = self.request.json_body validate_data(payload, 'filetype.json', 'input', 'POST') + try: + re.compile(payload['regex']) + except re.error: + raise InputValidationException('Invalid regular expression') result = config.db.filetypes.replace_one({'_id': payload['_id']}, payload, upsert=True) if result.acknowledged: _id = result.upserted_id if result.upserted_id else payload['_id'] diff --git a/bootstrap.sample.json b/bootstrap.sample.json index 35c067f96..86f8d40a5 100644 --- a/bootstrap.sample.json +++ b/bootstrap.sample.json @@ -27,32 +27,32 @@ } ], "filetypes": [ - { "_id": "bval", "regex": ".*\\.(bval$|bvals$)" }, - { "_id": "bvec", "regex": ".*\\.(bvec$|bvecs$)" }, - { "_id": "dicom", "regex": ".*\\.(dcm$|dcm\\.zip|dicom\\.zip$)" }, - { "_id": "eeg", "regex": ".*\\.eeg\\.zip$" }, - { "_id": "gephysio", "regex": ".*\\.gephysio\\.zip$" }, - { "_id": "ismrmrd", "regex": ".*\\.(h5$|hdf5$)" }, - { "_id": "MATLAB data", "regex": ".*\\.mat$" }, - { "_id": "MGH data", "regex": ".*\\.(mgh$|mgz$|mgh\\.gz$)" }, - { "_id": "nifti", "regex": ".*\\.(nii\\.gz$|nii$)" }, - { "_id": "parrec", "regex": ".*\\.(parrec\\.zip$|par-rec\\.zip$)" }, - { "_id": "pfile", "regex": ".*\\.(7\\.gz$|7$|7\\.zip)" }, - { 
"_id": "PsychoPy data", "regex": ".*\\.psydat$" }, - { "_id": "qa", "regex": ".*\\.(qa\\.png$|qa\\.json|qa\\.html$)" }, + { "_id": "bval", "regex": "\\.(bval|bvals)$" }, + { "_id": "bvec", "regex": "\\.(bvec|bvecs)$" }, + { "_id": "dicom", "regex": "\\.(dcm|dcm\\.zip|dicom\\.zip)$" }, + { "_id": "eeg", "regex": "\\.eeg\\.zip$" }, + { "_id": "gephysio", "regex": "\\.gephysio\\.zip$" }, + { "_id": "ismrmrd", "regex": "\\.(h5|hdf5)$" }, + { "_id": "MATLAB data", "regex": "\\.mat$" }, + { "_id": "MGH data", "regex": "\\.(mgh|mgz|mgh\\.gz)$" }, + { "_id": "nifti", "regex": "\\.(nii\\.gz|nii)$" }, + { "_id": "parrec", "regex": "\\.(parrec\\.zip|par-rec\\.zip)$" }, + { "_id": "pfile", "regex": "\\.(7\\.gz|7|7\\.zip)$" }, + { "_id": "PsychoPy data", "regex": "\\.psydat$" }, + { "_id": "qa", "regex": "\\.(qa\\.png|qa\\.json|qa\\.html)$" }, - { "_id": "archive", "regex": ".*\\.(zip$|tbz2$|tar\\.gz$|tbz$|tar\\.bz2$|tgz$|tar$|txz$|tar\\.xz$)" }, - { "_id": "document", "regex": ".*\\.(docx$|doc$)" }, - { "_id": "image", "regex": ".*\\.(jpg$|tif$|jpeg$|gif$|bmp$|png$|tiff$)" }, - { "_id": "markup", "regex": ".*\\.(html$|htm$|xml$)" }, - { "_id": "markdown", "regex": ".*\\.(md$|markdown$)" }, - { "_id": "log", "regex": ".*\\.log$" }, - { "_id": "pdf", "regex": ".*\\.pdf$" }, - { "_id": "presentation", "regex": ".*\\.(ppt$|pptx$)" }, - { "_id": "source code", "regex": ".*\\.(c$|py$|cpp$|js$|m$|json$|java$|php$|css$|toml$|yaml$|yml$)" }, - { "_id": "spreadsheet", "regex": ".*\\.(xls$|xlsx$)" }, - { "_id": "tabular data", "regex": ".*\\.(csv\\.gz$|csv$)" }, - { "_id": "text", "regex": ".*\\.txt$" }, - { "_id": "video", "regex": ".*\\.(mpeg$|mpg$|mov$|mp4$|m4v$|mts$)" } + { "_id": "archive", "regex": "\\.(zip|tbz2|tar\\.gz|tbz|tar\\.bz2|tgz|tar|txz|tar\\.xz)$" }, + { "_id": "document", "regex": "\\.(docx|doc)$" }, + { "_id": "image", "regex": "\\.(jpg|tif|jpeg|gif|bmp|png|tiff)$" }, + { "_id": "markup", "regex": "\\.(html|htm|xml)$" }, + { "_id": "markdown", "regex": 
"\\.(md|markdown)$" }, + { "_id": "log", "regex": "\\.log$" }, + { "_id": "pdf", "regex": "\\.pdf$" }, + { "_id": "presentation", "regex": "\\.(ppt|pptx)$" }, + { "_id": "source code", "regex": "\\.(c|py|cpp|js|m|json|java|php|css|toml|yaml|yml)$" }, + { "_id": "spreadsheet", "regex": "\\.(xls|xlsx)$" }, + { "_id": "tabular data", "regex": "\\.(csv\\.gz|csv)$" }, + { "_id": "text", "regex": "\\.txt$" }, + { "_id": "video", "regex": "\\.(mpeg|mpg|mov|mp4|m4v|mts)$" } ] } diff --git a/tests/integration_tests/python/conftest.py b/tests/integration_tests/python/conftest.py index 7cd2593d5..509303c4f 100644 --- a/tests/integration_tests/python/conftest.py +++ b/tests/integration_tests/python/conftest.py @@ -32,6 +32,13 @@ def bootstrap_users(as_drone): return data_builder +@pytest.fixture(scope='session', autouse=True) +def bootstrap_filetypes(as_admin): + """Create file types""" + as_admin.post('/filetype', json={'_id': 'tabular data', 'regex': '\.(csv\.gz|csv)$'}) + as_admin.post('/filetype', json={'_id': 'text', 'regex': '\.txt$'}) + + @pytest.fixture(scope='session') def as_drone(): """Return requests session with drone access""" diff --git a/tests/integration_tests/python/test_files.py b/tests/integration_tests/python/test_files.py index c9fc60cf2..c382fcdcd 100644 --- a/tests/integration_tests/python/test_files.py +++ b/tests/integration_tests/python/test_files.py @@ -1,53 +1,65 @@ from api import files -def test_extension(as_drone): - r = as_drone.post('/filetype', json={'_id': 'pdf', 'regex': '.*\.pdf$'}) + +def test_extension(as_admin): + r = as_admin.post('/filetype', json={'_id': 'pdf', 'regex': '\.pdf$'}) assert r.ok assert files.guess_type_from_filename('example.pdf') == 'pdf' -def test_multi_extension(as_drone): - r = as_drone.post('/filetype', + +def test_multi_extension(as_admin): + r = as_admin.post('/filetype', json={'_id': 'archive', - 'regex': '.*\.(zip$|tbz2$|tar\.gz$|tbz$|tar\.bz2$|tgz$|tar$|txz$|tar\.xz$)'}) + 'regex': '\.zip$'}) assert r.ok - r = 
as_drone.post('/filetype', json={'_id': 'gephysio', 'regex': '.*\.gephysio\.zip$'}) + r = as_admin.post('/filetype', json={'_id': 'gephysio', 'regex': '\.gephysio\.zip$'}) assert r.ok assert files.guess_type_from_filename('example.zip') == 'archive' assert files.guess_type_from_filename('example.gephysio.zip') == 'gephysio' -def test_nifti(as_drone): - r = as_drone.post('/filetype', json={'_id': 'nifti', 'regex': '.*\.(nii\.gz$|nii$)'}) + +def test_nifti(as_admin): + r = as_admin.post('/filetype', json={'_id': 'nifti', 'regex': '\.(nii\.gz|nii)$'}) assert r.ok assert files.guess_type_from_filename('example.nii') == 'nifti' assert files.guess_type_from_filename('example.nii.gz') == 'nifti' assert files.guess_type_from_filename('example.nii.x.gz') == None -def test_qa(as_drone): - r = as_drone.post('/filetype', json={'_id': 'image', 'regex': '.*\.(jpg$|tif$|jpeg$|gif$|bmp$|png$|tiff$)'}) + +def test_qa(as_admin): + r = as_admin.post('/filetype', json={'_id': 'image', 'regex': '\.png$'}) assert r.ok - r = as_drone.post('/filetype', json={'_id': 'qa', 'regex': '.*\.(qa\.png$|qa\.json|qa\.html$)'}) + r = as_admin.post('/filetype', json={'_id': 'qa', 'regex': '\.qa\.png$'}) assert r.ok assert files.guess_type_from_filename('example.png') == 'image' assert files.guess_type_from_filename('example.qa.png') == 'qa' assert files.guess_type_from_filename('example.qa') == None assert files.guess_type_from_filename('example.qa.png.unknown') == None + def test_unknown(): assert files.guess_type_from_filename('example.unknown') == None -def test_get_insert_delete(as_drone): - r = as_drone.get('/filetype') + +def test_get_insert_delete(as_admin): + r = as_admin.get('/filetype') assert r.ok - r = as_drone.post('/filetype', json={'_id': 'new', 'regex': '.*\.new$'}) + r = as_admin.post('/filetype', json={'_id': 'new', 'regex': '\.new$'}) assert r.ok assert files.guess_type_from_filename('example.new') == 'new' - r = as_drone.post('/filetype', json={'_id': 'new', 'regex': '.*\.new2$'}) 
+ r = as_admin.post('/filetype', json={'_id': 'new', 'regex': '\.new2$'}) assert r.ok assert files.guess_type_from_filename('example.new') == None assert files.guess_type_from_filename('example.new2') == 'new' - r = as_drone.delete('/filetype/new') + r = as_admin.delete('/filetype/new') assert r.ok -def test_insert_delete_abort(as_drone): - r = as_drone.delete('/filetype/notexists') - assert r.status_code == 404 \ No newline at end of file + +def test_insert_delete_abort(as_admin): + r = as_admin.delete('/filetype/notexists') + assert r.status_code == 404 + + +def test_invalid_regex(as_admin): + r = as_admin.post('/filetype', json={'_id': 'invalid', 'regex': '\\'}) + assert r.status_code == 400 From c5c6b4216992040639bc657fba20ae6c963f6cef Mon Sep 17 00:00:00 2001 From: David Farkas Date: Thu, 16 Nov 2017 13:24:54 +0100 Subject: [PATCH 12/16] Update abao load fixture script to load the necessary file types too --- tests/integration_tests/abao/load_fixture.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/integration_tests/abao/load_fixture.py b/tests/integration_tests/abao/load_fixture.py index 32c036bdc..1d4113a8a 100644 --- a/tests/integration_tests/abao/load_fixture.py +++ b/tests/integration_tests/abao/load_fixture.py @@ -56,6 +56,12 @@ def main(): r = as_root.post('/groups', json={'_id': 'test-group'}) assert r.ok + # create file types + r = as_root.post('/filetype', json={'_id': 'dicom', 'regex': '\.(dcm|dcm\.zip|dicom\.zip)$'}) + assert r.ok + r = as_root.post('/filetype', json={'_id': 'text', 'regex': '\.txt$'}) + assert r.ok + # upload file to test-project-1/test-session-1/test-acquisition-1 # depends on 'create test-group' r = as_root.post('/upload/label', files={ From 82684f2a5c3b2750cb0f1edb4ac09d16583617df Mon Sep 17 00:00:00 2001 From: Gunnar Schaefer Date: Tue, 5 Dec 2017 22:44:39 -0600 Subject: [PATCH 13/16] Add stylized file types --- bootstrap.sample.json | 62 +++++++++++++++++++++++++------------------ 1 file changed, 36 
insertions(+), 26 deletions(-) diff --git a/bootstrap.sample.json b/bootstrap.sample.json index 86f8d40a5..60350b6c0 100644 --- a/bootstrap.sample.json +++ b/bootstrap.sample.json @@ -27,32 +27,42 @@ } ], "filetypes": [ - { "_id": "bval", "regex": "\\.(bval|bvals)$" }, - { "_id": "bvec", "regex": "\\.(bvec|bvecs)$" }, - { "_id": "dicom", "regex": "\\.(dcm|dcm\\.zip|dicom\\.zip)$" }, - { "_id": "eeg", "regex": "\\.eeg\\.zip$" }, - { "_id": "gephysio", "regex": "\\.gephysio\\.zip$" }, - { "_id": "ismrmrd", "regex": "\\.(h5|hdf5)$" }, - { "_id": "MATLAB data", "regex": "\\.mat$" }, - { "_id": "MGH data", "regex": "\\.(mgh|mgz|mgh\\.gz)$" }, - { "_id": "nifti", "regex": "\\.(nii\\.gz|nii)$" }, - { "_id": "parrec", "regex": "\\.(parrec\\.zip|par-rec\\.zip)$" }, - { "_id": "pfile", "regex": "\\.(7\\.gz|7|7\\.zip)$" }, - { "_id": "PsychoPy data", "regex": "\\.psydat$" }, - { "_id": "qa", "regex": "\\.(qa\\.png|qa\\.json|qa\\.html)$" }, + { "_id": "BVAL", "regex": "\\.(bval|bvals)$" }, + { "_id": "BVEC", "regex": "\\.(bvec|bvecs)$" }, + { "_id": "DICOM", "regex": "\\.(dcm|dcm\\.zip|dicom\\.zip)$" }, + { "_id": "EFile", "regex": "^E.*P.*\\.7$" }, + { "_id": "GE Physio", "regex": "\\.gephysio\\.zip$" }, + { "_id": "MGH Data", "regex": "\\.(mgh|mgz|mgh\\.gz)$" }, + { "_id": "NIfTI", "regex": "\\.(nii\\.gz|nii)$" }, + { "_id": "PAR/REC", "regex": "\\.(parrec\\.zip|par-rec\\.zip)$" }, + { "_id": "PFile Header", "regex": "\\.(7\\.hdr)$" }, + { "_id": "PFile", "regex": "\\.(7\\.gz|7|7\\.zip)$" }, - { "_id": "archive", "regex": "\\.(zip|tbz2|tar\\.gz|tbz|tar\\.bz2|tgz|tar|txz|tar\\.xz)$" }, - { "_id": "document", "regex": "\\.(docx|doc)$" }, - { "_id": "image", "regex": "\\.(jpg|tif|jpeg|gif|bmp|png|tiff)$" }, - { "_id": "markup", "regex": "\\.(html|htm|xml)$" }, - { "_id": "markdown", "regex": "\\.(md|markdown)$" }, - { "_id": "log", "regex": "\\.log$" }, - { "_id": "pdf", "regex": "\\.pdf$" }, - { "_id": "presentation", "regex": "\\.(ppt|pptx)$" }, - { "_id": "source code", 
"regex": "\\.(c|py|cpp|js|m|json|java|php|css|toml|yaml|yml)$" }, - { "_id": "spreadsheet", "regex": "\\.(xls|xlsx)$" }, - { "_id": "tabular data", "regex": "\\.(csv\\.gz|csv)$" }, - { "_id": "text", "regex": "\\.txt$" }, - { "_id": "video", "regex": "\\.(mpeg|mpg|mov|mp4|m4v|mts)$" } + { "_id": "EEG", "regex": "\\.eeg\\.zip$" }, + + { "_id": "QC", "regex": "\\.(q[ac]\\.png|q[ac]\\.json|q[ac]\\.html)$" }, + + { "_id": "MATLAB Data", "regex": "\\.mat$" }, + { "_id": "PsychoPy Data", "regex": "\\.psydat$" }, + + { "_id": "Archive", "regex": "\\.(zip|tbz2|tar\\.gz|tbz|tar\\.bz2|tgz|tar|txz|tar\\.xz)$" }, + { "_id": "Audio", "regex": "\\.(mp3|wav|wave)$" }, + { "_id": "Document", "regex": "\\.(docx|doc)$" }, + { "_id": "HDF5", "regex": "\\.(h5|hdf5)$" }, + { "_id": "Image", "regex": "\\.(jpg|tif|jpeg|gif|bmp|png|tiff)$" }, + { "_id": "JSON", "regex": "\\.json$" }, + { "_id": "Log", "regex": "\\.log$" }, + { "_id": "Markdown", "regex": "\\.(md|markdown)$" }, + { "_id": "Markup", "regex": "\\.(html|htm)$" }, + { "_id": "PDF", "regex": "\\.pdf$" }, + { "_id": "Plain Text", "regex": "\\.txt$" }, + { "_id": "Presentation", "regex": "\\.(ppt|pptx)$" }, + { "_id": "Source code", "regex": "\\.(c|py|cpp|js|m|json|java|php|css|toml|yaml|yml)$" }, + { "_id": "Spreadsheet", "regex": "\\.(xls|xlsx)$" }, + { "_id": "TOML", "regex": "\\.toml$" }, + { "_id": "Tabular data", "regex": "\\.([ct]sv\\.gz|[ct]sv)$" }, + { "_id": "Video", "regex": "\\.(mpeg|mpg|mov|mp4|m4v|mts)$" }, + { "_id": "XML", "regex": "\\.xml$" }, + { "_id": "YAML", "regex": "\\.(yaml|yml)$" } ] } From d79658af70c65ddd31841a21e360451660601467 Mon Sep 17 00:00:00 2001 From: Gunnar Schaefer Date: Wed, 6 Dec 2017 11:12:01 -0600 Subject: [PATCH 14/16] Refine file types --- bootstrap.sample.json | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/bootstrap.sample.json b/bootstrap.sample.json index 60350b6c0..a945dfa9f 100644 --- a/bootstrap.sample.json +++ 
b/bootstrap.sample.json @@ -45,24 +45,32 @@ { "_id": "MATLAB Data", "regex": "\\.mat$" }, { "_id": "PsychoPy Data", "regex": "\\.psydat$" }, + { "_id": "C/C++", "regex": "\\.(c|cpp)$" }, + { "_id": "CSS", "regex": "\\.css$" }, + { "_id": "HDF5", "regex": "\\.(h5|hdf5)$" }, + { "_id": "HTML", "regex": "\\.(html|htm)$" }, + { "_id": "JSON", "regex": "\\.json$" }, + { "_id": "Java", "regex": "\\.java$" }, + { "_id": "JavaScript", "regex": "\\.js$" }, + { "_id": "Jupyter", "regex": "\\.ipynb$" }, + { "_id": "MATLAB", "regex": "\\.(m|mex|mlx)$" }, + { "_id": "Markdown", "regex": "\\.(md|markdown)$" }, + { "_id": "PHP", "regex": "\\.php$" }, + { "_id": "Plain Text", "regex": "\\.txt$" }, + { "_id": "Python", "regex": "\\.py$" }, + { "_id": "TOML", "regex": "\\.toml$" }, + { "_id": "XML", "regex": "\\.xml$" }, + { "_id": "YAML", "regex": "\\.(yaml|yml)$" }, + { "_id": "Archive", "regex": "\\.(zip|tbz2|tar\\.gz|tbz|tar\\.bz2|tgz|tar|txz|tar\\.xz)$" }, { "_id": "Audio", "regex": "\\.(mp3|wav|wave)$" }, { "_id": "Document", "regex": "\\.(docx|doc)$" }, - { "_id": "HDF5", "regex": "\\.(h5|hdf5)$" }, { "_id": "Image", "regex": "\\.(jpg|tif|jpeg|gif|bmp|png|tiff)$" }, - { "_id": "JSON", "regex": "\\.json$" }, { "_id": "Log", "regex": "\\.log$" }, - { "_id": "Markdown", "regex": "\\.(md|markdown)$" }, - { "_id": "Markup", "regex": "\\.(html|htm)$" }, { "_id": "PDF", "regex": "\\.pdf$" }, - { "_id": "Plain Text", "regex": "\\.txt$" }, { "_id": "Presentation", "regex": "\\.(ppt|pptx)$" }, - { "_id": "Source code", "regex": "\\.(c|py|cpp|js|m|json|java|php|css|toml|yaml|yml)$" }, { "_id": "Spreadsheet", "regex": "\\.(xls|xlsx)$" }, - { "_id": "TOML", "regex": "\\.toml$" }, { "_id": "Tabular data", "regex": "\\.([ct]sv\\.gz|[ct]sv)$" }, - { "_id": "Video", "regex": "\\.(mpeg|mpg|mov|mp4|m4v|mts)$" }, - { "_id": "XML", "regex": "\\.xml$" }, - { "_id": "YAML", "regex": "\\.(yaml|yml)$" } + { "_id": "Video", "regex": "\\.(mpeg|mpg|mov|mp4|m4v|mts)$" } ] } From 
3af4e5854bb3bd7889ac28314cb768bf08f68014 Mon Sep 17 00:00:00 2001 From: Megan Henning Date: Wed, 13 Dec 2017 16:21:16 -0600 Subject: [PATCH 15/16] Add db upgrade for filetypes --- bin/database.py | 97 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 96 insertions(+), 1 deletion(-) diff --git a/bin/database.py b/bin/database.py index 57672b556..c6a79271c 100755 --- a/bin/database.py +++ b/bin/database.py @@ -15,6 +15,7 @@ from api import config from api import util +from api import files as files_module from api.dao import containerutil from api.dao.containerstorage import ProjectStorage from api.jobs.jobs import Job @@ -22,7 +23,7 @@ from api.types import Origin from api.jobs import batch -CURRENT_DATABASE_VERSION = 40 # An int that is bumped when a new schema change is made +CURRENT_DATABASE_VERSION = 41 # An int that is bumped when a new schema change is made def get_db_version(): @@ -1301,6 +1302,100 @@ def upgrade_to_40(): cursor = config.db.acquisitions.find({'timestamp':{'$type':'string'}}) process_cursor(cursor, upgrade_to_40_closure) + +def upgrade_to_41_closure(cont, context): + """ + Re-type files based on new filetypes stored in mongo collection + """ + + # passing filetypes rather than using util function to speed upgrade and skip db lookup + filetypes = context['filetypes'] + cont_name = context['cont_name'] + + files = cont.get('files', []) + + for f in files: + + new_type = None + m_length = 0 + + for document in filetypes: + m = re.search(document['regex'], f['name']) + if m and m_length < len(m.group(0)): + new_type, m_length = document['_id'], len(m.group(0)) + if new_type is not None: + f['type'] = new_type + + config.db[cont_name].update_one({'_id': cont['_id']}, {'$set': {'files': files}}) + + return True + + +def upgrade_to_41(): + """ + Load initial filetypes into mongo, retype existing files + """ + + # It was decided an initial load of filetypes here for existing users was + # easiest way to move those users forward.
Future changes a site's + # filetypes will happen through the API endpoints as expected + filetypes = [ + { "_id": "BVAL", "regex": "\\.(bval|bvals)$" }, + { "_id": "BVEC", "regex": "\\.(bvec|bvecs)$" }, + { "_id": "DICOM", "regex": "\\.(dcm|dcm\\.zip|dicom\\.zip)$" }, + { "_id": "EFile", "regex": "^E.*P.*\\.7$" }, + { "_id": "GE Physio", "regex": "\\.gephysio\\.zip$" }, + { "_id": "MGH Data", "regex": "\\.(mgh|mgz|mgh\\.gz)$" }, + { "_id": "NIfTI", "regex": "\\.(nii\\.gz|nii)$" }, + { "_id": "PAR/REC", "regex": "\\.(parrec\\.zip|par-rec\\.zip)$" }, + { "_id": "PFile Header", "regex": "\\.(7\\.hdr)$" }, + { "_id": "PFile", "regex": "\\.(7\\.gz|7|7\\.zip)$" }, + + { "_id": "EEG", "regex": "\\.eeg\\.zip$" }, + + { "_id": "QC", "regex": "\\.(q[ac]\\.png|q[ac]\\.json|q[ac]\\.html)$" }, + + { "_id": "MATLAB Data", "regex": "\\.mat$" }, + { "_id": "PsychoPy Data", "regex": "\\.psydat$" }, + + { "_id": "C/C++", "regex": "\\.(c|cpp)$" }, + { "_id": "CSS", "regex": "\\.css$" }, + { "_id": "HDF5", "regex": "\\.(h5|hdf5)$" }, + { "_id": "HTML", "regex": "\\.(html|htm)$" }, + { "_id": "JSON", "regex": "\\.json$" }, + { "_id": "Java", "regex": "\\.java$" }, + { "_id": "JavaScript", "regex": "\\.js$" }, + { "_id": "Jupyter", "regex": "\\.ipynb$" }, + { "_id": "MATLAB", "regex": "\\.(m|mex|mlx)$" }, + { "_id": "Markdown", "regex": "\\.(md|markdown)$" }, + { "_id": "PHP", "regex": "\\.php$" }, + { "_id": "Plain Text", "regex": "\\.txt$" }, + { "_id": "Python", "regex": "\\.py$" }, + { "_id": "TOML", "regex": "\\.toml$" }, + { "_id": "XML", "regex": "\\.xml$" }, + { "_id": "YAML", "regex": "\\.(yaml|yml)$" }, + + { "_id": "Archive", "regex": "\\.(zip|tbz2|tar\\.gz|tbz|tar\\.bz2|tgz|tar|txz|tar\\.xz)$" }, + { "_id": "Audio", "regex": "\\.(mp3|wav|wave)$" }, + { "_id": "Document", "regex": "\\.(docx|doc)$" }, + { "_id": "Image", "regex": "\\.(jpg|tif|jpeg|gif|bmp|png|tiff)$" }, + { "_id": "Log", "regex": "\\.log$" }, + { "_id": "PDF", "regex": "\\.pdf$" }, + { "_id": "Presentation", 
"regex": "\\.(ppt|pptx)$" }, + { "_id": "Spreadsheet", "regex": "\\.(xls|xlsx)$" }, + { "_id": "Tabular data", "regex": "\\.([ct]sv\\.gz|[ct]sv)$" }, + { "_id": "Video", "regex": "\\.(mpeg|mpg|mov|mp4|m4v|mts)$" } + ] + + for ft in filetypes: + config.db.filetypes.replace_one({'_id': ft['_id']}, ft, upsert=True) + + for cont_name in ['projects', 'sessions', 'acquisitions', 'analyses', 'collections']: + + # Find all containers that have at least one file + cursor = config.db[cont_name].find({'files': { '$gt': [] }}) + process_cursor(cursor, upgrade_to_41_closure, context={'filetypes': filetypes, 'cont_name': cont_name}) + ### ### BEGIN RESERVED UPGRADE SECTION ### From 0cc3c282524e4aecf15c6bd67c2f67d1131fdf54 Mon Sep 17 00:00:00 2001 From: Gunnar Schaefer Date: Thu, 14 Dec 2017 10:57:14 -0800 Subject: [PATCH 16/16] Add one more case change --- bin/database.py | 2 +- bootstrap.sample.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/database.py b/bin/database.py index c6a79271c..9dcd148c0 100755 --- a/bin/database.py +++ b/bin/database.py @@ -1383,7 +1383,7 @@ def upgrade_to_41(): { "_id": "PDF", "regex": "\\.pdf$" }, { "_id": "Presentation", "regex": "\\.(ppt|pptx)$" }, { "_id": "Spreadsheet", "regex": "\\.(xls|xlsx)$" }, - { "_id": "Tabular data", "regex": "\\.([ct]sv\\.gz|[ct]sv)$" }, + { "_id": "Tabular Data", "regex": "\\.([ct]sv\\.gz|[ct]sv)$" }, { "_id": "Video", "regex": "\\.(mpeg|mpg|mov|mp4|m4v|mts)$" } ] diff --git a/bootstrap.sample.json b/bootstrap.sample.json index a945dfa9f..85709b432 100644 --- a/bootstrap.sample.json +++ b/bootstrap.sample.json @@ -70,7 +70,7 @@ { "_id": "PDF", "regex": "\\.pdf$" }, { "_id": "Presentation", "regex": "\\.(ppt|pptx)$" }, { "_id": "Spreadsheet", "regex": "\\.(xls|xlsx)$" }, - { "_id": "Tabular data", "regex": "\\.([ct]sv\\.gz|[ct]sv)$" }, + { "_id": "Tabular Data", "regex": "\\.([ct]sv\\.gz|[ct]sv)$" }, { "_id": "Video", "regex": "\\.(mpeg|mpg|mov|mp4|m4v|mts)$" } ] }