From b1c1eb3814fbb16d668529bf29763f1b8a872be5 Mon Sep 17 00:00:00 2001 From: TeriForey Date: Tue, 3 Jul 2018 15:46:11 +0100 Subject: [PATCH 1/3] Added access to scan helper function to better scale exports --- .../app/search/elasticsearch_dsl_builder.py | 9 ++++++ arches/app/search/search.py | 32 +++++++++++++++++++ eamena/eamena/views/search.py | 7 ++-- 3 files changed, 46 insertions(+), 2 deletions(-) diff --git a/arches/app/search/elasticsearch_dsl_builder.py b/arches/app/search/elasticsearch_dsl_builder.py index 0291767ff2..f49d576f21 100644 --- a/arches/app/search/elasticsearch_dsl_builder.py +++ b/arches/app/search/elasticsearch_dsl_builder.py @@ -112,6 +112,15 @@ def search(self, index='', doc_type='', **kwargs): #print self return self.se.search(index=index, doc_type=doc_type, body=self.dsl) + def scan(self, index='', doc_type='', **kwargs): + self.fields = kwargs.pop('fields', self.fields) + self.start = kwargs.pop('start', self.start) + self.limit = kwargs.pop('limit', self.limit) + + self.prepare() + # print self + return self.se.scan(index=index, doc_type=doc_type, query=self.dsl) + def delete(self, index=''): return self.se.delete(index=index, body=self.dsl) diff --git a/arches/app/search/search.py b/arches/app/search/search.py index 46bdc44c4a..f0350212de 100644 --- a/arches/app/search/search.py +++ b/arches/app/search/search.py @@ -93,6 +93,38 @@ def search(self, **kwargs): return ret + def scan(self, **kwargs): + """ + Search for an item in the index using the scan helper. + Pass an index, doc_type, and id to get a specific document + Pass a query dsl to perform a search + + """ + + query = kwargs.get('query', None) + index = kwargs.get('index', None) + id = kwargs.get('id', None) + + if index is None: + raise NotImplementedError("You must specify an 'index' in your call to search") + + if id: + if isinstance(id, list): + kwargs.setdefault('query', {'ids': kwargs.pop('id')}) + return self.es.mget(**kwargs) + else: + return self.es.get(**kwargs) + + ret = None + try: + ret = helpers.scan(self.es, **kwargs) + except Exception as detail: + self.logger.warning( + '%s: WARNING: search failed for query: %s \nException detail: %s\n' % (datetime.now(), query, detail)) + pass + + return ret + def index_term(self, term, id, context='', options={}): """ If the term is already indexed, then simply increment the count and add the id of the term to the existing index. diff --git a/eamena/eamena/views/search.py b/eamena/eamena/views/search.py index 1299ed0e19..48fd1966ab 100644 --- a/eamena/eamena/views/search.py +++ b/eamena/eamena/views/search.py @@ -143,12 +143,15 @@ def export_results(request): dsl.add_filter(ids_filter) - search_results = dsl.search(index='entity', doc_type='') + search_results = dsl.scan(index='entity', doc_type='') + allres = [] + for res in search_results: + allres.append(res) response = None format = request.GET.get('export', 'csv') exporter = ResourceExporter(format) - results = exporter.export(search_results['hits']['hits']) + results = exporter.export(allres) related_resources = [{'id1':rr.entityid1, 'id2':rr.entityid2, 'type':rr.relationshiptype} for rr in models.RelatedResource.objects.all()] csv_name = 'resource_relationships.csv' From 02c9e3cbec94d5b31ed83dd273569d879cb7680a Mon Sep 17 00:00:00 2001 From: Andrea Zerbini Date: Mon, 24 Sep 2018 09:53:17 +0300 Subject: [PATCH 2/3] update gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index ff3d05ab7f..26628110c3 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ eamena/eamena/elasticsearch eamena/eamena/logs eamena/eamena/uploadedfiles eamena/eamena/bulk_upload +eamena/eamena/additional_resource_graphs virtualenv/ENV tests/elasticsearch tests/logs From edaedfcafe22d63bcb069c7b0404a98337acf913 Mon Sep 17 00:00:00 2001 From: TeriForey Date: Mon, 1 Oct 2018 16:35:12 +0100 Subject: [PATCH 3/3] Removed loop to pullout Resource class, instead raw ES results are exported to JSON --- .../resources/formats/archesjson.py | 20 +------------------ 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/arches/app/utils/data_management/resources/formats/archesjson.py b/arches/app/utils/data_management/resources/formats/archesjson.py index f11bc41ae0..3ff1787468 100644 --- a/arches/app/utils/data_management/resources/formats/archesjson.py +++ b/arches/app/utils/data_management/resources/formats/archesjson.py @@ -30,26 +30,8 @@ def write_resources(self, resources, resource_export_configs): iso_date = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S") json_file_name = os.path.join('{0}_{1}.{2}'.format('EAMENA', iso_date, 'json')) f = StringIO() - - - for count, resource in enumerate(resources, 1): - if count % 1000 == 0: - print "%s Resources exported" % count - errors = [] - - try: - a_resource = Resource().get(resource['_id']) - - a_resource.form_groups = None - json_resources.append(a_resource) - except Exception as e: - if e not in errors: - errors.append(e) - if len(errors) > 0: - print errors[0], ':', len(errors) - - f.write((JSONSerializer().serialize({'resources':json_resources}, indent = 4, separators=(',',':')))) + f.write((JSONSerializer().serialize({'resources': resources}, indent = 4, separators=(',',':')))) json_resources_for_export.append({'name': json_file_name, 'outputfile': f}) return json_resources_for_export