diff --git a/.gitignore b/.gitignore index a564df9..30d83dc 100644 --- a/.gitignore +++ b/.gitignore @@ -26,3 +26,6 @@ generated/ /eclipse /backup ./do_jenkins.sh +test/url_cache +.esg/ +env/ diff --git a/docs/concepts.rst b/docs/concepts.rst index 8f06c9f..d733704 100644 --- a/docs/concepts.rst +++ b/docs/concepts.rst @@ -27,18 +27,22 @@ facets SearchContext Set in constructor fields SearchContext Set in constructor replica SearchContext Set in constructor type SearchContext Create contexts with the right type using :meth:`ResultSet.file_context`, etc. -from SearchContext Not implemented yet. Placeholder name "from_timestamp" -to SearchContext Not implemented yet. Placeholder name "to_timestamp" +from SearchContext Set in constructor. Use "from_timestamp" in the context API. +to SearchContext Set in constructor. Use "to_timestamp" in the context API. fields n/a Managed internally format n/a Managed internally id n/a Managed internally =========== ================ =================================================================================================== +Temporal keywords +''''''''''''''''' -Temporal / Spatial keywords -''''''''''''''''''''''''''' +Temporal keywords are supported for Dataset search. The terms "from_timestamp" and "to_timestamp" should be used with values following the format "YYYY-MM-DDThh:mm:ssZ". 
-Temporal and spatial keywords are not yet supported by :mod:`pyesgf.search` however the API does have placeholders for these keywords anticipating future implementation: +Spatial keywords +'''''''''''''''' + +Spatial keywords are not yet supported by :mod:`pyesgf.search` however the API does have placeholders for these keywords anticipating future implementation: Facet keywords '''''''''''''' diff --git a/docs/examples.rst b/docs/examples.rst index 307d5f1..2316c95 100644 --- a/docs/examples.rst +++ b/docs/examples.rst @@ -52,6 +52,26 @@ Find download URLs for all files in a dataset http://esg-datanode.jpl.nasa.gov/thredds/fileServer/esg_dataroot/obs4MIPs/observations/atmos/tro3/mon/grid/NASA-JPL/TES/v20110608/tro3_TES_L3_tbd_200507-200912.nc http://esg-datanode.jpl.nasa.gov/thredds/fileServer/esg_dataroot/obs4MIPs/observations/atmos/tro3Stderr/mon/grid/NASA-JPL/TES/v20110608/tro3Stderr_TES_L3_tbd_200507-200912.nc +Define a search for datasets that includes a temporal range: + + >>> conn = SearchConnection('http://esgf-index1.ceda.ac.uk/esg-search', + distrib=False) + >>> ctx = conn.new_context(project = "CMIP5", model = "HadGEM2-ES", + time_frequency = "mon", realm = "atmos", ensemble = "r1i1p1", latest = True, + from_timestamp = "2100-12-30T23:23:59Z", to_timestamp = "2200-01-01T00:00:00Z") + >>> ctx.hit_count + 3 + +Or do the same thing by searching without temporal constraints and then applying the constraint: + + >>> ctx = conn.new_context(project = "CMIP5", model = "HadGEM2-ES", + time_frequency = "mon", realm = "atmos", ensemble = "r1i1p1", latest = True) + >>> ctx.hit_count + 21 + >>> ctx = ctx.constrain(from_timestamp = "2100-12-30T23:23:59Z", to_timestamp = "2200-01-01T00:00:00Z") + >>> ctx.hit_count + 3 + Obtain MyProxy credentials to allow downloading files or using secured OPeNDAP >>> from pyesgf.logon import LogonManager diff --git a/docs/logon.rst b/docs/logon.rst index 30642d1..e08bcff 100644 --- a/docs/logon.rst +++ b/docs/logon.rst @@ -4,5 +4,7 
@@ ESGF Security API :mod:`pyesgf` provides a simplified interface to obtaining ESGF credentials. +.. warning:: This interface only works with **Python versions 2.7.9 or greater** (due to an SSL update). + .. automodule:: pyesgf.logon :members: diff --git a/pyesgf/__init__.py b/pyesgf/__init__.py index 9edfdbc..c454589 100644 --- a/pyesgf/__init__.py +++ b/pyesgf/__init__.py @@ -3,7 +3,7 @@ """ -__version__ = '0.1.2' +__version__ = '0.1.6' #!TODO: ResultFormatter class. process response json to specialise the result json. Default is None #!TODO: pipe results to new process. Command-line interface. diff --git a/pyesgf/search/connection.py b/pyesgf/search/connection.py index c7a239f..3a10d2e 100644 --- a/pyesgf/search/connection.py +++ b/pyesgf/search/connection.py @@ -27,7 +27,9 @@ import warnings import logging +logging.basicConfig() log = logging.getLogger(__name__) +log.setLevel(logging.INFO) from .context import DatasetSearchContext from .consts import RESPONSE_FORMAT, SHARD_REXP @@ -126,7 +128,7 @@ def send_wget(self, query_dict, shards=None): def _send_query(self, endpoint, full_query): """ Generally not to be called directly by the user but via SearchContext - instances. + instances. :param full_query: dictionary of query string parameers to send. :return: the urllib2 response object from the query. 
@@ -138,7 +140,16 @@ def _send_query(self, endpoint, full_query): query_url = '%s/%s?%s' % (self.url, endpoint, urlencode(full_query)) log.debug('Query request is %s' % query_url) - response = urllib2.urlopen(query_url) + try: + response = urllib2.urlopen(query_url) + except urllib2.HTTPError, err: + log.warn("HTTP request received error code: %s" % err.code) + if err.code == 400: + errors = set(re.findall("Invalid HTTP query parameter=(\w+)", err.fp.read())) + content = "; ".join([e for e in list(errors)]) + raise Exception("Invalid query parameter(s): %s" % content) + else: + raise Exception("Error returned from URL: %s" % query_url) return response @@ -155,7 +166,12 @@ def _build_query(self, query_dict, limit=None, offset=None, shards=None): else: for port, suffix in self._available_shards[shard]: # suffix should be ommited when querying - shard_specs.append('%s:%s/solr' % (shard, port)) + if not port: + port_string = "" + else: + port_string = ":%s" % port + + shard_specs.append('%s%s/solr' % (shard, port_string)) shard_str = ','.join(shard_specs) else: @@ -229,9 +245,8 @@ def get_shard_list(self): def new_context(self, context_class=None, latest=None, facets=None, fields=None, - #!TODO: add once implemented - #from_timestamp=None, to_timestamp=None, - replica=None, shards=None, + from_timestamp=None, to_timestamp=None, + replica=None, shards=None, search_type=None, **constraints): """ Returns a :class:`pyesgf.search.context.SearchContext` class for @@ -246,10 +261,10 @@ def new_context(self, context_class=None, return context_class(self, constraints, latest=latest, facets=facets, fields=fields, - #!TODO: add once implemented - #from_timestamp=from_timestamp, - #to_timestamp=to_timestamp, + from_timestamp=from_timestamp, + to_timestamp=to_timestamp, replica=replica, shards=shards, + search_type=search_type, ) @@ -266,7 +281,7 @@ def query_keyword_type(keyword): if keyword == 'query': return 'freetext' - elif keyword in ['start', 'end']: + elif keyword in 
['start', 'end', 'from_timestamp', 'to_timestamp']: return 'temporal' elif keyword in ['lat', 'lon', 'bbox', 'location', 'radius', 'polygon']: return 'geospatial' diff --git a/pyesgf/search/consts.py b/pyesgf/search/consts.py index ec82a5f..0950962 100644 --- a/pyesgf/search/consts.py +++ b/pyesgf/search/consts.py @@ -10,4 +10,5 @@ OPERATOR_NEQ = 'not_equal' -SHARD_REXP = r'(?P<host>.*?):(?P<port>\d*)/solr(?P<suffix>.*)' +SHARD_REXP = r'^(?P<prefix>https?://)?(?P<host>.+?):?(?P<port>\d+)?/(?P<suffix>.*)$' + diff --git a/pyesgf/search/context.py b/pyesgf/search/context.py index b88261b..eb061c1 100644 --- a/pyesgf/search/context.py +++ b/pyesgf/search/context.py @@ -65,8 +65,10 @@ def __init__(self, connection, constraints, search_type=None, or only non-latest versions, or None to return both. :param shards: list of shards to restrict searches to. Should be from the list self.connection.get_shard_list() - :param from_timestamp: NotImplemented - :param to_timestamp: NotImplemented + :param from_timestamp: Date-time string to specify start of search range + (e.g. "2000-01-01T00:00:00Z"). + :param to_timestamp: Date-time string to specify end of search range + (e.g. "2100-12-31T23:59:59Z"). 
""" @@ -80,7 +82,7 @@ def __init__(self, connection, constraints, search_type=None, # Constraints self.freetext_constraint = None self.facet_constraints = MultiDict() - self.temporal_constraint = (None, None) + self.temporal_constraint = [from_timestamp, to_timestamp] self.geosplatial_constraint = None self._update_constraints(constraints) @@ -221,7 +223,10 @@ def _update_constraints(self, constraints): self._constrain_freetext(new_freetext) #!TODO: implement temporal and geospatial constraints - #self._constrain_temporal() + if 'from_timestamp' in constraints_split['temporal']: + self.temporal_constraint[0] = constraints_split['temporal']['from_timestamp'] + if 'to_timestamp' in constraints_split['temporal']: + self.temporal_constraint[1] = constraints_split['temporal']['to_timestamp'] #self._constrain_geospatial() # reset cached values @@ -242,18 +247,6 @@ def _constrain_facets(self, facet_constraints): def _constrain_freetext(self, query): self.freetext_constraint = query - def _constrain_temporal(self, start, end): - """ - :param start: a datetime instance specifying the start of the temporal - constraint. - :param end: a datetime instance specifying the end of the temporal - constraint. 
- - """ - #!TODO: support solr date keywords like "NOW" and "NOW-1DAY" - # we will probably need a separate TemporalConstraint object - self.temporal_constraint = (start, end) - def _constrain_geospatial(self, lat=None, lon=None, bbox=None, location=None, radius=None, polygon=None): self.geospatial_constraint = GeospatialConstraint(lat, lon, bbox, location, radius, polygon) @@ -277,6 +270,7 @@ def _split_constraints(self, constraints): from .connection import query_keyword_type constraints_split = dict((kw, MultiDict()) for kw in QUERY_KEYWORD_TYPES) + for kw, val in constraints.items(): constraint_type = query_keyword_type(kw) constraints_split[constraint_type][kw] = val @@ -300,8 +294,8 @@ def _build_query(self): query_dict.extend(self.facet_constraints) #!TODO: encode datetime - #start, end = self.temporal_constraint - #query_dict.update(start=start, end=end) + start, end = self.temporal_constraint + query_dict.update(start=start, end=end) return query_dict diff --git a/setup.py b/setup.py index 255439a..7267343 100644 --- a/setup.py +++ b/setup.py @@ -25,10 +25,10 @@ 'Programming Language :: Python :: 2.6', ], # Get strings from http://pypi.python.org/pypi?%3Aaction=list_classifiers keywords='', - author='Stephen Pascoe', - author_email='Stephen.Pascoe@stfc.ac.uk', + author='Ag Stephens', + author_email='Ag.Stephens@stfc.ac.uk', url='http://esgf-pyclient.readthedocs.org', - download_url='http://github.org/stephenpascoe/esgf-pyclient', + download_url='http://github.com/ESGF/esgf-pyclient', license='BSD', packages=find_packages(exclude=['ez_setup', 'examples', 'test']), include_package_data=True, diff --git a/test/config.py b/test/config.py index 7ae4674..d14bd5d 100644 --- a/test/config.py +++ b/test/config.py @@ -3,6 +3,6 @@ """ -TEST_SERVICE='http://esgf-node.ipsl.fr/esg-search' +TEST_SERVICE='http://esgf-index1.ceda.ac.uk/esg-search' CACHE_DIR = 'url_cache' diff --git a/test/test_connection.py b/test/test_connection.py index b21c3ba..5d9d404 100644 --- 
a/test/test_connection.py +++ b/test/test_connection.py @@ -28,9 +28,9 @@ def test_get_shard_list(): shards = conn.get_shard_list() #!NOTE: the exact shard list will change depending on the shard replication configuration # on the test server - assert 'esgf-node.ipsl.fr' in shards + assert 'esgf-index2.ceda.ac.uk' in shards # IPSL now replicates all non-local shards. Just check it has a few shards - assert len(shards['esgf-node.ipsl.fr']) > 4 + assert len(shards['esgf-index2.ceda.ac.uk']) > 3 def test_url_fixing(): diff --git a/test/test_context.py b/test/test_context.py index f9e375b..fe03cb5 100644 --- a/test/test_context.py +++ b/test/test_context.py @@ -63,14 +63,10 @@ def test_context_facet_options(): conn = SearchConnection(TEST_SERVICE) context = conn.new_context(project='CMIP5', model='IPSL-CM5A-LR', ensemble='r1i1p1', experiment='rcp60', - realm='seaIce' - ) + realm='seaIce') - assert context.get_facet_options().keys() == [ - 'product', 'cf_standard_name', 'variable_long_name', 'cmor_table', - 'time_frequency', 'variable' - ] - + assert context.get_facet_options().keys() == ['data_node', 'cf_standard_name', 'variable_long_name', + 'cmor_table', 'time_frequency', 'variable'] def test_context_facets3(): @@ -153,7 +149,6 @@ def test_negative_facet(): assert hits1 == hits2 + hits3 - def test_replica(): # Test that we can exclude replicas # This tests assumes the test dataset is replicated @@ -169,3 +164,19 @@ def test_replica(): replica=False) assert context.hit_count == 1 + +def test_response_from_bad_parameter(): + # Test that a bad parameter name raises a useful exception + # NOTE::: !!! This fails because urllib2 HTTP query is overrided with + # !!! cache handler instead of usual response. + # !!! 
Fix needs to make sure cached URL request has response exceptions matching urllib2 exception + conn = SearchConnection(TEST_SERVICE) + context = conn.new_context(project='CMIP5', rubbish='nonsense') + context.hit_count + + try: + context.hit_count + except Exception, err: + assert str(err).strip() == "Invalid query parameter(s): rubbish" + + diff --git a/test/test_results.py b/test/test_results.py index 3b37f47..0461851 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -17,7 +17,7 @@ def test_result1(): results = ctx.search() r1 = results[0] - assert re.match(r'cmip5\.output1\.IPSL\..\|vesg.ipsl.fr', r1.dataset_id) + assert re.match(r'cmip5\.output1\.MOHC\..+\|esgf-data1.ceda.ac.uk', r1.dataset_id) def test_file_context(): conn = SearchConnection(TEST_SERVICE, distrib=False) @@ -60,8 +60,7 @@ def test_file_list2(): file_results = f_ctx.search() for file_result in file_results: - print file_result.download_url - assert re.match(r'http://vesg.ipsl.fr/thredds/.*\.nc', file_result.download_url) + assert re.search(r'ds/.*\.nc', file_result.download_url) def test_aggregations(): conn = SearchConnection(TEST_SERVICE, distrib=False) @@ -123,6 +122,7 @@ def test_shards_constrain(): full_query = f_ctx.connection._build_query(query_dict, shards=f_ctx.shards) #!TODO: Force fail to see whether shards is passed through. + # NOTE: 'shards' is NOT even a key in this dictionary. Needs rewrite!!! q_shard = full_query['shards'] # Check it isn't a ',' separated list assert ',' not in q_shard diff --git a/test/test_shard_regex.py b/test/test_shard_regex.py new file mode 100644 index 0000000..aa92b98 --- /dev/null +++ b/test/test_shard_regex.py @@ -0,0 +1,40 @@ +""" +Test regular expression for matching shard end points. 
+""" + +from pyesgf.search.consts import SHARD_REXP +import re + +tests = [ +"https://esgf-test.a.b.c/solr", +"http://esgf.a.c/solr/data", +"http://esgs.a.d:80/data/solr", +"esgf.a.c:80/solr", +"esgf.a.c/solr" +] + +expected = [ +("https://", "esgf-test.a.b.c", None, "solr"), +("http://", "esgf.a.c", None, "solr/data"), +("http://", "esgs.a.d", "80", "data/solr"), +(None, "esgf.a.c", "80", "solr"), +(None, "esgf.a.c", None, "solr") +] + +keys = ("prefix", "host", "port", "suffix") + +R = re.compile("^(?Phttps?://)?(?P.+?):?(?P\d+)?/(?P.+)$") + +def test_regex(): + for i, test in enumerate(tests): + + match = R.match(test) + d = match.groupdict() + values = tuple([d[key] for key in keys]) + + assert values == expected[i] + + +if __name__ == "__main__": + test_regex() + diff --git a/test/test_temporal_search.py b/test/test_temporal_search.py new file mode 100644 index 0000000..2c697ce --- /dev/null +++ b/test/test_temporal_search.py @@ -0,0 +1,38 @@ +""" +test_temporal_search.py +======================= + +Uses CMIP5 to find HadGEM2-ES dataset ids that include data between 1960 and 1962. + +Tests whether the "from_timestamp" and "to_timestamp" options are working in esgf-pyclient. 
+ +""" + +from pyesgf.search import SearchConnection + + +def test_temporal_search_CMIP5(): + conn = SearchConnection('http://esgf-index1.ceda.ac.uk/esg-search', + distrib=False) + + ctx1 = conn.new_context(project = "CMIP5", model = "HadGEM2-ES", + time_frequency = "mon", realm = "atmos", ensemble = "r1i1p1", latest = True) + + ctx2 = conn.new_context(project = "CMIP5", model = "HadGEM2-ES", + time_frequency = "mon", realm = "atmos", ensemble = "r1i1p1", latest = True, + from_timestamp = "2100-12-30T23:23:59Z", to_timestamp = "2200-01-01T00:00:00Z") + + assert ctx2.hit_count < ctx1.hit_count + +def test_temporal_search_CORDEX(): + conn = SearchConnection("http://esgf-data.dkrz.de/esg-search", distrib=True) + + ctx1 = conn.new_context(project='CORDEX', + from_timestamp="1990-01-01T12:00:00Z", + to_timestamp="2100-12-31T12:00:00Z") + + ctx2 = conn.new_context(project='CORDEX', + from_timestamp="2011-01-01T12:00:00Z", + to_timestamp="2100-12-31T12:00:00Z") + + assert ctx2.hit_count < ctx1.hit_count diff --git a/test/test_util.py b/test/test_util.py index c323435..4b6d0c3 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -14,18 +14,18 @@ def test_get_manifest(): conn = SearchConnection(CEDA_SERVICE, distrib=False) - manifest = get_manifest('GeoMIP.output1.MOHC.HadGEM2-ES.G1.day.atmos.day.r1i1p1', + manifest = get_manifest('GeoMIP.output.MOHC.HadGEM2-ES.G1.day.atmos.day.r1i1p1', 20120223, conn) filename = 'psl_day_HadGEM2-ES_G1_r1i1p1_19291201-19291230.nc' - assert manifest[filename]['checksum'] == 'd20bbba8e05d6689f44cf3f8eebb9e7b' + assert manifest[filename]['checksum'] == '5c459a61cfb904ca235ad1f796227114df095d9162a2a3f044bc01f881b532ce' #!TODO: this test belongs somewhere else def test_opendap_url(): conn = SearchConnection(CEDA_SERVICE, distrib=False) ctx = conn.new_context() - results = ctx.search(drs_id='GeoMIP.output1.MOHC.HadGEM2-ES.G1.day.atmos.day.r1i1p1') + results = ctx.search(drs_id='GeoMIP.output.MOHC.HadGEM2-ES.G1.day.atmos.day.r1i1p1') assert 
len(results) == 1 agg_ctx = results[0].aggregation_context() @@ -45,7 +45,7 @@ def test_download_url(): conn = SearchConnection(CEDA_SERVICE, distrib=False) ctx = conn.new_context() - results = ctx.search(drs_id='GeoMIP.output1.MOHC.HadGEM2-ES.G1.day.atmos.day.r1i1p1') + results = ctx.search(drs_id='GeoMIP.output.MOHC.HadGEM2-ES.G1.day.atmos.day.r1i1p1') files = results[0].file_context().search() download_url = files[0].download_url @@ -57,7 +57,7 @@ def test_opendap_fail(): ctx = conn.new_context() results = ctx.search(project='CMIP5', experiment='rcp45', time_frequency='mon', - realm='atmos', ensemble='r1i1p1') + realm='fx', ensemble='r1i1p1') files_ctx = results[0].file_context() hit = files_ctx.search()[0]