From cd550a610be8c01e36024f193c82acf1ccce76cb Mon Sep 17 00:00:00 2001 From: kochbj Date: Sun, 29 May 2022 12:36:32 -0400 Subject: [PATCH 1/4] Generalized get works -Changed works to hidden method called get entities so that it works with authors, concepts, institutions, etc.... -Added global PAGE_LIMIT variable -Added method to set_email since it wasn't clear to me initially how to do that. --- openalexapi/__init__.py | 128 ++++++++++++++++++++++++++++++++++------ 1 file changed, 111 insertions(+), 17 deletions(-) diff --git a/openalexapi/__init__.py b/openalexapi/__init__.py index 211df98..992a070 100644 --- a/openalexapi/__init__.py +++ b/openalexapi/__init__.py @@ -8,7 +8,18 @@ import requests from pydantic import BaseModel, EmailStr +from openalexapi.basetype import OpenAlexBaseType from openalexapi.work import Work +from openalexapi.author import Author +from openalexapi.work import Concept +from openalexapi.venue import Venue +from openalexapi.institution import Institution + +PAGE_LIMIT = 50 # Limit of 200 is imposed by OpenAlex API + + + + logger = logging.getLogger(__name__) @@ -21,13 +32,16 @@ class OpenAlex(BaseModel): """ email: Optional[EmailStr] base_url = "https://api.openalex.org/" + + def set_email(self,email: Optional[EmailStr]): + self.email = email @backoff.on_exception(backoff.expo, - (requests.exceptions.Timeout, - requests.exceptions.ConnectionError), - max_time=60, - on_backoff=print(f"Backing off")) - def get_single_work(self, id: str) -> Optional[Work]: + (requests.exceptions.Timeout, + requests.exceptions.ConnectionError), + max_time=60, + on_backoff=print(f"Backing off")) + def __get_single_entity(self, id: str, oatype: OpenAlexBaseType): """This models the single work entity endpoint :parameter id can be and OpenAlex ID e.g. 
"W123" or a namespace ID like "doi:10.123" @@ -39,7 +53,7 @@ def get_single_work(self, id: str) -> Optional[Work]: "Please be nice and supply your email as the first argument " "when calling this class to get into the polite pool. This way " "OpenAlex can contact you if needed.") - url = self.base_url + "works/" + id + url = self.base_url + oatype.__name__.lower() + 's/' + id logger.debug(f"Fetching {url}") headers = { "Accept": "application/json", @@ -47,19 +61,60 @@ def get_single_work(self, id: str) -> Optional[Work]: } response = requests.get(url, headers=headers) if response.status_code == 200: - return Work(**response.json()) + return oatype(**response.json()) elif response.status_code == 404: return None else: raise ValueError(f"Got {response.status_code} from OpenAlex") + + @backoff.on_exception(backoff.expo, + (requests.exceptions.Timeout, + requests.exceptions.ConnectionError), + max_time=60, + on_backoff=print(f"Backing off")) + def get_single_work(self, id: str) -> Optional[Work]: + return self.__get_single_entity(id, Work) + + @backoff.on_exception(backoff.expo, + (requests.exceptions.Timeout, + requests.exceptions.ConnectionError), + max_time=60, + on_backoff=print(f"Backing off")) + def get_single_author(self, id: str) -> Optional[Author]: + return self.__get_single_entity(id, Author) + + @backoff.on_exception(backoff.expo, + (requests.exceptions.Timeout, + requests.exceptions.ConnectionError), + max_time=60, + on_backoff=print(f"Backing off")) + def get_single_concept(self, id: str) -> Optional[Concept]: + return self.__get_single_entity(id, Concept) + @backoff.on_exception(backoff.expo, + (requests.exceptions.Timeout, + requests.exceptions.ConnectionError), + max_time=60, + on_backoff=print(f"Backing off")) + def get_single_venue(self, id: str) -> Optional[Venue]: + return self.__get_single_entity(id, Venue) + + @backoff.on_exception(backoff.expo, + (requests.exceptions.Timeout, + requests.exceptions.ConnectionError), + max_time=60, + 
on_backoff=print(f"Backing off")) + def get_single_institution(self, id: str) -> Optional[Institution]: + return self.__get_single_entity(id, Venue) + + # TODO: Adapt this to support multiple namespaces @backoff.on_exception(backoff.expo, (requests.exceptions.Timeout, requests.exceptions.ConnectionError), max_time=60, on_backoff=print(f"Backing off")) - def get_multiple_works(self, ids: List[str]) -> List[Work]: + def __get_multiple_entities(self, ids: List[str], oatype: OpenAlexBaseType) -> List[Work]: """Fetches multiple works by OpenAlex IDs. Note this does not support alternative namespaces. @@ -77,20 +132,59 @@ def get_multiple_works(self, ids: List[str]) -> List[Work]: "User-Agent": f"OpenAlexAPI https://github.com/dpriskorn/OpenAlexAPI mailto:{self.email}" } ids = [s.replace("https://openalex.org/", "") for s in ids] - works = [] - # Limit of 50 is imposed by OpenAlex API - for i in range(0, len(ids), 50): - url_ids = '|'.join(ids[i:i+50]) - url = self.base_url + f"works?filter=openalex_id:{url_ids}&per_page=50" - logger.debug(f"Fetching works {i} through {i+50}") + entities = [] + for i in range(0, len(ids), PAGE_LIMIT): + url_ids = '|'.join(ids[i:i+PAGE_LIMIT]) + url = self.base_url + f"{oatype.__name__.lower()}s?filter=openalex_id:{url_ids}&per_page={PAGE_LIMIT}" + logger.debug(f"Fetching {oatype.__name__.lower()}s {i} through {i+PAGE_LIMIT}") response = requests.get(url, headers=headers) if response.status_code == 200: - works += [Work(**w) for w in response.json()['results']] + entities += [oatype(**e) for e in response.json()['results']] elif response.status_code == 403: raise ValueError("Got error 403. 
Are you using OpenAlex IDs?") else: raise ValueError(f"Got {response.status_code} from OpenAlex") - return works + return entities + + @backoff.on_exception(backoff.expo, + (requests.exceptions.Timeout, + requests.exceptions.ConnectionError), + max_time=60, + on_backoff=print(f"Backing off")) + def get_multiple_works(self, ids: List[str]) -> List[Work]: + return self.__get_multiple_entities(ids, Work) + + @backoff.on_exception(backoff.expo, + (requests.exceptions.Timeout, + requests.exceptions.ConnectionError), + max_time=60, + on_backoff=print(f"Backing off")) + def get_multiple_authors(self, ids: List[str]) -> List[Author]: + return self.__get_multiple_entities(ids, Author) + + @backoff.on_exception(backoff.expo, + (requests.exceptions.Timeout, + requests.exceptions.ConnectionError), + max_time=60, + on_backoff=print(f"Backing off")) + def get_multiple_concepts(self, ids: List[str]) -> List[Author]: + return self.__get_multiple_entities(ids, Author) + + @backoff.on_exception(backoff.expo, + (requests.exceptions.Timeout, + requests.exceptions.ConnectionError), + max_time=60, + on_backoff=print(f"Backing off")) + def get_multiple_venues(self, ids: List[str]) -> List[Venue]: + return self.__get_multiple_entities(ids, Venue) + + @backoff.on_exception(backoff.expo, + (requests.exceptions.Timeout, + requests.exceptions.ConnectionError), + max_time=60, + on_backoff=print(f"Backing off")) + def get_multiple_institutions(self, ids: List[str]) -> List[Institution]: + return self.__get_multiple_entities(ids, Institution) @backoff.on_exception(backoff.expo, (requests.exceptions.Timeout, @@ -136,7 +230,7 @@ def get_cited_by_works(self, work: Work, limit: int = None) -> List[Work]: "Accept": "application/json", "User-Agent": f"OpenAlexAPI https://github.com/dpriskorn/OpenAlexAPI mailto:{self.email}" } - per_page = 200 if limit is None else min(200, limit) + per_page = PAGE_LIMIT if limit is None else min(PAGE_LIMIT, limit) works = [] cursor = '*' while cursor: From 
e90b13b58b2028739ed48c98ab386e2993bfc72e Mon Sep 17 00:00:00 2001 From: kochbj Date: Sun, 29 May 2022 12:38:16 -0400 Subject: [PATCH 2/4] Fleshed out authors -Fleshed out author objects -Added fields to ids and years so they work with both authors and works --- openalexapi/author.py | 17 ++++++++++++++--- openalexapi/ids.py | 4 ++++ openalexapi/year.py | 2 ++ 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/openalexapi/author.py b/openalexapi/author.py index 6424564..40ec183 100644 --- a/openalexapi/author.py +++ b/openalexapi/author.py @@ -1,15 +1,26 @@ """ Copyright 2022 Dennis Priskorn """ -from typing import Optional +from typing import Optional, List from openalexapi.basetype import OpenAlexBaseType - +from openalexapi.ids import Ids +from openalexapi.institution import Institution +from openalexapi.year import Year class Author(OpenAlexBaseType): display_name: Optional[str] orcid: Optional[str] - + display_name_alternatives: Optional[List[str]] + works_count: Optional[int] + cited_by_count: Optional[int] + ids: Optional[Ids] + last_known_institution: Optional[Institution] + counts_by_year: Optional[List[Year]] + works_api_url: Optional[str] + updated_date: Optional[str] + created_date: Optional[str] + class Config: arbitrary_types_allowed = True diff --git a/openalexapi/ids.py b/openalexapi/ids.py index 195f8e6..0b68c1c 100644 --- a/openalexapi/ids.py +++ b/openalexapi/ids.py @@ -10,6 +10,9 @@ class Ids(BaseModel): doi: Optional[str] pmid: Optional[str] mag: Optional[str] + twitter: Optional[str] + wikipedia: Optional[str] + scopus: Optional[str] class Config: arbitrary_types_allowed = True @@ -35,3 +38,4 @@ def pmid_id(self): @property def pmid_url(self): return self.pmid + diff --git a/openalexapi/year.py b/openalexapi/year.py index 6f35170..230e772 100644 --- a/openalexapi/year.py +++ b/openalexapi/year.py @@ -2,8 +2,10 @@ Copyright 2022 Dennis Priskorn """ from pydantic import BaseModel, conint +from typing import Optional class 
Year(BaseModel): year: conint(le=2023, ge=0) cited_by_count: int + works_count: Optional[int] From e97491f3588d8ca1bb5d54f501a9249214c030aa Mon Sep 17 00:00:00 2001 From: kochbj Date: Fri, 3 Jun 2022 15:52:26 -0400 Subject: [PATCH 3/4] Large expansion of API Lots of changes: 1. Remove get_works and replace with universal get_entities 2. Exhaustively fleshed out ALL fields in OpenAlex API. This required making some new enums and classes (i.e., geo) 3. Added dehydrated objects for four core object types. Also added a convenience function hydrate. I had not planned to use dehydrated objects, but it does slow things down and I think it's good to clarify to users why fields are missing. 4. Added basic search functionality 5. Added get_associated_works 6. Made explicit set_email method. I needed this to correct the headers using pydantic. Perhaps there is a better way, but I think it's good to allow users to change after construction. TODO: IMPORTANT: 1. More serious unit testing 2. Advanced search, filter, groupby functionality MINOR: 1. If there's a way to not have to do the backoff decorator every time that would be cool. 2. I struggled with the openaccess enum and allowing typing to do None so gave up. 3. I really hate how years are returned. If it's possible to parse these as a dict of lists rather than a list of dicts I'd be all for it. 
--- openalexapi/__init__.py | 255 ++++++++++++++++++++----------------- openalexapi/author.py | 18 ++- openalexapi/authorship.py | 12 +- openalexapi/concept.py | 24 +++- openalexapi/enums.py | 33 ++++- openalexapi/geo.py | 18 +++ openalexapi/ids.py | 12 +- openalexapi/institution.py | 32 ++++- openalexapi/openaccess.py | 4 +- openalexapi/venue.py | 27 ++++ openalexapi/work.py | 12 +- tests/test___init__.py | 17 +++ 12 files changed, 327 insertions(+), 137 deletions(-) create mode 100644 openalexapi/geo.py diff --git a/openalexapi/__init__.py b/openalexapi/__init__.py index 992a070..04a7a02 100644 --- a/openalexapi/__init__.py +++ b/openalexapi/__init__.py @@ -2,7 +2,7 @@ Copyright 2022 Dennis Priskorn """ import logging -from typing import Optional, List +from typing import Optional, List, Union import backoff # type: ignore import requests @@ -10,15 +10,10 @@ from openalexapi.basetype import OpenAlexBaseType from openalexapi.work import Work -from openalexapi.author import Author -from openalexapi.work import Concept -from openalexapi.venue import Venue -from openalexapi.institution import Institution - -PAGE_LIMIT = 50 # Limit of 200 is imposed by OpenAlex API - - - +from openalexapi.author import Author, DehydratedAuthor +from openalexapi.work import Concept, DehydratedConcept +from openalexapi.venue import Venue, DehydratedVenue, HostVenue +from openalexapi.institution import Institution, DehydratedInstitution logger = logging.getLogger(__name__) @@ -31,17 +26,48 @@ class OpenAlex(BaseModel): :parameter=email """ email: Optional[EmailStr] - base_url = "https://api.openalex.org/" - - def set_email(self,email: Optional[EmailStr]): + page_limit: int = 50 + _base_url: str = "https://api.openalex.org/" + _headers: dict = { + "Accept": "application/json", + "User-Agent": f"OpenAlexAPI https://github.com/dpriskorn/OpenAlexAPI" + } + #Convenience dict because dehydrated entities do not have works_api_urls and annoying inconsistencies in endpoints (institution vs 
instititions, host_venue vs venue) + _works_urls: dict = { + Author: _base_url+"works?filter=author.id:", + DehydratedAuthor: _base_url+"works?filter=author.id:", + Concept: _base_url+"works?filter=concept.id:", + DehydratedConcept: _base_url+"works?filter=concept.id:", + Institution: _base_url+"works?filter=institutions.id:", + DehydratedInstitution: _base_url+"works?filter=institutions.id:", + Venue: _base_url+"works?filter=host_venue.id:", + DehydratedVenue: _base_url+"works?filter=host_venue.id:", + Host: _base_url+"works?filter=host_venue.id:" + } + _entities_prefixes: dict = { + 'A':Author, + 'C':Concept, + 'I':Institution, + 'V':Venue, + 'W':Work + } + + class Config: + underscore_attrs_are_private = True + + def set_email(self,email: EmailStr): self.email = email + self._headers = { + "Accept": "application/json", + "User-Agent": f"OpenAlexAPI https://github.com/dpriskorn/OpenAlexAPI mailto:{self.email}" + } @backoff.on_exception(backoff.expo, (requests.exceptions.Timeout, requests.exceptions.ConnectionError), max_time=60, on_backoff=print(f"Backing off")) - def __get_single_entity(self, id: str, oatype: OpenAlexBaseType): + def get_single_entity(self, id: str) -> Union[Author,Concept,Institution,Venue,Work]: """This models the single work entity endpoint :parameter id can be and OpenAlex ID e.g. "W123" or a namespace ID like "doi:10.123" @@ -53,60 +79,27 @@ def __get_single_entity(self, id: str, oatype: OpenAlexBaseType): "Please be nice and supply your email as the first argument " "when calling this class to get into the polite pool. 
This way " "OpenAlex can contact you if needed.") - url = self.base_url + oatype.__name__.lower() + 's/' + id + id =id.replace("https://openalex.org/", "") + if id[0] in self._entities_prefixes: etype = self._entities_prefixes[id[0]] + else: raise ValueError("Id prefix does not correspond to a valid entity.") + url = self._base_url + etype.__name__.lower() + 's/' + id logger.debug(f"Fetching {url}") - headers = { - "Accept": "application/json", - "User-Agent": f"OpenAlexAPI https://github.com/dpriskorn/OpenAlexAPI mailto:{self.email}" - } - response = requests.get(url, headers=headers) + response = requests.get(url, headers=self._headers) if response.status_code == 200: - return oatype(**response.json()) + return etype(**response.json()) elif response.status_code == 404: return None else: raise ValueError(f"Got {response.status_code} from OpenAlex") - - @backoff.on_exception(backoff.expo, - (requests.exceptions.Timeout, - requests.exceptions.ConnectionError), - max_time=60, - on_backoff=print(f"Backing off")) - def get_single_work(self, id: str) -> Optional[Work]: - return self.__get_single_entity(id, Work) - - @backoff.on_exception(backoff.expo, - (requests.exceptions.Timeout, - requests.exceptions.ConnectionError), - max_time=60, - on_backoff=print(f"Backing off")) - def get_single_author(self, id: str) -> Optional[Author]: - return self.__get_single_entity(id, Author) - + + # TODO: Adapt this to support multiple namespaces @backoff.on_exception(backoff.expo, (requests.exceptions.Timeout, requests.exceptions.ConnectionError), max_time=60, - on_backoff=print(f"Backing off")) - def get_single_concept(self, id: str) -> Optional[Concept]: - return self.__get_single_entity(id, Concept) - - @backoff.on_exception(backoff.expo, - (requests.exceptions.Timeout, - requests.exceptions.ConnectionError), - max_time=60, - on_backoff=print(f"Backing off")) - def get_single_venue(self, id: str) -> Optional[Venue]: - return self.__get_single_entity(id, Venue) - - 
@backoff.on_exception(backoff.expo, - (requests.exceptions.Timeout, - requests.exceptions.ConnectionError), - max_time=60, - on_backoff=print(f"Backing off")) - def get_single_institution(self, id: str) -> Optional[Institution]: - return self.__get_single_entity(id, Venue) - + on_backoff=print(f"Backing off")) + def hydrate_entity(self, dehydrated_entity: Union[DehydratedAuthor, DehydratedConcept, DehydratedInstitution, DehydratedVenue, HostVenue]) -> Union[Author,Concept,Institution,Venue]: + return self.get_single_entity(dehydrated_entity.id) # TODO: Adapt this to support multiple namespaces @backoff.on_exception(backoff.expo, @@ -114,7 +107,7 @@ def get_single_institution(self, id: str) -> Optional[Institution]: requests.exceptions.ConnectionError), max_time=60, on_backoff=print(f"Backing off")) - def __get_multiple_entities(self, ids: List[str], oatype: OpenAlexBaseType) -> List[Work]: + def get_entities(self, ids: List[str]) -> Union[List[Author],List[Concept],List[Institution],List[Venue],List[Work]]: """Fetches multiple works by OpenAlex IDs. Note this does not support alternative namespaces. @@ -127,19 +120,17 @@ def __get_multiple_entities(self, ids: List[str], oatype: OpenAlexBaseType) -> L "Please be nice and supply your email as the first argument " "when calling this class to get into the polite pool. 
This way " "OpenAlex can contact you if needed.") - headers = { - "Accept": "application/json", - "User-Agent": f"OpenAlexAPI https://github.com/dpriskorn/OpenAlexAPI mailto:{self.email}" - } ids = [s.replace("https://openalex.org/", "") for s in ids] + if ids[0][0] in self._entities_prefixes: etype = self._entities_prefixes[ids[0][0]] + else: raise ValueError("Id prefix does not correspond to a valid entity.") entities = [] - for i in range(0, len(ids), PAGE_LIMIT): - url_ids = '|'.join(ids[i:i+PAGE_LIMIT]) - url = self.base_url + f"{oatype.__name__.lower()}s?filter=openalex_id:{url_ids}&per_page={PAGE_LIMIT}" - logger.debug(f"Fetching {oatype.__name__.lower()}s {i} through {i+PAGE_LIMIT}") - response = requests.get(url, headers=headers) + for i in range(0, len(ids), self.page_limit): + url_ids = '|'.join(ids[i:i+self.page_limit]) + url = self._base_url + f"{etype.__name__.lower()}s?filter=openalex_id:{url_ids}&per_page={self.page_limit}" + logger.debug(f"Fetching {etype.__name__.lower()}s {i} through {i+self.page_limit}") + response = requests.get(url, headers=self._headers) if response.status_code == 200: - entities += [oatype(**e) for e in response.json()['results']] + entities += [etype(**e) for e in response.json()['results']] elif response.status_code == 403: raise ValueError("Got error 403. 
Are you using OpenAlex IDs?") else: @@ -147,44 +138,13 @@ def __get_multiple_entities(self, ids: List[str], oatype: OpenAlexBaseType) -> L return entities @backoff.on_exception(backoff.expo, - (requests.exceptions.Timeout, - requests.exceptions.ConnectionError), - max_time=60, - on_backoff=print(f"Backing off")) - def get_multiple_works(self, ids: List[str]) -> List[Work]: - return self.__get_multiple_entities(ids, Work) - - @backoff.on_exception(backoff.expo, - (requests.exceptions.Timeout, - requests.exceptions.ConnectionError), - max_time=60, - on_backoff=print(f"Backing off")) - def get_multiple_authors(self, ids: List[str]) -> List[Author]: - return self.__get_multiple_entities(ids, Author) - - @backoff.on_exception(backoff.expo, - (requests.exceptions.Timeout, - requests.exceptions.ConnectionError), - max_time=60, - on_backoff=print(f"Backing off")) - def get_multiple_concepts(self, ids: List[str]) -> List[Author]: - return self.__get_multiple_entities(ids, Author) - - @backoff.on_exception(backoff.expo, - (requests.exceptions.Timeout, - requests.exceptions.ConnectionError), - max_time=60, - on_backoff=print(f"Backing off")) - def get_multiple_venues(self, ids: List[str]) -> List[Venue]: - return self.__get_multiple_entities(ids, Venue) - - @backoff.on_exception(backoff.expo, - (requests.exceptions.Timeout, - requests.exceptions.ConnectionError), - max_time=60, - on_backoff=print(f"Backing off")) - def get_multiple_institutions(self, ids: List[str]) -> List[Institution]: - return self.__get_multiple_entities(ids, Institution) + (requests.exceptions.Timeout, + requests.exceptions.ConnectionError), + max_time=60, + on_backoff=print(f"Backing off")) + def hydrate_entities(self, dehydrated_entities: Union[List[DehydratedAuthor],List[DehydratedConcept],List[DehydratedInstitution],List[DehydratedVenue],List[HostVenue]]) -> Union[List[Author],List[Concept],List[Institution],List[Venue]]: + return self.get_entities([i.id for i in dehydrated_entities]) + 
@backoff.on_exception(backoff.expo, (requests.exceptions.Timeout, @@ -196,7 +156,7 @@ def get_related_works(self, work: Work) -> List[Work]: :parameter work is OpenAlex Work """ - return self.get_multiple_works(work.related_works) + return self.get_entities(work.related_works) @backoff.on_exception(backoff.expo, (requests.exceptions.Timeout, @@ -208,7 +168,7 @@ def get_referenced_works(self, work: Work) -> List[Work]: :parameter work is OpenAlex Work """ - return self.get_multiple_works(work.referenced_works) + return self.get_entities(work.referenced_works) @backoff.on_exception(backoff.expo, (requests.exceptions.Timeout, @@ -226,16 +186,12 @@ def get_cited_by_works(self, work: Work, limit: int = None) -> List[Work]: "Please be nice and supply your email as the first argument " "when calling this class to get into the polite pool. This way " "OpenAlex can contact you if needed.") - headers = { - "Accept": "application/json", - "User-Agent": f"OpenAlexAPI https://github.com/dpriskorn/OpenAlexAPI mailto:{self.email}" - } - per_page = PAGE_LIMIT if limit is None else min(PAGE_LIMIT, limit) + per_page = self.page_limit if limit is None else min(self.page_limit, limit) works = [] cursor = '*' while cursor: url = f"{work.cited_by_api_url}&per_page={per_page}&cursor={cursor}" - response = requests.get(url, headers=headers) + response = requests.get(url, headers=self._headers) if response.status_code == 200: works += [Work(**w) for w in response.json()['results']] cursor = response.json()['meta']['next_cursor'] @@ -244,3 +200,72 @@ def get_cited_by_works(self, work: Work, limit: int = None) -> List[Work]: if limit and len(works) >= limit: break return works[:limit] + + @backoff.on_exception(backoff.expo, + (requests.exceptions.Timeout, + requests.exceptions.ConnectionError), + max_time=60, + on_backoff=print(f"Backing off")) + def get_associated_works(self, entity: Union[Author, DehydratedAuthor, Institution, DehydratedInstitution, Concept, DehydratedConcept, Venue, 
DehydratedVenue], limit: int = None) -> List[Work]: + """Fetches all works associated with the entity, up to some limit. + + :parameter work is OpenAlex Institution, Venue, Author + :parameter limit is the maximum number of works to return + """ + if self.email is None: + print("OpenAlex has 2 pools for clients. " + "Please be nice and supply your email as the first argument " + "when calling this class to get into the polite pool. This way " + "OpenAlex can contact you if needed.") + per_page = self.page_limit if limit is None else min(self.page_limit, limit) + works = [] + cursor = '*' + while cursor: + url = f"{self._works_urls[type(entity)]}{entity.id}&per_page={per_page}&cursor={cursor}" + response = requests.get(url, headers=self._headers) + if response.status_code == 200: + works += [Work(**w) for w in response.json()['results']] + cursor = response.json()['meta']['next_cursor'] + else: + raise ValueError(f"Got {response.status_code} from OpenAlex") + if limit and len(works) >= limit: + break + return works[:limit] + + @backoff.on_exception(backoff.expo, + (requests.exceptions.Timeout, + requests.exceptions.ConnectionError), + max_time=60, + on_backoff=print(f"Backing off")) + def search_entities(self, query: str, entity_type= Union["Author","Concept","Institution","Venue","Work"], limit: int = None) -> Union[List[Author],List[Concept],List[Institution],List[Venue],List[Work]]: + """Fetches all works associated with the entity, up to some limit. + + :parameter work is OpenAlex Institution, Venue, Author + :parameter limit is the maximum number of works to return + """ + + + + if self.email is None: + print("OpenAlex has 2 pools for clients. " + "Please be nice and supply your email as the first argument " + "when calling this class to get into the polite pool. 
This way " + "OpenAlex can contact you if needed.") + + if entity_type[0] in self._entities_prefixes: etype = self._entities_prefixes[entity_type[0]] + else: raise ValueError("Id prefix does not correspond to a valid entity.") + + per_page = self.page_limit if limit is None else min(self.page_limit, limit) + entities = [] + cursor = '*' + while cursor: + url = f"{self._base_url}{etype.__name__.lower()}s?search=\"{query}\"&per_page={per_page}&cursor={cursor}" + response = requests.get(url, headers=self._headers) + if response.status_code == 200: + entities += [etype(**e) for e in response.json()['results']] + cursor = response.json()['meta']['next_cursor'] + else: + raise ValueError(f"Got {response.status_code} from OpenAlex") + if limit and len(entities) >= limit: + break + return entities[:limit] \ No newline at end of file diff --git a/openalexapi/author.py b/openalexapi/author.py index 40ec183..4d2fa02 100644 --- a/openalexapi/author.py +++ b/openalexapi/author.py @@ -5,8 +5,9 @@ from openalexapi.basetype import OpenAlexBaseType from openalexapi.ids import Ids -from openalexapi.institution import Institution +from openalexapi.institution import DehydratedInstitution from openalexapi.year import Year +from openalexapi.concept import DehydratedConcept class Author(OpenAlexBaseType): display_name: Optional[str] @@ -15,11 +16,12 @@ class Author(OpenAlexBaseType): works_count: Optional[int] cited_by_count: Optional[int] ids: Optional[Ids] - last_known_institution: Optional[Institution] + last_known_institution: Optional[DehydratedInstitution] counts_by_year: Optional[List[Year]] works_api_url: Optional[str] updated_date: Optional[str] created_date: Optional[str] + x_concepts: Optional[List[DehydratedConcept]] class Config: arbitrary_types_allowed = True @@ -31,3 +33,15 @@ def orcid_url(self): @property def orcid_id(self): return self.orcid.replace("https://orcid.org/", "") + +class DehydratedAuthor(OpenAlexBaseType): + display_name: Optional[str] + orcid: 
Optional[str] + + @property + def orcid_url(self): + return self.orcid + + @property + def orcid_id(self): + return self.orcid.replace("https://orcid.org/", "") \ No newline at end of file diff --git a/openalexapi/authorship.py b/openalexapi/authorship.py index f3173c2..90ff6b0 100644 --- a/openalexapi/authorship.py +++ b/openalexapi/authorship.py @@ -5,12 +5,12 @@ from pydantic import BaseModel -from openalexapi.author import Author -from openalexapi.institution import Institution - +from openalexapi.author import DehydratedAuthor +from openalexapi.institution import DehydratedInstitution class Authorship(BaseModel): author_position: str - author: Optional[Author] - institutions: Optional[List[Institution]] - raw_affiliation_string: Optional[str] \ No newline at end of file + author: Optional[DehydratedAuthor] + institutions: Optional[List[DehydratedInstitution]] + raw_affiliation_string: Optional[str] + \ No newline at end of file diff --git a/openalexapi/concept.py b/openalexapi/concept.py index 104aa6e..be380c1 100644 --- a/openalexapi/concept.py +++ b/openalexapi/concept.py @@ -1,16 +1,37 @@ """ Copyright 2022 Dennis Priskorn """ -from typing import Optional +from typing import Optional, List from openalexapi.basetype import OpenAlexBaseType +from openalexapi.ids import Ids +from openalexapi.year import Year +class DehydratedConcept(OpenAlexBaseType): + wikidata: Optional[str] + display_name: Optional[str] + level: Optional[int] + class Concept(OpenAlexBaseType): wikidata: Optional[str] display_name: Optional[str] level: Optional[int] score: Optional[float] + description: Optional[str] + works_count: Optional[int] + cited_by_count: Optional[int] + ids: Optional[Ids] + image_url: Optional[str] + image_thumbnail_url:Optional[str] + score: Optional[float] #used for ancestors and related concepts + #TODO: international + ancestors: Optional[List[DehydratedConcept]] + related_concepts: Optional[List[DehydratedConcept]] + counts_by_year: Optional[List[Year]] + 
works_api_url: Optional[str] + updated_date: Optional[str] + created_date: Optional[str] class Config: arbitrary_types_allowed = True @@ -22,3 +43,4 @@ def wikidata_id(self): @property def wikidata_wiki_url(self): return self.wikidata + diff --git a/openalexapi/enums.py b/openalexapi/enums.py index 12bad9b..1645b78 100644 --- a/openalexapi/enums.py +++ b/openalexapi/enums.py @@ -33,4 +33,35 @@ class WorkType(Enum): REPORT = "report" REPORT_SERIES = "report-series" STANDARD = "standard" - STANDARD_SERIES = "standard-series" \ No newline at end of file + STANDARD_SERIES = "standard-series" + +#used in host_venue +class VersionType(Enum): + PUBLISHED_VERSION="publishedVersion" + ACCEPTED_VERSION="acceptedVersion" + SUBMITTED_VERSION="submittedVersion" + +#used in institituion +class InstitutionalRelationship(Enum): + PARENT="parent" + CHILD="child" + RELATED="related" + +#used in institituion +class InstitutionType(Enum): + EDUCATION="education" + HEALTHCARE="healthcare" + COMPANY="company" + ARCHIVE="archive" + NONPROFIT="nonprofit" + GOVERNMENT="government" + FACILITY="facility" + OTHER="other" + +#used in openaccess +class OpenAccessStatus(Enum): + GOLD="gold" + GREEN="green" + HYBRID="hybrid" + BRONZE="bronze" + CLOUD="cloud" \ No newline at end of file diff --git a/openalexapi/geo.py b/openalexapi/geo.py new file mode 100644 index 0000000..f747194 --- /dev/null +++ b/openalexapi/geo.py @@ -0,0 +1,18 @@ +""" +Copyright 2022 Dennis Priskorn +""" +from typing import Optional + +from pydantic import BaseModel, constr + + +class Geo(BaseModel): + city: str + geonames_city_id: str + region: str + country_code: Optional[constr(max_length=2, min_length=2)] + country: str + latitude: float + longitude: float + #TODO: international currenlty undocumented + \ No newline at end of file diff --git a/openalexapi/ids.py b/openalexapi/ids.py index 0b68c1c..255aaf1 100644 --- a/openalexapi/ids.py +++ b/openalexapi/ids.py @@ -1,7 +1,7 @@ """ Copyright 2022 Dennis Priskorn """ 
-from typing import Optional +from typing import Optional, List from pydantic import BaseModel @@ -13,7 +13,15 @@ class Ids(BaseModel): twitter: Optional[str] wikipedia: Optional[str] scopus: Optional[str] - + ror: Optional[str] + grid: Optional[str] + wikidata: Optional[str] + umls_aui: Optional[List[str]] + umls_cui: Optional[List[str]] + issn_l: Optional[str] + issn: Optional[str] + orcid: Optional[str] + class Config: arbitrary_types_allowed = True diff --git a/openalexapi/institution.py b/openalexapi/institution.py index 71f5ad1..be68ba9 100644 --- a/openalexapi/institution.py +++ b/openalexapi/institution.py @@ -1,16 +1,42 @@ """ Copyright 2022 Dennis Priskorn """ -from typing import Optional +from typing import Optional, List from pydantic import constr from openalexapi.basetype import OpenAlexBaseType - +from openalexapi.ids import Ids +from openalexapi.year import Year +from openalexapi.geo import Geo +from openalexapi.enums import InstitutionType, InstitutionalRelationship +from openalexapi.concept import DehydratedConcept class Institution(OpenAlexBaseType): id: Optional[str] display_name: Optional[str] + display_name_alternatives: Optional[List[str]] + ids: Optional[Ids] ror: Optional[str] country_code: Optional[constr(max_length=2, min_length=2)] - type: Optional[str] + type: Optional[InstitutionType] + works_count: Optional[int] + cited_by_count: Optional[int] + counts_by_year: Optional[List[Year]] + works_api_url: Optional[str] + updated_date: Optional[str] + created_date: Optional[str] + homepage_url: Optional[str] + image_url: Optional[str] + image_thumbnail_url: Optional[str] + display_name_acronyms: Optional[List[str]] + relationship: Optional[InstitutionalRelationship] + geo: Optional[Geo] + x_concepts: Optional[List[DehydratedConcept]] + +class DehydratedInstitution(OpenAlexBaseType): + display_name: Optional[str] + ror: Optional[str] + country_code: Optional[constr(max_length=2, min_length=2)] + type: Optional[InstitutionType] + diff --git 
a/openalexapi/openaccess.py b/openalexapi/openaccess.py index 75d8a1b..b3229b1 100644 --- a/openalexapi/openaccess.py +++ b/openalexapi/openaccess.py @@ -4,9 +4,9 @@ from typing import Optional from pydantic import BaseModel - +from openalexapi.enums import OpenAccessStatus class OpenAccess(BaseModel): is_oa: bool - oa_status: Optional[str] + oa_status: Optional[str] #TODO: use class in enums oa_url: Optional[str] diff --git a/openalexapi/venue.py b/openalexapi/venue.py index 1c69353..4673d59 100644 --- a/openalexapi/venue.py +++ b/openalexapi/venue.py @@ -4,6 +4,11 @@ from typing import Optional, List from openalexapi.basetype import OpenAlexBaseType +from openalexapi.year import Year +from openalexapi.enums import VersionType +from openalexapi.concept import DehydratedConcept + + class Venue(OpenAlexBaseType): @@ -16,3 +21,25 @@ class Venue(OpenAlexBaseType): is_oa: Optional[bool] version: Optional[str] license: Optional[str] + works_count: Optional[int] + cited_by_count: Optional[int] + counts_by_year: Optional[List[Year]] + works_api_url: Optional[str] + updated_date: Optional[str] + created_date: Optional[str] + is_in_doaj: Optional[bool] + x_concepts: Optional[List[DehydratedConcept]] + + +class DehydratedVenue(OpenAlexBaseType): + issn_l: Optional[str] + issn: Optional[str] + display_name: Optional[str] + publisher: Optional[str] + +class HostVenue(DehydratedVenue): + url: Optional[str] + is_oa: Optional[bool] + version: Optional[VersionType] + license: Optional[str] + \ No newline at end of file diff --git a/openalexapi/work.py b/openalexapi/work.py index c2218c7..41d41e6 100644 --- a/openalexapi/work.py +++ b/openalexapi/work.py @@ -8,12 +8,12 @@ from openalexapi.basetype import OpenAlexBaseType from openalexapi.authorship import Authorship from openalexapi.biblio import Biblio -from openalexapi.concept import Concept +from openalexapi.concept import DehydratedConcept from openalexapi.enums import WorkType from openalexapi.ids import Ids from 
openalexapi.mesh import Mesh from openalexapi.openaccess import OpenAccess -from openalexapi.venue import Venue +from openalexapi.venue import HostVenue from openalexapi.year import Year @@ -23,16 +23,18 @@ class Work(OpenAlexBaseType): title: Optional[str] publication_year: Optional[conint(le=2023, ge=0)] publication_date: Optional[str] + updated_date: Optional[str] + created_date: Optional[str] type: Optional[WorkType] - host_venue: Optional[Venue] + host_venue: Optional[HostVenue] open_access: Optional[OpenAccess] authorships: Optional[List[Authorship]] cited_by_count: Optional[int] is_retracted: Optional[bool] is_paratext: Optional[bool] - concepts: Optional[List[Concept]] + concepts: Optional[List[DehydratedConcept]] mesh: Optional[List[Mesh]] - alternate_host_venues: Optional[List[Venue]] + alternate_host_venues: Optional[List[HostVenue]] referenced_works: Optional[List[str]] # this is urls like https://openalex.org/W123 related_works: Optional[List[str]] # this is urls like https://openalex.org/W123 abstract_inverted_index: Optional[Dict[str, List[int]]] diff --git a/tests/test___init__.py b/tests/test___init__.py index 4937c31..e07ec8b 100644 --- a/tests/test___init__.py +++ b/tests/test___init__.py @@ -10,6 +10,7 @@ class TestOpenAlex(TestCase): + ''' def test_get_single_work(self): oa = OpenAlex() work = oa.get_single_work("W2741809807") @@ -64,6 +65,22 @@ def test_get_cited_by_works(self): self.assertEqual(len(works), 500) for w in works: self.assertIsInstance(w, Work) + ''' + def test_openalex(self): + openalex=OpenAlex() + institution=openalex.search_entities("University of California - Los Angeles", "Institution",limit=20) + work=openalex.search_entities("Deep Learning of Potential Outcomes", "Work",limit=20) + authors=openalex.search_entities("Bernard Koch", "Author",limit=20) + author=openalex.get_entities([authors[1].id]) + author=openalex.get_single_entity(authors[1].id) + + works = openalex.get_associated_works(author) + works = 
openalex.get_referenced_works(works[0]) + works = openalex.get_cited_by_works(works[0]) + works = openalex.get_associated_works(author.last_known_institution,limit=1) + + institution = openalex.hydrate_entity(author.last_known_institution) + institution = openalex.hydrate_entities([author.last_known_institution]) if __name__ == "__main__": From 0c2d88268a3395393f205043017eac5ab601401f Mon Sep 17 00:00:00 2001 From: kochbj Date: Fri, 3 Jun 2022 15:52:26 -0400 Subject: [PATCH 4/4] Large expansion of API Lots of changes: 1. Remove get_works and replace with universal get_entities 2. Exhaustively fleshed out ALL fields in OpenAlex API. This required making some new enums and classes (e.g., geo) 3. Added dehydrated objects for four core object types. Also added a convenience function hydrate. I had not planned to use dehydrated objects, but it does slow things down and I think it's good to clarify to users why fields are missing. 4. Added basic search functionality 5. Added get_associated_works 6. Made an explicit set_email method. I needed this to correct the headers using pydantic. Perhaps there is a better way, but I think it's good to allow users to change it after construction. TODO: IMPORTANT: 1. More serious unit testing 2. Advanced search, filter, groupby functionality MINOR: 1. If there's a way to not have to do the backoff decorator every time that would be cool. 2. I struggled with the openaccess enum and allowing typing to accept None, so I gave up. 3. I really hate how years are returned. If it's possible to parse these as a dict of lists rather than a list of dicts I'd be all for it.
--- openalexapi/__init__.py | 6 +++--- openalexapi/venue.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/openalexapi/__init__.py b/openalexapi/__init__.py index 04a7a02..ffe6a93 100644 --- a/openalexapi/__init__.py +++ b/openalexapi/__init__.py @@ -11,7 +11,7 @@ from openalexapi.basetype import OpenAlexBaseType from openalexapi.work import Work from openalexapi.author import Author, DehydratedAuthor -from openalexapi.work import Concept, DehydratedConcept +from openalexapi.concept import Concept, DehydratedConcept from openalexapi.venue import Venue, DehydratedVenue, HostVenue from openalexapi.institution import Institution, DehydratedInstitution @@ -42,7 +42,7 @@ class OpenAlex(BaseModel): DehydratedInstitution: _base_url+"works?filter=institutions.id:", Venue: _base_url+"works?filter=host_venue.id:", DehydratedVenue: _base_url+"works?filter=host_venue.id:", - Host: _base_url+"works?filter=host_venue.id:" + HostVenue: _base_url+"works?filter=host_venue.id:" } _entities_prefixes: dict = { 'A':Author, @@ -268,4 +268,4 @@ def search_entities(self, query: str, entity_type= Union["Author","Concept","Ins raise ValueError(f"Got {response.status_code} from OpenAlex") if limit and len(entities) >= limit: break - return entities[:limit] \ No newline at end of file + return entities[:limit] diff --git a/openalexapi/venue.py b/openalexapi/venue.py index 4673d59..29303fa 100644 --- a/openalexapi/venue.py +++ b/openalexapi/venue.py @@ -33,7 +33,7 @@ class Venue(OpenAlexBaseType): class DehydratedVenue(OpenAlexBaseType): issn_l: Optional[str] - issn: Optional[str] + issn: Optional[List[str]] display_name: Optional[str] publisher: Optional[str]