diff --git a/openalexapi/__init__.py b/openalexapi/__init__.py index 211df98..ffe6a93 100644 --- a/openalexapi/__init__.py +++ b/openalexapi/__init__.py @@ -2,13 +2,19 @@ Copyright 2022 Dennis Priskorn """ import logging -from typing import Optional, List +from typing import Optional, List, Union import backoff # type: ignore import requests from pydantic import BaseModel, EmailStr +from openalexapi.basetype import OpenAlexBaseType from openalexapi.work import Work +from openalexapi.author import Author, DehydratedAuthor +from openalexapi.concept import Concept, DehydratedConcept +from openalexapi.venue import Venue, DehydratedVenue, HostVenue +from openalexapi.institution import Institution, DehydratedInstitution + logger = logging.getLogger(__name__) @@ -20,14 +26,48 @@ class OpenAlex(BaseModel): :parameter=email """ email: Optional[EmailStr] - base_url = "https://api.openalex.org/" + page_limit: int = 50 + _base_url: str = "https://api.openalex.org/" + _headers: dict = { + "Accept": "application/json", + "User-Agent": f"OpenAlexAPI https://github.com/dpriskorn/OpenAlexAPI" + } + #Convenience dict because dehydrated entities do not have works_api_urls and annoying inconsistencies in endpoints (institution vs institutions, host_venue vs venue) + _works_urls: dict = { + Author: _base_url+"works?filter=author.id:", + DehydratedAuthor: _base_url+"works?filter=author.id:", + Concept: _base_url+"works?filter=concept.id:", + DehydratedConcept: _base_url+"works?filter=concept.id:", + Institution: _base_url+"works?filter=institutions.id:", + DehydratedInstitution: _base_url+"works?filter=institutions.id:", + Venue: _base_url+"works?filter=host_venue.id:", + DehydratedVenue: _base_url+"works?filter=host_venue.id:", + HostVenue: _base_url+"works?filter=host_venue.id:" + } + _entities_prefixes: dict = { + 'A':Author, + 'C':Concept, + 'I':Institution, + 'V':Venue, + 'W':Work + } + + class Config: + underscore_attrs_are_private = True + + def set_email(self,email: EmailStr): + 
self.email = email + self._headers = { + "Accept": "application/json", + "User-Agent": f"OpenAlexAPI https://github.com/dpriskorn/OpenAlexAPI mailto:{self.email}" + } @backoff.on_exception(backoff.expo, - (requests.exceptions.Timeout, - requests.exceptions.ConnectionError), - max_time=60, - on_backoff=print(f"Backing off")) - def get_single_work(self, id: str) -> Optional[Work]: + (requests.exceptions.Timeout, + requests.exceptions.ConnectionError), + max_time=60, + on_backoff=print(f"Backing off")) + def get_single_entity(self, id: str) -> Union[Author,Concept,Institution,Venue,Work]: """This models the single work entity endpoint :parameter id can be and OpenAlex ID e.g. "W123" or a namespace ID like "doi:10.123" @@ -39,27 +79,35 @@ def get_single_work(self, id: str) -> Optional[Work]: "Please be nice and supply your email as the first argument " "when calling this class to get into the polite pool. This way " "OpenAlex can contact you if needed.") - url = self.base_url + "works/" + id + id =id.replace("https://openalex.org/", "") + if id[0] in self._entities_prefixes: etype = self._entities_prefixes[id[0]] + else: raise ValueError("Id prefix does not correspond to a valid entity.") + url = self._base_url + etype.__name__.lower() + 's/' + id logger.debug(f"Fetching {url}") - headers = { - "Accept": "application/json", - "User-Agent": f"OpenAlexAPI https://github.com/dpriskorn/OpenAlexAPI mailto:{self.email}" - } - response = requests.get(url, headers=headers) + response = requests.get(url, headers=self._headers) if response.status_code == 200: - return Work(**response.json()) + return etype(**response.json()) elif response.status_code == 404: return None else: raise ValueError(f"Got {response.status_code} from OpenAlex") + # TODO: Adapt this to support multiple namespaces + @backoff.on_exception(backoff.expo, + (requests.exceptions.Timeout, + requests.exceptions.ConnectionError), + max_time=60, + on_backoff=print(f"Backing off")) + def hydrate_entity(self, 
dehydrated_entity: Union[DehydratedAuthor, DehydratedConcept, DehydratedInstitution, DehydratedVenue, HostVenue]) -> Union[Author,Concept,Institution,Venue]: + return self.get_single_entity(dehydrated_entity.id) + # TODO: Adapt this to support multiple namespaces @backoff.on_exception(backoff.expo, (requests.exceptions.Timeout, requests.exceptions.ConnectionError), max_time=60, on_backoff=print(f"Backing off")) - def get_multiple_works(self, ids: List[str]) -> List[Work]: + def get_entities(self, ids: List[str]) -> Union[List[Author],List[Concept],List[Institution],List[Venue],List[Work]]: """Fetches multiple works by OpenAlex IDs. Note this does not support alternative namespaces. @@ -72,25 +120,31 @@ def get_multiple_works(self, ids: List[str]) -> List[Work]: "Please be nice and supply your email as the first argument " "when calling this class to get into the polite pool. This way " "OpenAlex can contact you if needed.") - headers = { - "Accept": "application/json", - "User-Agent": f"OpenAlexAPI https://github.com/dpriskorn/OpenAlexAPI mailto:{self.email}" - } ids = [s.replace("https://openalex.org/", "") for s in ids] - works = [] - # Limit of 50 is imposed by OpenAlex API - for i in range(0, len(ids), 50): - url_ids = '|'.join(ids[i:i+50]) - url = self.base_url + f"works?filter=openalex_id:{url_ids}&per_page=50" - logger.debug(f"Fetching works {i} through {i+50}") - response = requests.get(url, headers=headers) + if ids[0][0] in self._entities_prefixes: etype = self._entities_prefixes[ids[0][0]] + else: raise ValueError("Id prefix does not correspond to a valid entity.") + entities = [] + for i in range(0, len(ids), self.page_limit): + url_ids = '|'.join(ids[i:i+self.page_limit]) + url = self._base_url + f"{etype.__name__.lower()}s?filter=openalex_id:{url_ids}&per_page={self.page_limit}" + logger.debug(f"Fetching {etype.__name__.lower()}s {i} through {i+self.page_limit}") + response = requests.get(url, headers=self._headers) if response.status_code == 200: - 
works += [Work(**w) for w in response.json()['results']] + entities += [etype(**e) for e in response.json()['results']] elif response.status_code == 403: raise ValueError("Got error 403. Are you using OpenAlex IDs?") else: raise ValueError(f"Got {response.status_code} from OpenAlex") - return works + return entities + + @backoff.on_exception(backoff.expo, + (requests.exceptions.Timeout, + requests.exceptions.ConnectionError), + max_time=60, + on_backoff=print(f"Backing off")) + def hydrate_entities(self, dehydrated_entities: Union[List[DehydratedAuthor],List[DehydratedConcept],List[DehydratedInstitution],List[DehydratedVenue],List[HostVenue]]) -> Union[List[Author],List[Concept],List[Institution],List[Venue]]: + return self.get_entities([i.id for i in dehydrated_entities]) + @backoff.on_exception(backoff.expo, (requests.exceptions.Timeout, @@ -102,7 +156,7 @@ def get_related_works(self, work: Work) -> List[Work]: :parameter work is OpenAlex Work """ - return self.get_multiple_works(work.related_works) + return self.get_entities(work.related_works) @backoff.on_exception(backoff.expo, (requests.exceptions.Timeout, @@ -114,7 +168,7 @@ def get_referenced_works(self, work: Work) -> List[Work]: :parameter work is OpenAlex Work """ - return self.get_multiple_works(work.referenced_works) + return self.get_entities(work.referenced_works) @backoff.on_exception(backoff.expo, (requests.exceptions.Timeout, @@ -132,16 +186,12 @@ def get_cited_by_works(self, work: Work, limit: int = None) -> List[Work]: "Please be nice and supply your email as the first argument " "when calling this class to get into the polite pool. 
This way " "OpenAlex can contact you if needed.") - headers = { - "Accept": "application/json", - "User-Agent": f"OpenAlexAPI https://github.com/dpriskorn/OpenAlexAPI mailto:{self.email}" - } - per_page = 200 if limit is None else min(200, limit) + per_page = self.page_limit if limit is None else min(self.page_limit, limit) works = [] cursor = '*' while cursor: url = f"{work.cited_by_api_url}&per_page={per_page}&cursor={cursor}" - response = requests.get(url, headers=headers) + response = requests.get(url, headers=self._headers) if response.status_code == 200: works += [Work(**w) for w in response.json()['results']] cursor = response.json()['meta']['next_cursor'] @@ -150,3 +200,72 @@ def get_cited_by_works(self, work: Work, limit: int = None) -> List[Work]: if limit and len(works) >= limit: break return works[:limit] + + @backoff.on_exception(backoff.expo, + (requests.exceptions.Timeout, + requests.exceptions.ConnectionError), + max_time=60, + on_backoff=print(f"Backing off")) + def get_associated_works(self, entity: Union[Author, DehydratedAuthor, Institution, DehydratedInstitution, Concept, DehydratedConcept, Venue, DehydratedVenue], limit: int = None) -> List[Work]: + """Fetches all works associated with the entity, up to some limit. + + :parameter work is OpenAlex Institution, Venue, Author + :parameter limit is the maximum number of works to return + """ + if self.email is None: + print("OpenAlex has 2 pools for clients. " + "Please be nice and supply your email as the first argument " + "when calling this class to get into the polite pool. 
This way " + "OpenAlex can contact you if needed.") + per_page = self.page_limit if limit is None else min(self.page_limit, limit) + works = [] + cursor = '*' + while cursor: + url = f"{self._works_urls[type(entity)]}{entity.id}&per_page={per_page}&cursor={cursor}" + response = requests.get(url, headers=self._headers) + if response.status_code == 200: + works += [Work(**w) for w in response.json()['results']] + cursor = response.json()['meta']['next_cursor'] + else: + raise ValueError(f"Got {response.status_code} from OpenAlex") + if limit and len(works) >= limit: + break + return works[:limit] + + @backoff.on_exception(backoff.expo, + (requests.exceptions.Timeout, + requests.exceptions.ConnectionError), + max_time=60, + on_backoff=print(f"Backing off")) + def search_entities(self, query: str, entity_type= Union["Author","Concept","Institution","Venue","Work"], limit: int = None) -> Union[List[Author],List[Concept],List[Institution],List[Venue],List[Work]]: + """Fetches all works associated with the entity, up to some limit. + + :parameter work is OpenAlex Institution, Venue, Author + :parameter limit is the maximum number of works to return + """ + + + + if self.email is None: + print("OpenAlex has 2 pools for clients. " + "Please be nice and supply your email as the first argument " + "when calling this class to get into the polite pool. 
This way " + "OpenAlex can contact you if needed.") + + if entity_type[0] in self._entities_prefixes: etype = self._entities_prefixes[entity_type[0]] + else: raise ValueError("Id prefix does not correspond to a valid entity.") + + per_page = self.page_limit if limit is None else min(self.page_limit, limit) + entities = [] + cursor = '*' + while cursor: + url = f"{self._base_url}{etype.__name__.lower()}s?search=\"{query}\"&per_page={per_page}&cursor={cursor}" + response = requests.get(url, headers=self._headers) + if response.status_code == 200: + entities += [etype(**e) for e in response.json()['results']] + cursor = response.json()['meta']['next_cursor'] + else: + raise ValueError(f"Got {response.status_code} from OpenAlex") + if limit and len(entities) >= limit: + break + return entities[:limit] diff --git a/openalexapi/author.py b/openalexapi/author.py index 6424564..4d2fa02 100644 --- a/openalexapi/author.py +++ b/openalexapi/author.py @@ -1,15 +1,28 @@ """ Copyright 2022 Dennis Priskorn """ -from typing import Optional +from typing import Optional, List from openalexapi.basetype import OpenAlexBaseType - +from openalexapi.ids import Ids +from openalexapi.institution import DehydratedInstitution +from openalexapi.year import Year +from openalexapi.concept import DehydratedConcept class Author(OpenAlexBaseType): display_name: Optional[str] orcid: Optional[str] - + display_name_alternatives: Optional[List[str]] + works_count: Optional[int] + cited_by_count: Optional[int] + ids: Optional[Ids] + last_known_institution: Optional[DehydratedInstitution] + counts_by_year: Optional[List[Year]] + works_api_url: Optional[str] + updated_date: Optional[str] + created_date: Optional[str] + x_concepts: Optional[List[DehydratedConcept]] + class Config: arbitrary_types_allowed = True @@ -20,3 +33,15 @@ def orcid_url(self): @property def orcid_id(self): return self.orcid.replace("https://orcid.org/", "") + +class DehydratedAuthor(OpenAlexBaseType): + display_name: Optional[str] 
+ orcid: Optional[str] + + @property + def orcid_url(self): + return self.orcid + + @property + def orcid_id(self): + return self.orcid.replace("https://orcid.org/", "") \ No newline at end of file diff --git a/openalexapi/authorship.py b/openalexapi/authorship.py index f3173c2..90ff6b0 100644 --- a/openalexapi/authorship.py +++ b/openalexapi/authorship.py @@ -5,12 +5,12 @@ from pydantic import BaseModel -from openalexapi.author import Author -from openalexapi.institution import Institution - +from openalexapi.author import DehydratedAuthor +from openalexapi.institution import DehydratedInstitution class Authorship(BaseModel): author_position: str - author: Optional[Author] - institutions: Optional[List[Institution]] - raw_affiliation_string: Optional[str] \ No newline at end of file + author: Optional[DehydratedAuthor] + institutions: Optional[List[DehydratedInstitution]] + raw_affiliation_string: Optional[str] + \ No newline at end of file diff --git a/openalexapi/concept.py b/openalexapi/concept.py index 104aa6e..be380c1 100644 --- a/openalexapi/concept.py +++ b/openalexapi/concept.py @@ -1,16 +1,37 @@ """ Copyright 2022 Dennis Priskorn """ -from typing import Optional +from typing import Optional, List from openalexapi.basetype import OpenAlexBaseType +from openalexapi.ids import Ids +from openalexapi.year import Year +class DehydratedConcept(OpenAlexBaseType): + wikidata: Optional[str] + display_name: Optional[str] + level: Optional[int] + class Concept(OpenAlexBaseType): wikidata: Optional[str] display_name: Optional[str] level: Optional[int] score: Optional[float] + description: Optional[str] + works_count: Optional[int] + cited_by_count: Optional[int] + ids: Optional[Ids] + image_url: Optional[str] + image_thumbnail_url:Optional[str] + score: Optional[float] #used for ancestors and related concepts + #TODO: international + ancestors: Optional[List[DehydratedConcept]] + related_concepts: Optional[List[DehydratedConcept]] + counts_by_year: 
Optional[List[Year]] + works_api_url: Optional[str] + updated_date: Optional[str] + created_date: Optional[str] class Config: arbitrary_types_allowed = True @@ -22,3 +43,4 @@ def wikidata_id(self): @property def wikidata_wiki_url(self): return self.wikidata + diff --git a/openalexapi/enums.py b/openalexapi/enums.py index 12bad9b..1645b78 100644 --- a/openalexapi/enums.py +++ b/openalexapi/enums.py @@ -33,4 +33,35 @@ class WorkType(Enum): REPORT = "report" REPORT_SERIES = "report-series" STANDARD = "standard" - STANDARD_SERIES = "standard-series" \ No newline at end of file + STANDARD_SERIES = "standard-series" + +#used in host_venue +class VersionType(Enum): + PUBLISHED_VERSION="publishedVersion" + ACCEPTED_VERSION="acceptedVersion" + SUBMITTED_VERSION="submittedVersion" + +#used in institution +class InstitutionalRelationship(Enum): + PARENT="parent" + CHILD="child" + RELATED="related" + +#used in institution +class InstitutionType(Enum): + EDUCATION="education" + HEALTHCARE="healthcare" + COMPANY="company" + ARCHIVE="archive" + NONPROFIT="nonprofit" + GOVERNMENT="government" + FACILITY="facility" + OTHER="other" + +#used in openaccess +class OpenAccessStatus(Enum): + GOLD="gold" + GREEN="green" + HYBRID="hybrid" + BRONZE="bronze" + CLOUD="cloud" \ No newline at end of file diff --git a/openalexapi/geo.py b/openalexapi/geo.py new file mode 100644 index 0000000..f747194 --- /dev/null +++ b/openalexapi/geo.py @@ -0,0 +1,18 @@ +""" +Copyright 2022 Dennis Priskorn +""" +from typing import Optional + +from pydantic import BaseModel, constr + + +class Geo(BaseModel): + city: str + geonames_city_id: str + region: str + country_code: Optional[constr(max_length=2, min_length=2)] + country: str + latitude: float + longitude: float + #TODO: international currently undocumented + \ No newline at end of file diff --git a/openalexapi/ids.py b/openalexapi/ids.py index 195f8e6..255aaf1 100644 --- a/openalexapi/ids.py +++ b/openalexapi/ids.py @@ -1,7 +1,7 @@ """ Copyright 2022 
Dennis Priskorn """ -from typing import Optional +from typing import Optional, List from pydantic import BaseModel @@ -10,7 +10,18 @@ class Ids(BaseModel): doi: Optional[str] pmid: Optional[str] mag: Optional[str] - + twitter: Optional[str] + wikipedia: Optional[str] + scopus: Optional[str] + ror: Optional[str] + grid: Optional[str] + wikidata: Optional[str] + umls_aui: Optional[List[str]] + umls_cui: Optional[List[str]] + issn_l: Optional[str] + issn: Optional[str] + orcid: Optional[str] + class Config: arbitrary_types_allowed = True @@ -35,3 +46,4 @@ def pmid_id(self): @property def pmid_url(self): return self.pmid + diff --git a/openalexapi/institution.py b/openalexapi/institution.py index 71f5ad1..be68ba9 100644 --- a/openalexapi/institution.py +++ b/openalexapi/institution.py @@ -1,16 +1,42 @@ """ Copyright 2022 Dennis Priskorn """ -from typing import Optional +from typing import Optional, List from pydantic import constr from openalexapi.basetype import OpenAlexBaseType - +from openalexapi.ids import Ids +from openalexapi.year import Year +from openalexapi.geo import Geo +from openalexapi.enums import InstitutionType, InstitutionalRelationship +from openalexapi.concept import DehydratedConcept class Institution(OpenAlexBaseType): id: Optional[str] display_name: Optional[str] + display_name_alternatives: Optional[List[str]] + ids: Optional[Ids] ror: Optional[str] country_code: Optional[constr(max_length=2, min_length=2)] - type: Optional[str] + type: Optional[InstitutionType] + works_count: Optional[int] + cited_by_count: Optional[int] + counts_by_year: Optional[List[Year]] + works_api_url: Optional[str] + updated_date: Optional[str] + created_date: Optional[str] + homepage_url: Optional[str] + image_url: Optional[str] + image_thumbnail_url: Optional[str] + display_name_acronyms: Optional[List[str]] + relationship: Optional[InstitutionalRelationship] + geo: Optional[Geo] + x_concepts: Optional[List[DehydratedConcept]] + +class 
DehydratedInstitution(OpenAlexBaseType): + display_name: Optional[str] + ror: Optional[str] + country_code: Optional[constr(max_length=2, min_length=2)] + type: Optional[InstitutionType] + diff --git a/openalexapi/openaccess.py b/openalexapi/openaccess.py index 75d8a1b..b3229b1 100644 --- a/openalexapi/openaccess.py +++ b/openalexapi/openaccess.py @@ -4,9 +4,9 @@ from typing import Optional from pydantic import BaseModel - +from openalexapi.enums import OpenAccessStatus class OpenAccess(BaseModel): is_oa: bool - oa_status: Optional[str] + oa_status: Optional[str] #TODO: use class in enums oa_url: Optional[str] diff --git a/openalexapi/venue.py b/openalexapi/venue.py index 1c69353..29303fa 100644 --- a/openalexapi/venue.py +++ b/openalexapi/venue.py @@ -4,6 +4,11 @@ from typing import Optional, List from openalexapi.basetype import OpenAlexBaseType +from openalexapi.year import Year +from openalexapi.enums import VersionType +from openalexapi.concept import DehydratedConcept + + class Venue(OpenAlexBaseType): @@ -16,3 +21,25 @@ class Venue(OpenAlexBaseType): is_oa: Optional[bool] version: Optional[str] license: Optional[str] + works_count: Optional[int] + cited_by_count: Optional[int] + counts_by_year: Optional[List[Year]] + works_api_url: Optional[str] + updated_date: Optional[str] + created_date: Optional[str] + is_in_doaj: Optional[bool] + x_concepts: Optional[List[DehydratedConcept]] + + +class DehydratedVenue(OpenAlexBaseType): + issn_l: Optional[str] + issn: Optional[List[str]] + display_name: Optional[str] + publisher: Optional[str] + +class HostVenue(DehydratedVenue): + url: Optional[str] + is_oa: Optional[bool] + version: Optional[VersionType] + license: Optional[str] + \ No newline at end of file diff --git a/openalexapi/work.py b/openalexapi/work.py index c2218c7..41d41e6 100644 --- a/openalexapi/work.py +++ b/openalexapi/work.py @@ -8,12 +8,12 @@ from openalexapi.basetype import OpenAlexBaseType from openalexapi.authorship import Authorship from 
openalexapi.biblio import Biblio -from openalexapi.concept import Concept +from openalexapi.concept import DehydratedConcept from openalexapi.enums import WorkType from openalexapi.ids import Ids from openalexapi.mesh import Mesh from openalexapi.openaccess import OpenAccess -from openalexapi.venue import Venue +from openalexapi.venue import HostVenue from openalexapi.year import Year @@ -23,16 +23,18 @@ class Work(OpenAlexBaseType): title: Optional[str] publication_year: Optional[conint(le=2023, ge=0)] publication_date: Optional[str] + updated_date: Optional[str] + created_date: Optional[str] type: Optional[WorkType] - host_venue: Optional[Venue] + host_venue: Optional[HostVenue] open_access: Optional[OpenAccess] authorships: Optional[List[Authorship]] cited_by_count: Optional[int] is_retracted: Optional[bool] is_paratext: Optional[bool] - concepts: Optional[List[Concept]] + concepts: Optional[List[DehydratedConcept]] mesh: Optional[List[Mesh]] - alternate_host_venues: Optional[List[Venue]] + alternate_host_venues: Optional[List[HostVenue]] referenced_works: Optional[List[str]] # this is urls like https://openalex.org/W123 related_works: Optional[List[str]] # this is urls like https://openalex.org/W123 abstract_inverted_index: Optional[Dict[str, List[int]]] diff --git a/openalexapi/year.py b/openalexapi/year.py index 6f35170..230e772 100644 --- a/openalexapi/year.py +++ b/openalexapi/year.py @@ -2,8 +2,10 @@ Copyright 2022 Dennis Priskorn """ from pydantic import BaseModel, conint +from typing import Optional class Year(BaseModel): year: conint(le=2023, ge=0) cited_by_count: int + works_count: Optional[int] diff --git a/tests/test___init__.py b/tests/test___init__.py index 4937c31..e07ec8b 100644 --- a/tests/test___init__.py +++ b/tests/test___init__.py @@ -10,6 +10,7 @@ class TestOpenAlex(TestCase): + ''' def test_get_single_work(self): oa = OpenAlex() work = oa.get_single_work("W2741809807") @@ -64,6 +65,22 @@ def test_get_cited_by_works(self): 
self.assertEqual(len(works), 500) for w in works: self.assertIsInstance(w, Work) + ''' + def test_openalex(self): + openalex=OpenAlex() + institution=openalex.search_entities("University of California - Los Angeles", "Institution",limit=20) + work=openalex.search_entities("Deep Learning of Potential Outcomes", "Work",limit=20) + authors=openalex.search_entities("Bernard Koch", "Author",limit=20) + author=openalex.get_entities([authors[1].id]) + author=openalex.get_single_entity(authors[1].id) + + works = openalex.get_associated_works(author) + works = openalex.get_referenced_works(works[0]) + works = openalex.get_cited_by_works(works[0]) + works = openalex.get_associated_works(author.last_known_institution,limit=1) + + institution = openalex.hydrate_entity(author.last_known_institution) + institution = openalex.hydrate_entities([author.last_known_institution]) if __name__ == "__main__":