-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathPubchem_Pull.py
More file actions
78 lines (54 loc) · 2.45 KB
/
Pubchem_Pull.py
File metadata and controls
78 lines (54 loc) · 2.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import pubchempy as pcp
import requests
def CID_puller(compound_name):
    """Return the PubChem compound ID (CID) for a compound name, as a string.

    The name is looked up with ``pcp.get_compounds(compound_name, 'name')``
    and the first compound in the result is used.

    Args:
        compound_name: Compound name to search for.

    Returns:
        The ``cid`` attribute of the first matching compound, converted
        with ``str``.

    Raises:
        IndexError: If the lookup returns no compounds for ``compound_name``.
    """
    matches = pcp.get_compounds(compound_name, 'name')
    if not matches:
        # Keep the exception type that indexing an empty result would raise,
        # but say which input failed instead of "list index out of range".
        raise IndexError(
            'No PubChem compound found for name {!r}'.format(compound_name)
        )
    return str(matches[0].cid)
def pubchem_URL_generator(compound_name):
    """Return the PubChem compound-page URL for a compound name.

    The URL is the fixed PubChem compound prefix followed by the CID string
    that ``CID_puller`` returns for ``compound_name``.
    """
    base_url = 'https://pubchem.ncbi.nlm.nih.gov/compound/'
    cid_text = CID_puller(compound_name)
    return base_url + cid_text
def pubmed_article_IDs(compound_name):
    """Return the number of PubMed article IDs cross-referenced to a compound.

    Despite the name, this returns a count, not the IDs themselves. The
    compound name is resolved to a CID with ``CID_puller`` and the PubMedID
    cross-references are requested from PubChem as plain text. Each
    non-blank line of the response is counted as one ID.

    Args:
        compound_name: Compound name to look up.

    Returns:
        int: Number of non-blank lines in the response, or 0 when the
        request answers with HTTP 404.

    Raises:
        requests.HTTPError: For any HTTP error status other than 404.
        requests.RequestException: For connection failures or a timeout.
    """
    compound_CID = CID_puller(compound_name)
    pubmed_articles_url = (
        'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/'
        + compound_CID
        + '/xrefs/PubMedID/TXT'
    )
    # A timeout keeps a stalled connection from blocking the caller forever.
    website_data_raw = requests.get(pubmed_articles_url, timeout=30)
    if website_data_raw.status_code == 404:
        # NOTE(review): 404 is treated as "no PubMed cross-references" rather
        # than an error -- confirm against the PUG REST status-code table.
        return 0
    # Any other error body must not be counted as if its lines were IDs.
    website_data_raw.raise_for_status()
    # splitlines() plus the blank-line filter avoids counting the empty
    # string that split('\n') yields after a trailing newline.
    return sum(1 for line in website_data_raw.text.splitlines() if line.strip())
def data_source_number(compound_name):
    """Return the number of substance IDs (SIDs) linked to a compound.

    The SID count is used as the number of data sources for the compound.

    Args:
        compound_name: Compound name to look up.

    Returns:
        int: Length of the ``'SID'`` list in the first record returned by
        ``pcp.get_sids``, or 0 when no record or no ``'SID'`` entry is
        returned.
    """
    cid = CID_puller(compound_name)
    # Expected shape is a list holding one dictionary whose 'SID' value is
    # the list of substance IDs.
    substance_info = pcp.get_sids(cid)
    if not substance_info:
        # No substance records at all means zero sources, not an IndexError.
        return 0
    substance_record = substance_info[0]
    return len(substance_record.get('SID', []))
def pubchem_datarank_vector(compound_name):
    """Return the PubChem vector used by the ranking method.

    The result is the list ``[data_sources, literature, data_fields]``:
    the source count from ``data_source_number``, half the PubMed article
    count from ``pubmed_article_IDs``, and a fixed data-field count.
    """
    # Fixed data-field count; 24 was chosen from
    # https://pubchempy.readthedocs.io/en/latest/guide/properties.html
    field_count = 24
    source_count = data_source_number(compound_name)
    # The literature score is the total number of PubMed articles, halved to
    # allow for articles that only mention the compound rather than focus on it.
    literature_score = pubmed_article_IDs(compound_name) / 2
    return [source_count, literature_score, field_count]