CompBioProject/KEGG_Pull.py at master · KevKap/CompBioProject · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import Bio.KEGG.REST

def compound_ID_pull(compound_name):
    """Gets KEGG compound ID from compound name.

    The name is normalised first: a name that is entirely uppercase (such as
    ATP) is used unchanged, anything else is lower-cased and then capitalised
    (first character upper case, the rest lower case). KEGG is then searched
    with ``kegg_find('compound', ...)`` and the first returned line that
    contains ``<normalised name>;`` is used.

    Args:
        compound_name: Name of the compound to look up (a string).

    Returns:
        The text between the first ':' and the first tab character of the
        matching line, or the "not in the KEGG database" message string when
        no returned line matches.
    """
    # All-uppercase names (e.g. ATP) are kept; everything else is capitalised
    if compound_name.isupper():
        compound_name_update = compound_name
    else:
        compound_name_update = compound_name.lower().capitalize()

    # Returns text object of all compounds that are related
    raw_data = Bio.KEGG.REST.kegg_find('compound', compound_name_update)

    # Looks for the first line holding "<name>;".
    # NOTE(review): a name that is not followed by ';' on its line is never
    # matched by this test -- confirm that is intended.
    wanted = compound_name_update + ';'
    entry = None
    for line in raw_data.readlines():
        if wanted in line:
            entry = line
            break

    # No match: report it explicitly instead of relying on a bare ``except``
    # around an unbound variable. The message is unchanged for callers.
    if entry is None:
        return '\nThis compound is not in the KEGG database. Please set isBiological to False.\n'

    # The ID sits between the first colon and the first tab of the line
    # (presumably lines look like "cpd:<ID><TAB><names>" -- TODO confirm)
    colon = entry.find(':')
    tab = entry.find('\t')

    return entry[colon + 1:tab]


def kegg_URL_generator(compound_name):
    """Generates the KEGG database URL for a compound name.

    Whatever ``compound_ID_pull`` returns for ``compound_name`` is appended
    verbatim to the DBGET ``www_bget?`` address.

    Args:
        compound_name: Name of the compound to look up.

    Returns:
        The URL prefix followed by the string returned by
        ``compound_ID_pull``.
    """
    kegg_id = compound_ID_pull(compound_name)
    base_url = 'https://www.genome.jp/dbget-bin/www_bget?'
    return ''.join((base_url, kegg_id))


def KEGG_databse_number(compound_name):
    """Returns the database link number from the compound page.

    The value returned by ``compound_ID_pull(compound_name)`` is passed
    straight to ``kegg_get``. The lines of the DBLINKS field are then
    counted: the line that starts with ``DBLINKS`` plus every following line
    up to (not including) the next line that starts a new field.

    Args:
        compound_name: Name of the compound to look up.

    Returns:
        Number of lines in the DBLINKS field (0 when there is none).
    """
    # Gets raw data
    raw_data = Bio.KEGG.REST.kegg_get(compound_ID_pull(compound_name))

    inside_dblinks = False
    database_reference_number = 0
    for line in raw_data.readlines():
        if line.startswith('DBLINKS'):
            # The DBLINKS header line itself carries the first link
            inside_dblinks = True
        elif line[:1].strip():
            # A non-blank first character starts another field (ATOM, '///',
            # ...). Stopping here, rather than only on 'ATOM', keeps later
            # lines from being counted when no ATOM section follows.
            # Assumes continuation lines are indented -- TODO confirm.
            inside_dblinks = False
        if inside_dblinks:
            database_reference_number += 1

    # Returns number of outside database links
    return database_reference_number


def KEGG_data_fields(compound_name):
    """Returns the number of data fields for a compound.

    The value returned by ``compound_ID_pull(compound_name)`` is passed
    straight to ``kegg_get``. A line counts as a data field when its first
    whitespace-separated token is one of the approved field names.

    Args:
        compound_name: Name of the compound to look up.

    Returns:
        Number of lines whose first token is an approved field name.
    """
    # Gets raw data
    raw_data = Bio.KEGG.REST.kegg_get(compound_ID_pull(compound_name))

    # Only these field names are deemed worthy of being counted
    approved_fields = ('NAME', 'FORMULA', 'EXACT_MASS', 'MOL_WEIGHT', 'REACTION', 'PATHWAY', 'MODULE', 'ENZYME',
                       'BRITE', 'DBLINKS')

    data_field_number = 0
    for line in raw_data.readlines():
        tokens = line.split()
        # Blank or whitespace-only lines have no first token; skip them
        # instead of failing with IndexError.
        if tokens and tokens[0] in approved_fields:
            data_field_number += 1

    return data_field_number


def KEGG_datarank_vecotr(compound_name):
    """Returns the KEGG vector that will be used in the ranking method.

    Args:
        compound_name: Name of the compound to look up.

    Returns:
        A three-element list ``[data sources, literature, data fields]``
        where data sources comes from ``KEGG_databse_number``, data fields
        from ``KEGG_data_fields`` and literature is always 0.
    """
    # Number of approved data fields on the compound page
    field_count = KEGG_data_fields(compound_name)

    # Number of outside database links
    source_count = KEGG_databse_number(compound_name)

    # Literature is fixed at 0 here since KEGG self references; per the
    # original note it is later assigned the minimal literature value.
    literature_count = 0

    # All ranking vectors keep their entries in these same positions
    return [source_count, literature_count, field_count]