-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathwhole_proteome_reference.py
More file actions
89 lines (79 loc) · 3.95 KB
/
whole_proteome_reference.py
File metadata and controls
89 lines (79 loc) · 3.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import CoDIAC
import os
import pandas as pd
from pybiomart import Dataset
from CoDIAC import UniProt,InterPro
def _existing_uniprot_ids(data_folder, file_suffix):
    """Return the set of UniProt IDs already present in any reference file.

    Scans `data_folder` for files ending in `file_suffix` so that IDs fetched
    in a previous (possibly interrupted) run are not requested again and the
    files holding them are not overwritten.
    """
    current_uniprots = set()
    for file in os.listdir(data_folder):
        if file.endswith(file_suffix):
            temp_df = pd.read_csv(os.path.join(data_folder, file))
            current_uniprots.update(temp_df['UniProt ID'].tolist())
    return current_uniprots


def _proteome_uniprot_ids(method):
    """Return the list of human-proteome UniProt IDs for the chosen source.

    method -- 'pybiomart' queries Ensembl BioMart over the network;
              'gencode' reads the local GenCode SwissProt metadata table.
    Raises ValueError for any other method name.
    """
    if method == 'pybiomart':
        dataset = Dataset(host='http://useast.ensembl.org',
                          name='hsapiens_gene_ensembl')
        gene_cnv = dataset.query(attributes=['ensembl_gene_id',
                                             'external_gene_name',
                                             'uniprotswissprot'])
        gene_cnv.dropna(axis=0, inplace=True,
                        subset=['UniProtKB/Swiss-Prot ID'])
        return gene_cnv['UniProtKB/Swiss-Prot ID'].tolist()
    if method == 'gencode':
        # Column 1 of the GenCode metadata table holds the SwissProt accession.
        gencode = pd.read_table('gencode.v47.metadata.SwissProt', header=None)
        return gencode[1].tolist()
    # Fix: the original raised a bare ValueError() with no message.
    raise ValueError("method must be 'pybiomart' or 'gencode', got %r" % method)


def _next_file_counter(hs_prot_dir, file_suffix):
    """Return the largest numeric counter embedded in existing reference file
    names (e.g. 'Complete_Proteome_12_<suffix>' -> 12), or 0 if none exist.

    Used to resume numbering after a disrupted reference-file generation run.
    """
    cnt = 0
    for file in hs_prot_dir:
        if file.endswith(file_suffix):
            # Fix: the original did int(file.split('_')[1]), which for names
            # like 'Complete_Proteome_12_<suffix>' is the token 'Proteome' and
            # raises ValueError. The counter is the only all-digit token
            # (the trailing date carries '.csv' and is never pure digits).
            for token in file.split('_'):
                if token.isdigit():
                    cnt = max(cnt, int(token))
    return cnt


def _duplicate_reference_files(data_folder, hs_prot_dir, file_suffix):
    """Return reference files flagged as duplicates of another file.

    Catches the case where a re-run used the same UniProt ID list and wrote
    a second file with identical content.
    """
    rm_files = []
    for file in hs_prot_dir:
        if file.endswith(file_suffix) and file not in rm_files:
            file_content_1 = pd.read_csv(os.path.join(data_folder, file))
            id_list1 = file_content_1['UniProt ID'].tolist()
            for file_2 in hs_prot_dir:
                if (file_2.endswith(file_suffix) and file_2 != file
                        and file_2 not in rm_files):
                    file_content_2 = pd.read_csv(os.path.join(data_folder, file_2))
                    id_list2 = file_content_2['UniProt ID'].tolist()
                    # NOTE(review): this flags file_2 whenever file's IDs are a
                    # SUBSET of file_2's, not strictly equal — preserved as-is;
                    # confirm subset semantics are intended.
                    if set(id_list1).difference(id_list2) == set():
                        print('Found a match for %s' % file)
                        rm_files.append(file_2)
    return rm_files


def main():
    """Build whole-proteome CoDIAC reference file(s).

    Skips UniProt IDs already covered by existing reference files in the data
    folder, then writes either one combined file (single_file=True) or a
    numbered series of 300-ID batch files.
    """
    # Change below to change slight behavior.
    single_file = True
    method = 'gencode'
    # Current Interpro File Data Folder
    data_folder = 'DANSY_DATA/'
    file_suffix = 'Reference_File_20251006.csv'

    # IDs already fetched in earlier runs — never re-fetch or overwrite these.
    current_uniprots = _existing_uniprot_ids(data_folder, file_suffix)

    # Full human-proteome ID list minus what we already have.
    uniprot_ids = list(set(_proteome_uniprot_ids(method))
                       .difference(current_uniprots))
    print('Out of %s there are a total of %s left'
          % (len(set(uniprot_ids)) + len(current_uniprots),
             len(set(uniprot_ids))))

    # Double-check in case there was a disruption during file generation.
    hs_prot_dir = os.listdir(data_folder)
    cnt = _next_file_counter(hs_prot_dir, file_suffix)

    # Second double-check for overlap between two reference files, as a
    # re-run may have used the same UniProt ID list.
    rm_files = _duplicate_reference_files(data_folder, hs_prot_dir, file_suffix)
    if rm_files:
        # Step past the counter already consumed by the duplicated file(s).
        cnt += 1

    if single_file:
        # One combined file for the final set of IDs that are missing.
        ref_file_name = os.path.join(data_folder,
                                     'Complete_Proteome_' + file_suffix)
        _ = CoDIAC.UniProt.makeRefFile(uniprot_ids, ref_file_name)
    else:
        # Batches of 300 IDs: above that, UniProt starts throwing exceptions.
        batch_size = 300
        currently_retrieved = set()
        for x in range(0, round(len(uniprot_ids) / batch_size)):
            batch = uniprot_ids[x * batch_size:batch_size * (x + 1)]
            ref_file_name = os.path.join(
                data_folder, 'Complete_Proteome_' + str(cnt) + '_' + file_suffix)
            _ = CoDIAC.UniProt.makeRefFile(batch, ref_file_name)
            cnt += 1
            currently_retrieved.update(batch)
        # round() can leave a trailing partial batch; fetch whatever remains.
        remaining = set(uniprot_ids).difference(currently_retrieved)
        ref_file_name = os.path.join(
            data_folder, 'Complete_Proteome_' + str(cnt) + '_' + file_suffix)
        _ = CoDIAC.UniProt.makeRefFile(remaining, ref_file_name)
# Script entry point: run the whole-proteome reference build when executed
# directly (not when imported as a module).
if __name__ == '__main__':
    main()