-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathwhole_proteome_reference.py
More file actions
89 lines (79 loc) · 3.95 KB
/
whole_proteome_reference.py
File metadata and controls
89 lines (79 loc) · 3.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import CoDIAC
import os
import pandas as pd
from pybiomart import Dataset
from CoDIAC import UniProt,InterPro
def _existing_uniprot_ids(data_folder, file_suffix):
    """Return the set of UniProt IDs already present in any reference file.

    Scans `data_folder` for files ending in `file_suffix` so that IDs fetched
    in a previous (possibly interrupted) run are not requested again and the
    files holding them are not overwritten.
    """
    current_uniprots = set()
    for file in os.listdir(data_folder):
        if file.endswith(file_suffix):
            temp_df = pd.read_csv(os.path.join(data_folder, file))
            current_uniprots.update(temp_df['UniProt ID'].tolist())
    return current_uniprots


def _proteome_uniprot_ids(method):
    """Return the list of human-proteome UniProt IDs for the chosen source.

    method -- 'pybiomart' queries Ensembl BioMart over the network;
              'gencode' reads the local GenCode SwissProt metadata table.
    Raises ValueError for any other method name.
    """
    if method == 'pybiomart':
        dataset = Dataset(host='http://useast.ensembl.org',
                          name='hsapiens_gene_ensembl')
        gene_cnv = dataset.query(attributes=['ensembl_gene_id',
                                             'external_gene_name',
                                             'uniprotswissprot'])
        gene_cnv.dropna(axis=0, inplace=True,
                        subset=['UniProtKB/Swiss-Prot ID'])
        return gene_cnv['UniProtKB/Swiss-Prot ID'].tolist()
    if method == 'gencode':
        # Column 1 of the GenCode metadata table holds the SwissProt accession.
        gencode = pd.read_table('gencode.v47.metadata.SwissProt', header=None)
        return gencode[1].tolist()
    # Fix: the original raised a bare ValueError() with no message.
    raise ValueError("method must be 'pybiomart' or 'gencode', got %r" % method)


def _next_file_counter(hs_prot_dir, file_suffix):
    """Return the largest numeric counter embedded in existing reference file
    names (e.g. 'Complete_Proteome_12_<suffix>' -> 12), or 0 if none exist.

    Used to resume numbering after a disrupted reference-file generation run.
    """
    cnt = 0
    for file in hs_prot_dir:
        if file.endswith(file_suffix):
            # Fix: the original did int(file.split('_')[1]), which for names
            # like 'Complete_Proteome_12_<suffix>' is the token 'Proteome' and
            # raises ValueError. The counter is the only all-digit token
            # (the trailing date carries '.csv' and is never pure digits).
            for token in file.split('_'):
                if token.isdigit():
                    cnt = max(cnt, int(token))
    return cnt


def _duplicate_reference_files(data_folder, hs_prot_dir, file_suffix):
    """Return reference files flagged as duplicates of another file.

    Catches the case where a re-run used the same UniProt ID list and wrote
    a second file with identical content.
    """
    rm_files = []
    for file in hs_prot_dir:
        if file.endswith(file_suffix) and file not in rm_files:
            file_content_1 = pd.read_csv(os.path.join(data_folder, file))
            id_list1 = file_content_1['UniProt ID'].tolist()
            for file_2 in hs_prot_dir:
                if (file_2.endswith(file_suffix) and file_2 != file
                        and file_2 not in rm_files):
                    file_content_2 = pd.read_csv(os.path.join(data_folder, file_2))
                    id_list2 = file_content_2['UniProt ID'].tolist()
                    # NOTE(review): this flags file_2 whenever file's IDs are a
                    # SUBSET of file_2's, not strictly equal — preserved as-is;
                    # confirm subset semantics are intended.
                    if set(id_list1).difference(id_list2) == set():
                        print('Found a match for %s' % file)
                        rm_files.append(file_2)
    return rm_files


def main():
    """Build whole-proteome CoDIAC reference file(s).

    Skips UniProt IDs already covered by existing reference files in the data
    folder, then writes either one combined file (single_file=True) or a
    numbered series of 300-ID batch files.
    """
    # Change below to change slight behavior.
    single_file = True
    method = 'gencode'
    # Current Interpro File Data Folder
    data_folder = 'DANSY_DATA/'
    file_suffix = 'Reference_File_20251006.csv'

    # IDs already fetched in earlier runs — never re-fetch or overwrite these.
    current_uniprots = _existing_uniprot_ids(data_folder, file_suffix)

    # Full human-proteome ID list minus what we already have.
    uniprot_ids = list(set(_proteome_uniprot_ids(method))
                       .difference(current_uniprots))
    print('Out of %s there are a total of %s left'
          % (len(set(uniprot_ids)) + len(current_uniprots),
             len(set(uniprot_ids))))

    # Double-check in case there was a disruption during file generation.
    hs_prot_dir = os.listdir(data_folder)
    cnt = _next_file_counter(hs_prot_dir, file_suffix)

    # Second double-check for overlap between two reference files, as a
    # re-run may have used the same UniProt ID list.
    rm_files = _duplicate_reference_files(data_folder, hs_prot_dir, file_suffix)
    if rm_files:
        # Step past the counter already consumed by the duplicated file(s).
        cnt += 1

    if single_file:
        # One combined file for the final set of IDs that are missing.
        ref_file_name = os.path.join(data_folder,
                                     'Complete_Proteome_' + file_suffix)
        _ = CoDIAC.UniProt.makeRefFile(uniprot_ids, ref_file_name)
    else:
        # Batches of 300 IDs: above that, UniProt starts throwing exceptions.
        batch_size = 300
        currently_retrieved = set()
        for x in range(0, round(len(uniprot_ids) / batch_size)):
            batch = uniprot_ids[x * batch_size:batch_size * (x + 1)]
            ref_file_name = os.path.join(
                data_folder, 'Complete_Proteome_' + str(cnt) + '_' + file_suffix)
            _ = CoDIAC.UniProt.makeRefFile(batch, ref_file_name)
            cnt += 1
            currently_retrieved.update(batch)
        # round() can leave a trailing partial batch; fetch whatever remains.
        remaining = set(uniprot_ids).difference(currently_retrieved)
        ref_file_name = os.path.join(
            data_folder, 'Complete_Proteome_' + str(cnt) + '_' + file_suffix)
        _ = CoDIAC.UniProt.makeRefFile(remaining, ref_file_name)
# Script entry point: run the whole-proteome reference build when executed
# directly (not when imported as a module).
if __name__ == '__main__':
    main()