Graph_Genome/Graph_Process.py at master · maingoc303/Graph_Genome · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
def protein_list(id):
	'''
	this function return the text file contain the list of
	protein of each genome
	input is id - identity % for clustering lead to location where
	output of cluster extracted
	'''
	import os
	# run vsearch to get output of each cluster
	os.system('vsearch --cluster_size combine.fna --id '+str(id)+' --clusters Cluster')
	# extract file for list of protein in cluster
	prot_list = os.system('grep ">" Cluster* > protein_name.csv')
	# delete all Cluster extracted:
	rm=os.system('rm Cluster*')
	return prot_list

f =  protein_list(0.5) # input identity % for clustering mature peptides

def clust_info(f):
	'''
	f input is text file contain the list of protein name of each
	genome
	this function return the dataframe with 3 columns: accession_no,
	protein names and cluster that protein belongs
	'''
	import pandas as pd
	data = pd.read_csv(f, delimiter=':>', engine='python',
                      header=None, names=['cluster', 'protein'])
	# dropping null value columns to avoid errors
	data.dropna(inplace = True)
	# new data frame with split value columns
	new = data["protein"].str.split("_", n = 1, expand = True)
	# making seperate first name column from new data frame
	data["accession_no"] = new[0]
	# making seperate last name column from new data frame
	data["prot"] = new[1]
	# Dropping old Name columns
	data.drop(columns=["prot"], inplace=True)
	# return dataframe
	return data

def no_clust(dataframe):
	import pandas as pd
	'''
	This function will answers 4 questions about important metrics
	after clustering mature peptides from genome of all known subtypes
	'''
	# Q1: How many clusters there are
	df=pd.DataFrame(pd.crosstab(dataframe['cluster'],dataframe['accession_no']))
	print('\n 1. There are %i clusters' % df.shape[0])
	return df.shape[0]

def size_clust(dataframe):
	import pandas as pd
	# Q2: How big is each cluster is? (number of protein in each cluster)
	no_prot_in_clust = pd.crosstab(dataframe['cluster'],
                            dataframe['prot'],
                                margins = False)
	prot = pd.DataFrame(no_prot_in_clust)
	prot['sum']=no_prot_in_clust.sum(axis=1)
	print('\n 2. Number of protein in each cluster:')
	#print(prot['sum'])
	return prot['sum']

def clust_icl_all_prot(dataframe):
	import pandas as pd
	# Q3: How many clusters have exactly one protein from every genome
	clust = pd.crosstab(dataframe['accession_no'],
                            dataframe['cluster'],
                                margins = False)
	i = sum(clust.sum(axis=0)==clust.shape[0])
	print("\n 3. There are %i clusters have exactly one protein from every genome" % i)
	print(clust)

def prot_in_multi_clust(dataframe):
	import pandas as pd
	# Q4: How many proteins are in multiple cluster?
	# create new column for checking prot from genome in cluster
	dataframe['unique']=dataframe['accession_no']+'_'+dataframe['prot']
	unique = pd.crosstab(dataframe['unique'],
                            dataframe['cluster'],
                                margins = False)
	multi = pd.DataFrame(unique)
	multi['sum'] = unique.sum(axis=1)
	print('\n 4. %i proteins are in multiple cluster' % sum(multi['sum']>1))
	return sum(multi['sum']>1)


def visual(dataframe):
	from bokeh.io import output, show
	from bokeh.plotting import figure