-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathSHet.py
More file actions
171 lines (136 loc) · 5.99 KB
/
Copy pathSHet.py
File metadata and controls
171 lines (136 loc) · 5.99 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
This script calculates the multivariate SHet statistic for an individual SNP from
summary statistics (effect size b and standard error se) obtained with univariate
mlma analysis in GCTA for all the the tierpsy features of the CeNDR strains.
Created on Fri Sep 13 17:40:07 2019
@author: em812
"""
import numpy as np
from numpy.linalg import pinv
from time import time
import pandas as pd
def EstimateGamma(N_permut, SampleSize, CorrMatrix, correct = 1, startCutoff = 0, endCutoff = 1, CutoffStep = 0.05, isAllpossible = True):
from numpy.random import multivariate_normal as mvrnorm
from scipy.stats import gamma
# Sample SHet
Stat = []
for i in range(N_permut):
X = mvrnorm(np.zeros(SampleSize.shape[0]), CorrMatrix, check_valid='warn', tol=1e-8)
Stat.append(Trucated_TestScore(X, SampleSize, CorrMatrix, correct=correct,
startCutoff=startCutoff, endCutoff=endCutoff,
CutoffStep=CutoffStep, isAllpossible=isAllpossible))
# Fit gamma distribution
fit_alpha, fit_loc, fit_beta = gamma.fit(Stat)
k = fit_alpha
theta = 1/fit_beta
a = fit_loc
# a = min(Stat)*3/4
# ex3 = np.mean(Stat*Stat*Stat)
# V = np.var(Stat)
#
# for i in np.arange(1,101,1):
# E = np.mean(Stat)-a
# k = E**2/V
# theta = V/E
# a = (-3*k*(k+1)*theta**2+sqrt(9*k**2*(k+1)**2*theta**4-12*k*theta*(k*(k+1)*(k+2)*theta**3-ex3)))/6/k/theta
para = [k,theta,a]
return(para)
def Trucated_TestScore(X, SampleSize, CorrMatrix, correct = True, startCutoff = 0, endCutoff = 1, CutoffStep = 0.05, isAllpossible = False):
"""
Calculates the SHet statistic.
params:
X: Wald summary statistic from each cohort for each trait (for a given SNP)
Each row represents a cohort and each column represents a trait (number of cohorts x number of traits)
SampleSize: (size of the cohorts) weights in the metaanalysis (number of cohorts x number of traits)
[if only one cohort is assumed, the sample size for each trait is the
same and all the weights are equal]
CorrMatrix: correlation between phenotypes
correct: flag defining whether the stats with negative sign will be corrected (multiplied by -1) to add to the evidence
startCutoff, endCutoff, CutoffStep: define the cutoffs in the wald summary stats values that we want to consider when building SHet
isAllpossible: if True, all the values in X will be used as cutoffs (only one summary statistic will be dropped at a time)
return:
SHet
"""
# Number of traits
#N = X.shape[1]
# Weights of summary statistics -> sqrt(sample size of cohort)
W = 1/np.sqrt(SampleSize)
#XX = apply(X, 1, function(x) {
# For each row of X (all traits from a given cohort)
all_time = time()
# Initialize
TTT = -1;
sel_group = None
# Get cutoff levels
if isAllpossible:
cutoff = np.sort(np.unique(np.abs(X))); ## it will filter out any of them.
else:
cutoff = np.arange(startCutoff, endCutoff+CutoffStep, CutoffStep);
# Calculate SHet for each cutoff threshold
for threshold in cutoff:
ptm = time()
print("Calculating for threshold ",threshold)
# If all the Wald summary stats are smaller that the threshold, break the for loop
# (there aren't any larger stats left to group)
if np.all(np.abs(X)<threshold):
break
# Else calculate SHet using the statistics that are >= threshold
else:
index = np.abs(X) >= threshold
x1 = X[index]
A = CorrMatrix[index, :] ## update the matrix
A = A[:, index]
W1 = W[index]
# Change sign of negative stats to add evidence
if correct:
W1[(x1 < 0)] = -W1[(x1 < 0)] ## update the sign
# Calculate SHet
try:
A = pinv(A)
except:
pd.DataFrame(A).to_csv('A_non_invertible_threshold={}.csv'.format(threshold))
print('error')
x1 = x1.reshape(1,-1)
W1 = W1.reshape(1,-1)
T = np.matmul(np.matmul(W1,A),np.transpose(x1))
T = (T*T) / np.matmul(np.matmul(W1,A),np.transpose(W1));
# If SHet for the given threshold is larger than all previous SHet values, then keep this value
if TTT < T[0,0]:
TTT = T[0,0]
sel_group = index
iter_time = time() - ptm
print("Calculated in ",iter_time)
print('Finished in {} sec.'.format(time()-all_time))
return(TTT,sel_group)
if __name__ == "__main__":
#%%
## Input
# GCTA summary stats
res = pd.read_csv("stats_SNP=1:727.csv")
# Tierpsy features per strain
features = pd.read_csv("10_tierpsy_features.csv")
#%%
## Preprocess stats and features/traits
res = res.sort_values(by='feature')
analysed_feat = res["feature"].tolist()
res = res.set_index('feature')
res = res[['b','se','p']]
features = features[res.index]
## Get feature/trait correlation matrix (equivalent to the correlation between Wald statistics of each trait, according to equation 5 in Zhou et al 2015)
CorrMatrix = features.corr()
# Make sure that the traits are in the same order in res and CorrMatrix
res = res.loc[CorrMatrix.index,:]
res = res.values
CorrMatrix = CorrMatrix.values
## Calculate Wald statistic per feature/trait
X = res[:,0]/res[:,1]
# Sample size
n_sample = features.shape[0]
# Number of traits
nvar = res.shape[0]
# Make SampleSize vector
SampleSize = np.repeat([n_sample],nvar)
## Calculate SHet statistic
SHet,sel_group = Trucated_TestScore(X, SampleSize, CorrMatrix, correct = True, startCutoff = 0, endCutoff = 1, CutoffStep = 0.05, isAllpossible = False)