80 changes: 80 additions & 0 deletions explain/generate_net.py
@@ -0,0 +1,80 @@
#!/usr/bin/env python
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.keras.models import load_model,Model
from Bio import SeqIO
import sys,ig
import numpy as np
from tensorflow.keras.callbacks import ModelCheckpoint,CSVLogger,EarlyStopping

class GenerLayer(keras.layers.Layer):
    def __init__(self,**kwargs):
        super(GenerLayer, self).__init__(**kwargs)
    def build(self, input_shape):
        #assert isinstance(input_shape, list)
        #print(input_shape)
        self.w = self.add_weight(name='weight',shape=input_shape[0][1:],initializer='glorot_uniform',trainable=True)
        super(GenerLayer, self).build(input_shape)
    def call(self, x):
        assert isinstance(x, list)
        mask,template=x
        target_bits=2.0
        norm_weights=keras.layers.Softmax()(self.w)
        #print(norm_weights)
        target_entropy_mse_loss=target_entropy_mse(norm_weights,target_bits)
        self.add_loss(target_entropy_mse_loss) #https://keras.io/api/losses/
        return tf.math.multiply(norm_weights,mask)+template #add(+) is right?
    #def compute_output_shape(self, input_shape):
    #    return input_shape[0]

def target_entropy_mse(pwm,target_bits): #seqprop_optimizer.py
    entropy = pwm * -tf.math.log(tf.clip_by_value(pwm, K.epsilon(), 1. - K.epsilon()))/tf.math.log(2.0)
    #print(entropy)
    entropy = tf.reduce_sum(entropy,axis=1)
    conservation = 2.0 - entropy
    target_entropy_mse_loss=tf.reduce_mean((conservation - target_bits)**2, axis=-1)
    #target_entropy_mse_loss=(tf.reduce_mean(conservation, axis=-1) - target_bits)**2
    return target_entropy_mse_loss

def my_loss_fn(y_true, y_pred):
    #loss=keras.losses.poisson(y_true[:,y_mask_start:y_mask_end,:], y_pred[:,y_mask_start:y_mask_end,:])
    mse=keras.losses.MeanSquaredError()
    loss=mse(y_true[:,y_mask_start:y_mask_end,:], y_pred[:,y_mask_start:y_mask_end,:])
    return tf.reduce_mean(loss)

if __name__ == '__main__':
    h5=sys.argv[1]
    npz=sys.argv[2]
    prefix=sys.argv[3]
    model = load_model(h5,{'Rsquare':ig.Rsquare})
    data=np.load(npz)
    mask=np.expand_dims(data["mask"],axis=0)
    template=np.expand_dims(data["template"],axis=0)
    y_scale=np.expand_dims(data["y_scale"],axis=(0,2))
    global y_mask_start,y_mask_end
    y_mask_start=data["y_mask_start"]
    y_mask_end=data["y_mask_end"]
    input1=keras.layers.Input(shape=mask.shape[1:],name="input1")
    input2=keras.layers.Input(shape=template.shape[1:],name="input2")
    x=GenerLayer()([input1,input2])
    for layer_item in model.layers:
        print(layer_item.name)
        if layer_item.name=="input_1":
            continue
        layer_item.trainable=False
        if layer_item.name=="rloop_regression":
            outputs=layer_item(x)
            break
        else:
            x=layer_item(x)
    fModel=Model(inputs=[input1,input2],outputs=outputs)
    fModel.summary()
    #sys.exit()
    #checkpoint=ModelCheckpoint(filepath="generate_model_trained/%s_{epoch:02d}_{loss:.4f}.hdf5"%prefix,monitor='loss',verbose=1,save_best_only=True,mode='min')
    checkpoint=ModelCheckpoint(filepath="generate_model_trained/%s.hdf5"%prefix,monitor='loss',verbose=1,save_best_only=True,mode='min')
    csv_logger=CSVLogger('%s_training.log'%prefix,separator='\t')
    callbacks=[csv_logger,checkpoint]
    fModel.compile(optimizer="Adam",loss={'rloop_regression':my_loss_fn},metrics={"rloop_regression":ig.Rsquare})
    fModel.fit([mask,template],y_scale,epochs=10000,callbacks=callbacks)
    #for layer_item in fModel.layers:
    #    print(layer_item.name)
    gModel=Model(inputs=fModel.input,outputs=fModel.get_layer("gener_layer").output)
    pwm=gModel.predict([mask,template])
    print(pwm)
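The GenerLayer above learns a free weight tensor, pushes it through a softmax, applies it only where the mask is nonzero (presumably 1 inside the editable window), and adds the fixed template back in; the target_entropy_mse penalty drives each softmax position toward roughly 2 bits of conservation, i.e. a near one-hot base distribution, so the learned weights can later be read out as a concrete sequence. A minimal invocation sketch, inferred from the sys.argv handling above (the file names are placeholders, and the generate_model_trained/ output directory is assumed to exist before training starts):

    # hypothetical example; rloop_model.h5 and region.npz are placeholder names
    python generate_net.py rloop_model.h5 region.npz chr1_demo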
68 changes: 68 additions & 0 deletions explain/get_generate_seq.py
@@ -0,0 +1,68 @@
#!/usr/bin/env python
#coding=utf-8
import sys,ig,generate_net
import tensorflow as tf
from tensorflow.keras.models import load_model,Model
import numpy as np

def ont_hot_to_seq(pwm):
    seq=""
    score=[]
    for x in pwm:
        #print(x)
        p=np.argwhere(x==np.max(x))
        #print(p)
        real_p=p[0][0]
        score.append(x[real_p])
        if real_p==0:
            seq+="A"
        elif real_p==1:
            seq+="T"
        elif real_p==2:
            seq+="C"
        elif real_p==3:
            seq+="G"
    return seq,score

if __name__ == '__main__':
    h5=sys.argv[1]
    npz=sys.argv[2]
    prefix=sys.argv[3]
    scale_win=128
    data=np.load(npz)
    mask=np.expand_dims(data["mask"],axis=0)
    template=np.expand_dims(data["template"],axis=0)
    fModel = load_model(h5,custom_objects={'Rsquare':ig.Rsquare,'GenerLayer':generate_net.GenerLayer,'my_loss_fn':generate_net.my_loss_fn})
    gModel=Model(inputs=fModel.input,outputs=fModel.get_layer("gener_layer").output)
    pwm=gModel.predict([mask,template])
    seq,score=ont_hot_to_seq(pwm[0])
    Fr=open(prefix+"_generateseq.fasta","w")
    Fr.write(">"+prefix+"\n")
    Fr.write(seq+"\n")
    Fr.close()
    Fr=open(prefix+"_generateseq_score.bdg","w") # support (frequency) of the chosen base at each position
    for i,s in enumerate(score):
        Fr.write(prefix+"\t"+str(i)+"\t"+str(i+1)+"\t"+str(s)+"\n")
    Fr.close()
    r=fModel.predict([mask,template])
    Fr=open(prefix+"_generateseq_Rlooplevel.bdg","w") # predicted R-loop level of the generated sequence
    for i,s in enumerate(r[0]):
        st=i*scale_win
        ed=st+scale_win
        Fr.write(prefix+"\t"+str(st)+"\t"+str(ed)+"\t"+str(s[0])+"\n")
    Fr.close()
    #sys.exit()
    Fr=open(prefix+"_generateseq_targetRlooplevel.bdg","w") # target R-loop level the generated sequence was fitted to
    for i,s in enumerate(data["y_scale"]):
        st=i*scale_win
        ed=st+scale_win
        Fr.write(prefix+"\t"+str(st)+"\t"+str(ed)+"\t"+str(s)+"\n")
    Fr.close()
    Fr=open(prefix+"_targetseq_realRlooplevel.bdg","w") # real R-loop level of the target sequence
    for i,s in enumerate(data["y_real"]):
        st=i*scale_win
        ed=st+scale_win
        Fr.write(prefix+"\t"+str(st)+"\t"+str(ed)+"\t"+str(s)+"\n")
    Fr.close()
    np.savez_compressed(prefix+"_generateseq_pwm.npz",pwm=pwm[0])
    Fr=open(prefix+"_targetseq.bed","w")
    Fr.write(prefix+"\t"+str(data["X_mask_start"])+"\t"+str(data["X_mask_end"])+"\n")
    Fr.close()
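A matching sketch for get_generate_seq.py (again with placeholder file names): it reloads the checkpoint written by generate_net.py together with the same .npz file, uses the ont_hot_to_seq helper to take the argmax base at each position of the learned PWM, and writes the generated FASTA, per-base score bedGraph, and the predicted/target/real R-loop tracks under the given prefix.

    # hypothetical example; the checkpoint path follows the ModelCheckpoint pattern in generate_net.py
    python get_generate_seq.py generate_model_trained/chr1_demo.hdf5 region.npz chr1_demo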
110 changes: 110 additions & 0 deletions explain/ig.py
@@ -0,0 +1,110 @@
#!/usr/bin/env python
#coding=utf-8
import sys
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.applications import xception
from tensorflow.keras.models import load_model,Model
from tensorflow.keras import backend as K

def Rsquare(y_true, y_pred):
    SS_res=K.sum(K.square( y_true-y_pred ))
    SS_tot=K.sum(K.square( y_true - K.mean(y_true) ) )
    return 1 - SS_res/(SS_tot + K.epsilon())

def get_gradients(X_test,model):
    X_test=tf.convert_to_tensor(X_test,dtype=tf.float32)
    with tf.GradientTape() as tape:
        tape.watch(X_test)
        preds = model(X_test)
    grads = tape.gradient(preds,X_test)
    return grads

def get_integrated_gradients(X_test, model, baseline=None, num_steps=50):
    if baseline is None:
        baseline = np.zeros(X_test.shape).astype(np.float32)
    else:
        baseline = baseline.astype(np.float32)

    # 1. Interpolate between the baseline and the input.
    X_test = X_test.astype(np.float32)
    interpolated = [baseline + (step / num_steps) * (X_test - baseline) for step in range(num_steps + 1)]
    interpolated = np.array(interpolated).astype(np.float32)

    # 2. Get the gradients at each interpolation step.
    grads = []
    for i,ipd in enumerate(interpolated):
        #print("ipd shape",ipd.shape) #ipd shape (1, 128000, 4)
        grad = get_gradients(ipd,model)
        #print(grad)
        grads.append(grad[0])
    grads = tf.convert_to_tensor(grads, dtype=tf.float32)

    # 3. Approximate the integral using the trapezoidal rule.
    grads = (grads[:-1] + grads[1:]) / 2.0
    avg_grads = tf.reduce_mean(grads, axis=0)

    # 4. Calculate integrated gradients and return.
    integrated_grads = (X_test - baseline) * avg_grads
    return integrated_grads

def baseline_integrated_gradients(X_test, model,num_steps=50, num_runs=2,select="random"):
    # 1. List to keep track of the integrated gradients (IG) for each baseline.
    integrated_grads = []

    # 2. Get the integrated gradients for all the baselines.
    for run in range(num_runs):
        if select=="random":
            #np.random.seed(10)
            baseline = np.random.random(X_test.shape)
            total=np.sum(baseline,axis=2) #(1,128000); total[:,:,None].shape==(1,128000,1)
            baseline=baseline/total[:,:,None] #(1,128000,4)
            #print(np.sum(baseline,axis=2))
        elif select=="shuffle":
            baseline=X_test.copy()
            list(map(np.random.shuffle, baseline)) # multi-dimensional arrays are only shuffled along the first axis; the shuffle is in-place, and wrapping map() in list() is needed to force it to execute
            #print("baseline shape",baseline.shape) #(1, 128000, 4)
        #baseline=None
        igrads = get_integrated_gradients(X_test,model,baseline=baseline,num_steps=num_steps)
        integrated_grads.append(igrads)
    # 3. Return the integrated gradients averaged over the baselines.
    integrated_grads = tf.convert_to_tensor(integrated_grads)
    #print("integrated_grads shape",integrated_grads.shape) # (num_runs, 1, 128000, 4)
    return tf.reduce_mean(integrated_grads, axis=0)

if __name__ == '__main__':
    h5=sys.argv[1]
    npz=sys.argv[2]
    prefix=sys.argv[3]
    data=np.load(npz)
    X_test=data["X_test"][0:10,:,:]
    model = load_model(h5,{'Rsquare':Rsquare})
    inputs=model.layers[0].input
    outputs=model.get_layer("rloop_regression").output
    fModel=Model(inputs=inputs,outputs=outputs)
    Fr_step=open("%s_step.xls"%prefix,"w")
    for xi in X_test:
        xi=xi[None,:,:]
        print("xi shape",xi.shape)
        igrads=baseline_integrated_gradients(xi,fModel,num_steps=50,num_runs=10,select="random")
        #print(igrads) #(1, 128000, 4)
        score=np.sum(np.abs(igrads),axis=2) #right?
        print("score shape",score.shape) #(1, 128000)
        #print(score)
        score_final=xi*score[:,:,None]
        print("score_final shape",score_final.shape) #(1, 128000, 4)
        #print(score_final)
        for ni in np.reshape(np.sum(score_final,axis=2),-1):
            Fr_step.write(str(ni)+"\n")
    Fr_step.close()
    """
    Fr_batch=open("%s_batch.xls"%prefix,"w")
    igrads=baseline_integrated_gradients(X_test,fModel,num_steps=50,num_runs=10,select="random")
    score=np.sum(np.abs(igrads),axis=2)
    print("score shape",score.shape)
    score_final=X_test*score[:,:,None]
    print("score_final shape",score_final.shape)
    for timesteps in np.sum(score_final,axis=2):
        for ni in timesteps:
            Fr_batch.write(str(ni)+"\n")
    Fr_batch.close()
    """
77 changes: 77 additions & 0 deletions explain/ig_seq.py
@@ -0,0 +1,77 @@
#!/usr/bin/env python
import sys,os,ig,argparse
from Bio import SeqIO
import numpy as np
from tensorflow.keras.models import load_model,Model

def get_parser():
    parser = argparse.ArgumentParser(description="ig_seq.py",formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('--fasta',dest="fasta")
    parser.add_argument('--win',dest="win",type=int)
    parser.add_argument('--chrome_size',dest="chrome_size")
    parser.add_argument('--h5',dest="h5")
    parser.add_argument('--select',dest="select",choices=["random","shuffle"],default="random")
    parser.add_argument("--num_steps",dest="num_steps",type=int,default=50)
    parser.add_argument("--num_runs",dest="num_runs",type=int,default=2)
    parser.add_argument("--prefix",dest="prefix")
    return parser

if __name__ == '__main__':
    parser=get_parser()
    if len(sys.argv)==1:
        parser.print_help()
        sys.exit()
    else:
        Args = parser.parse_args()
    fasta=Args.fasta
    win=Args.win
    chrome_size=Args.chrome_size
    h5=Args.h5
    select=Args.select
    num_steps=Args.num_steps
    num_runs=Args.num_runs
    prefix=Args.prefix
    Fr_log=open(prefix+".log","w")
    Fr_log.write(" ".join(sys.argv)+"\n")
    Fr_bdg=open(prefix+"_imscore.bdg","w")
    one_hot={"A":[1,0,0,0],"T":[0,1,0,0],"C":[0,0,1,0],"G":[0,0,0,1],"N":[0,0,0,0]}
    dseq=SeqIO.index(fasta,"fasta")
    model = load_model(h5,{'Rsquare':ig.Rsquare})
    inputs=model.layers[0].input
    outputs=model.get_layer("rloop_regression").output
    fModel=Model(inputs=inputs,outputs=outputs)
    lines=os.popen("bedtools makewindows -g %s -w %s "%(chrome_size,win)).readlines()
    igrads_matrix=[] #for heatmap
    imscore_matrix=[] #for seqlogo
    for x in lines:
        x=x.rstrip()
        l=x.split("\t")
        seq=dseq[l[0]][int(l[1]):int(l[2])]
        ss=[]
        for s in seq.upper():
            if s in "ATCG":
                ss.append(one_hot[s])
            else:
                ss.append(one_hot["N"])
        if len(ss)<int(win):
            for i in range(int(win)-len(ss)):
                ss.append(one_hot["N"])
        X_test=np.array([ss])
        print("X_test shape",X_test.shape)
        igrads=ig.baseline_integrated_gradients(X_test,fModel,num_steps=num_steps,num_runs=num_runs,select=select)
        #print(igrads) #(1, 128000, 4)
        #igrads_matrix.append(igrads[0])
        score=np.sum(np.abs(igrads),axis=2) #right?
        print("score shape",score.shape) #(1, 128000)
        #print(score)
        score_final=X_test*score[:,:,None]
        print("score_final shape",score_final.shape) #(1, 128000, 4)
        #print(score_final)
        #imscore_matrix.append(score_final)
        for ni,pi,gi,si in zip(np.around(np.reshape(np.sum(score_final,axis=2),-1),4),range(int(l[1]),int(l[2])),igrads[0],score_final[0]):
            fm=l[0]+"\t"+str(pi)+"\t"+str(pi+1)+"\t"+str(ni)
            Fr_bdg.write(fm+"\n")
            igrads_matrix.append(gi)
            imscore_matrix.append(si)
    Fr_bdg.close()
    Fr_log.close()
    np.savez_compressed("%s_matrix.npz"%prefix,igrads_matrix=np.around(igrads_matrix,4),imscore_matrix=np.around(imscore_matrix,4))
    os.system("bedGraphToBigWig %s_imscore.bdg %s %s_imscore.bw"%(prefix,chrome_size,prefix))