From acf3cdb3e7844bae5c3693f1a9ce8bdcdc461489 Mon Sep 17 00:00:00 2001
From: kuan <396777306@qq.com>
Date: Mon, 18 Apr 2022 17:01:50 +0800
Subject: [PATCH] explain

---
 explain/generate_net.py       |  80 +++++++++++++++++++++++++
 explain/get_generate_seq.py   |  68 +++++++++++++++++++++
 explain/ig.py                 | 110 ++++++++++++++++++++++++++++++++++
 explain/ig_seq.py             |  77 ++++++++++++++++++++++++
 explain/max_activation_seq.py |  93 ++++++++++++++++++++++++++++
 5 files changed, 428 insertions(+)
 create mode 100755 explain/generate_net.py
 create mode 100755 explain/get_generate_seq.py
 create mode 100755 explain/ig.py
 create mode 100755 explain/ig_seq.py
 create mode 100755 explain/max_activation_seq.py

diff --git a/explain/generate_net.py b/explain/generate_net.py
new file mode 100755
index 0000000..7ffc70e
--- /dev/null
+++ b/explain/generate_net.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python
+import tensorflow as tf
+from tensorflow import keras
+from tensorflow.keras import backend as K
+from tensorflow.keras.models import load_model,Model
+from Bio import SeqIO
+import sys,ig
+import numpy as np
+from tensorflow.keras.callbacks import ModelCheckpoint,CSVLogger,EarlyStopping
+# Trainable sequence-generator layer: holds a weight matrix whose softmax is a
+# per-position probability matrix (PWM); the layer blends that PWM into a fixed
+# template wherever `mask` is set, and penalizes the PWM's entropy toward a
+# target of `target_bits` bits via add_loss.
+class GenerLayer(keras.layers.Layer):
+    def __init__(self,**kwargs):
+        super(GenerLayer, self).__init__(**kwargs)
+    def build(self, input_shape):
+        #assert isinstance(input_shape, list)
+        #print(input_shape)
+        # One trainable logit per (position, base); shape taken from the first
+        # input (the mask) minus its batch axis.
+        self.w = self.add_weight(name='weight',shape=input_shape[0][1:],initializer='glorot_uniform',trainable=True)
+        super(GenerLayer, self).build(input_shape)
+    def call(self, x):
+        # Expects x == [mask, template]; raises AssertionError otherwise.
+        assert isinstance(x, list)
+        mask,template=x
+        # 2.0 bits == fully conserved base for a 4-letter alphabet.
+        target_bits=2.0
+        # Softmax over the last axis turns logits into per-position probabilities.
+        norm_weights=keras.layers.Softmax()(self.w)
+        #print(norm_weights)
+        target_entropy_mse_loss=target_entropy_mse(norm_weights,target_bits)
+        self.add_loss(target_entropy_mse_loss) #https://keras.io/api/losses/
+        # PWM contributes only where mask is nonzero; template supplies the rest.
+        # NOTE(review): assumes mask and template are complementary (template is
+        # zero where mask is one), otherwise "+" would double-count — TODO confirm.
+        return tf.math.multiply(norm_weights,mask)+template #add(+) is right?
+    #def compute_output_shape(self, input_shape):
+    #    return input_shape[0]
+# Mean squared deviation of per-position conservation (2 - entropy, in bits)
+# from target_bits; adapted from seqprop's seqprop_optimizer.py.
+def target_entropy_mse(pwm,target_bits): #seqprop_optimizer.py
+    # Shannon entropy per element, clipped away from 0/1 to keep log finite;
+    # divided by log(2) to convert nats to bits.
+    entropy = pwm * -tf.math.log(tf.clip_by_value(pwm, K.epsilon(), 1. - K.epsilon()))/tf.math.log(2.0)
+    #print(entropy)
+    # Sum over axis=1 — NOTE(review): this assumes the position axis is 1 and
+    # the base axis is the one being reduced; verify against the PWM layout.
+    entropy = tf.reduce_sum(entropy,axis=1)
+    conservation = 2.0 - entropy
+    target_entropy_mse_loss=tf.reduce_mean((conservation - target_bits)**2, axis=-1)
+    #target_entropy_sme_loss=(tf.reduce_mean(conservation, axis=-1) - target_bits)**2
+    return target_entropy_mse_loss
+# MSE restricted to the masked window [y_mask_start:y_mask_end) of the output;
+# y_mask_start/y_mask_end are module-level globals set in __main__ below.
+def my_loss_fn(y_true, y_pred):
+    #loss=keras.losses.poisson(y_true[:,y_mask_start:y_mask_end,:], y_pred[:,y_mask_start:y_mask_end,:])
+    mse=keras.losses.MeanSquaredError()
+    loss=mse(y_true[:,y_mask_start:y_mask_end,:], y_pred[:,y_mask_start:y_mask_end,:])
+    return tf.reduce_mean(loss)
+if __name__ == '__main__':
+    # Usage: generate_net.py <trained_model.h5> <data.npz> <output_prefix>
+    h5=sys.argv[1]
+    npz=sys.argv[2]
+    prefix=sys.argv[3]
+    model = load_model(h5,{'Rsquare':ig.Rsquare})
+    data=np.load(npz)
+    # Add a batch axis of size 1 to each array from the npz.
+    mask=np.expand_dims(data["mask"],axis=0)
+    template=np.expand_dims(data["template"],axis=0)
+    y_scale=np.expand_dims(data["y_scale"],axis=(0,2))
+    # Globals consumed by my_loss_fn above.
+    global y_mask_start,y_mask_end
+    y_mask_start=data["y_mask_start"]
+    y_mask_end=data["y_mask_end"]
+    input1=keras.layers.Input(shape=mask.shape[1:],name="input1")
+    input2=keras.layers.Input(shape=template.shape[1:],name="input2")
+    x=GenerLayer()([input1,input2])
+    # Rebuild the trained model on top of the generator layer, freezing every
+    # copied layer so only GenerLayer's weights train; stop at the regression head.
+    for layer_item in model.layers:
+        print(layer_item.name)
+        if layer_item.name=="input_1":
+            continue
+        layer_item.trainable=False
+        if layer_item.name=="rloop_regression":
+            outputs=layer_item(x)
+            break
+        else:
+            x=layer_item(x)
+    fModel=Model(inputs=[input1,input2],outputs=outputs)
+    fModel.summary()
+    #sys.exit()
+    #checkpoint=ModelCheckpoint(filepath="generate_model_trained/%s_{epoch:02d}_{loss:.4f}.hdf5"%prefix,monitor='loss',verbose=1,save_best_only=True,mode='min')
+    # Keep only the best-by-loss weights under a fixed filename.
+    checkpoint=ModelCheckpoint(filepath="generate_model_trained/%s.hdf5"%prefix,monitor='loss',verbose=1,save_best_only=True,mode='min')
+    csv_logger=CSVLogger('%s_training.log'%prefix,separator='\t')
+    callbacks=[csv_logger,checkpoint]
+    fModel.compile(optimizer="Adam",loss={'rloop_regression':my_loss_fn},metrics={"rloop_regression":ig.Rsquare})
+    # Single (mask, template) example fitted for many epochs — this optimizes
+    # the generator weights, not a dataset-level model.
+    fModel.fit([mask,template],y_scale,epochs=10000,callbacks=callbacks)
+    #for layer_item in fModel.layers:
+    #    print(layer_item.name)
+    # Read the learned PWM back out of the generator layer.
+    gModel=Model(inputs=fModel.input,outputs=fModel.get_layer("gener_layer").output)
+    pwm=gModel.predict([mask,template])
+    print(pwm)
diff --git a/explain/get_generate_seq.py b/explain/get_generate_seq.py
new file mode 100755
index 0000000..9d4eddc
--- /dev/null
+++ b/explain/get_generate_seq.py
@@ -0,0 +1,68 @@
+#!/usr/bin/env python
+#coding=utf-8
+import sys,ig,generate_net
+import tensorflow as tf
+from tensorflow.keras.models import load_model,Model
+import numpy as np
+# Collapse a PWM (one row of base probabilities per position) to the
+# argmax base call. Returns (sequence string, per-position winning probability).
+# Base order is A, T, C, G — must match the one-hot encoding used elsewhere.
+def ont_hot_to_seq(pwm):
+    seq=""
+    score=[]
+    for x in pwm:
+        #print(x)
+        # argwhere of the max; ties resolve to the first (lowest) index.
+        p=np.argwhere(x==np.max(x))
+        #print(p)
+        real_p=p[0][0]
+        score.append(x[real_p])
+        if real_p==0:
+            seq+="A"
+        elif real_p==1:
+            seq+="T"
+        elif real_p==2:
+            seq+="C"
+        elif real_p==3:
+            seq+="G"
+    return seq,score
+if __name__ == '__main__':
+    # Usage: get_generate_seq.py <generator_model.h5> <data.npz> <output_prefix>
+    h5=sys.argv[1]
+    npz=sys.argv[2]
+    prefix=sys.argv[3]
+    # Output bin width (bp) for the bedGraph tracks written below.
+    scale_win=128
+    data=np.load(npz)
+    mask=np.expand_dims(data["mask"],axis=0)
+    template=np.expand_dims(data["template"],axis=0)
+    fModel = load_model(h5,custom_objects={'Rsquare':ig.Rsquare,'GenerLayer':generate_net.GenerLayer,'my_loss_fn':generate_net.my_loss_fn})
+    gModel=Model(inputs=fModel.input,outputs=fModel.get_layer("gener_layer").output)
+    pwm=gModel.predict([mask,template])
+    seq,score=ont_hot_to_seq(pwm[0])
+    Fr=open(prefix+"_generateseq.fasta","w")
+    Fr.write(">"+prefix+"\n")
+    Fr.write(seq+"\n")
+    Fr.close()
+    Fr=open(prefix+"_generateseq_score.bdg","w") # per-base support rate / frequency of the chosen base
+    for i,s in enumerate(score):
+        Fr.write(prefix+"\t"+str(i)+"\t"+str(i+1)+"\t"+str(s)+"\n")
+    Fr.close()
+    r=fModel.predict([mask,template])
+    Fr=open(prefix+"_generateseq_Rlooplevel.bdg","w") # predicted R-loop level of the generated sequence
+    for i,s in enumerate(r[0]):
+        st=i*scale_win
+        ed=st+scale_win
+        Fr.write(prefix+"\t"+str(st)+"\t"+str(ed)+"\t"+str(s[0])+"\n")
+    Fr.close()
+    #sys.exit()
+    Fr=open(prefix+"_generateseq_targetRlooplevel.bdg","w") # target R-loop profile the generated sequence was fitted to
+    for i,s in enumerate(data["y_scale"]):
+        st=i*scale_win
+        ed=st+scale_win
+        Fr.write(prefix+"\t"+str(st)+"\t"+str(ed)+"\t"+str(s)+"\n")
+    Fr.close()
+    Fr=open(prefix+"_targetseq_realRlooplevel.bdg","w") # real (measured) R-loop level of the target sequence
+    for i,s in enumerate(data["y_real"]):
+        st=i*scale_win
+        ed=st+scale_win
+        Fr.write(prefix+"\t"+str(st)+"\t"+str(ed)+"\t"+str(s)+"\n")
+    Fr.close()
+    np.savez_compressed(prefix+"_generateseq_pwm.npz",pwm=pwm[0])
+    Fr=open(prefix+"_targetseq.bed","w")
+    Fr.write(prefix+"\t"+str(data["X_mask_start"])+"\t"+str(data["X_mask_end"])+"\n")
+    Fr.close()
\ No newline at end of file
diff --git a/explain/ig.py b/explain/ig.py
new file mode 100755
index 0000000..629c72c
--- /dev/null
+++ b/explain/ig.py
@@ -0,0 +1,110 @@
+#!/usr/bin/env python
+#coding=utf-8
+import sys
+import numpy as np
+import matplotlib.pyplot as plt
+import tensorflow as tf
+from tensorflow import keras
+from tensorflow.keras import layers
+from tensorflow.keras.applications import xception
+from tensorflow.keras.models import load_model,Model
+from tensorflow.keras import backend as K
+# Coefficient of determination (R^2) as a Keras metric: 1 - SS_res/SS_tot,
+# with epsilon guarding division by zero when y_true is constant.
+def Rsquare(y_true, y_pred):
+    SS_res=K.sum(K.square( y_true-y_pred ))
+    SS_tot=K.sum(K.square( y_true - K.mean(y_true) ) )
+    return 1 - SS_res/(SS_tot + K.epsilon())
+# Gradient of the model output with respect to the input tensor X_test.
+def get_gradients(X_test,model):
+    X_test=tf.convert_to_tensor(X_test,dtype=tf.float32)
+    with tf.GradientTape() as tape:
+        # Inputs are not variables, so they must be watched explicitly.
+        tape.watch(X_test)
+        preds = model(X_test)
+    grads = tape.gradient(preds,X_test)
+    return grads
+# Integrated Gradients for one input: interpolate num_steps+1 points between
+# baseline and X_test, average the gradients (trapezoidal rule), and scale by
+# (X_test - baseline). baseline defaults to all-zeros.
+def get_integrated_gradients(X_test, model, baseline=None, num_steps=50):
+    if baseline is None:
+        baseline = np.zeros(X_test.shape).astype(np.float32)
+    else:
+        baseline = baseline.astype(np.float32)
+
+    # 1. Do interpolation.
+    X_test = X_test.astype(np.float32)
+    # num_steps+1 points on the straight line from baseline to X_test.
+    interpolated = [baseline + (step / num_steps) * (X_test - baseline) for step in range(num_steps + 1)]
+    interpolated = np.array(interpolated).astype(np.float32)
+
+    # 3. Get the gradients
+    grads = []
+    for i,ipd in enumerate(interpolated):
+        #print("ipd shape",ipd.shape) #ipd shape (1, 128000, 4)
+        grad = get_gradients(ipd,model)
+        # grad[0] drops the batch axis of the returned gradient.
+        #print(grad)
+        grads.append(grad[0])
+    grads = tf.convert_to_tensor(grads, dtype=tf.float32)
+
+    # 4. Approximate the integral using the trapezoidal rule
+    grads = (grads[:-1] + grads[1:]) / 2.0
+    avg_grads = tf.reduce_mean(grads, axis=0)
+
+    # 5. Calculate integrated gradients and return
+    integrated_grads = (X_test - baseline) * avg_grads
+    return integrated_grads
+
+# Average Integrated Gradients over num_runs randomized baselines.
+# select="random": per-position random probabilities normalized to sum to 1;
+# select="shuffle": a per-row shuffle of the input itself.
+def baseline_integrated_gradients(X_test, model,num_steps=50, num_runs=2,select="random"):
+    # 1. List to keep track of Integrated Gradients (IG) for all the images
+    integrated_grads = []
+
+    # 2. Get the integrated gradients for all the baselines
+    for run in range(num_runs):
+        if select=="random":
+            #np.random.seed(10)
+            baseline = np.random.random(X_test.shape)
+            # Normalize so each position's four base entries sum to 1.
+            total=np.sum(baseline,axis=2) #(1,12800),total[:,:,None].shape==(1,12800,1)
+            baseline=baseline/total[:,:,None] #(1,12800,4)
+            #print(np.sum(baseline,axis=2))
+        elif select=="shuffle":
+            baseline=X_test.copy()
+            list(map(np.random.shuffle, baseline)) # np.random.shuffle only shuffles along the first axis; it shuffles in place, and without wrapping the lazy map in list() nothing actually gets shuffled
+            #print("baseline shape",baseline.shape) #(1, 128000, 4)
+        #baseline=None
+        igrads = get_integrated_gradients(X_test,model,baseline=baseline,num_steps=num_steps)
+        integrated_grads.append(igrads)
+    # 3.
+    #    Return the average integrated gradients for the image
+    integrated_grads = tf.convert_to_tensor(integrated_grads)
+    #print("integrated_grads shape",integrated_grads.shape) # (num_runs, 1, 128000, 4)
+    return tf.reduce_mean(integrated_grads, axis=0)
+if __name__ == '__main__':
+    # Usage: ig.py <model.h5> <data.npz> <output_prefix>
+    h5=sys.argv[1]
+    npz=sys.argv[2]
+    prefix=sys.argv[3]
+    data=np.load(npz)
+    # Only the first 10 test examples are scored.
+    X_test=data["X_test"][0:10,:,:]
+    model = load_model(h5,{'Rsquare':Rsquare})
+    inputs=model.layers[0].input
+    outputs=model.get_layer("rloop_regression").output
+    fModel=Model(inputs=inputs,outputs=outputs)
+    Fr_step=open("%s_step.xls"%prefix,"w")
+    for xi in X_test:
+        # Restore a batch axis of 1 for a single example.
+        xi=xi[None,:,:]
+        print("xi shape",xi.shape)
+        igrads=baseline_integrated_gradients(xi,fModel,num_steps=50,num_runs=10,select="random")
+        #print(igrads) #(1, 128000, 4)
+        # Importance per position = sum of |IG| over the 4 base channels.
+        score=np.sum(np.abs(igrads),axis=2) #right?
+        print("score shape",score.shape) #(1, 128000)
+        #print(score)
+        # Project the score back onto the one-hot input so only the observed
+        # base at each position carries the score.
+        score_final=xi*score[:,:,None]
+        print("score_final shape",score_final.shape) #(1, 128000, 4)
+        #print(score_final)
+        for ni in np.reshape(np.sum(score_final,axis=2),-1):
+            Fr_step.write(str(ni)+"\n")
+    Fr_step.close()
+    """
+    Fr_batch=open("%s_batch.xls"%prefix,"w")
+    igrads=baseline_integrated_gradients(X_test,fModel,num_steps=50,num_runs=10,select="random")
+    score=np.sum(np.abs(igrads),axis=2)
+    print("score shape",score.shape)
+    score_final=X_test*score[:,:,None]
+    print("score_final shape",score_final.shape)
+    for timesteps in np.sum(score_final,axis=2):
+        for ni in timesteps:
+            Fr_batch.write(str(ni)+"\n")
+    Fr_batch.close()
+    """
diff --git a/explain/ig_seq.py b/explain/ig_seq.py
new file mode 100755
index 0000000..53e3962
--- /dev/null
+++ b/explain/ig_seq.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python
+import sys,os,ig,argparse
+from Bio import SeqIO
+import numpy as np
+from tensorflow.keras.models import load_model,Model
+# Build the command-line parser for ig_seq.py.
+def get_parser():
+    parser = argparse.ArgumentParser(description="ig_seq.py",formatter_class=argparse.RawTextHelpFormatter)
+
+    parser.add_argument('--fasta',dest="fasta")
+    parser.add_argument('--win',dest="win",type=int) #choices=["fwd","rev"]
+    parser.add_argument('--chrome_size',dest="chrome_size")
+    parser.add_argument('--h5',dest="h5")#help=""
+    parser.add_argument('--select',dest="select",choices=["random","shuffle"],default="random")
+    parser.add_argument("--num_steps",dest="num_steps",type=int,default="50")
+    parser.add_argument("--num_runs",dest="num_runs",type=int,default="2")
+    parser.add_argument("--prefix",dest="prefix")
+    return parser
+if __name__ == '__main__':
+    parser=get_parser()
+    # No arguments: print usage and exit.
+    if len(sys.argv)==1:
+        parser.print_help()
+        sys.exit()
+    else:
+        Args = parser.parse_args()
+    fasta=Args.fasta
+    win=Args.win
+    chrome_size=Args.chrome_size
+    h5=Args.h5
+    select=Args.select
+    num_steps=Args.num_steps
+    num_runs=Args.num_runs
+    prefix=Args.prefix
+    Fr_log=open(prefix+".log","w")
+    Fr_log.write(" ".join(sys.argv)+"\n")
+    Fr_bdg=open(prefix+"_imscore.bdg","w")
+    # One-hot base encoding, order A/T/C/G; unknown bases map to all zeros.
+    one_hot={"A":[1,0,0,0],"T":[0,1,0,0],"C":[0,0,1,0],"G":[0,0,0,1],"N":[0,0,0,0]}
+    dseq=SeqIO.index(fasta,"fasta")
+    model = load_model(h5,{'Rsquare':ig.Rsquare})
+    inputs=model.layers[0].input
+    outputs=model.get_layer("rloop_regression").output
+    fModel=Model(inputs=inputs,outputs=outputs)
+    # Tile the genome into fixed windows via bedtools; requires bedtools on PATH.
+    lines=os.popen("bedtools makewindows -g %s -w %s "%(chrome_size,win)).readlines()
+    igrads_matrix=[] #for heatmap
+    imscore_matrix=[] #for seqlogo
+    for x in lines:
+        x=x.rstrip()
+        l=x.split("\t")
+        seq=dseq[l[0]][int(l[1]):int(l[2])]
+        ss=[]
+        for s in seq.upper():
+            if s in "ATCG":
+                ss.append(one_hot[s])
+            else:
+                ss.append(one_hot["N"])
+        if len(ss)
+# NOTE(review): the patch text is corrupted/truncated here. Everything between
+# "if len(ss)" (presumably "if len(ss) < win: ..." padding logic, the rest of
+# explain/ig_seq.py, its diff footer, and the diff header plus most of
+# explain/max_activation_seq.py) has been lost — apparently stripped as if it
+# were an HTML tag. The lines below are the orphaned tail of
+# max_activation_seq.py's __main__ loop; seq_name, score, pwm, pwmd, Fr_seq,
+# Fr_score, Fr_maxac and max_act_value are all defined in the missing portion.
+# Recover the original patch before attempting to apply it.
+"+seq_name+"\n")
+        Fr_seq.write(seq+"\n")
+        for i,s in enumerate(score):
+            Fr_score.write(seq_name+"\t"+str(i)+"\t"+str(i+1)+"\t"+str(s)+"\n")
+        pwmd[seq_name]=pwm[0]
+        Fr_maxac.write(seq_name+"_maxac"+"\t"+str(max_act_value)+"\n")
+        Fr_maxac.flush()
+        #for i,s in enumerate(r[0][0]):
+        #    st=i*scale_win
+        #    ed=st+scale_win
+        #    Fr_rloop.write(prefix+"\t"+str(st)+"\t"+str(ed)+"\t"+str(s[0])+"\n")
+        # Free graph/session memory between windows to avoid growth in the loop.
+        K.clear_session()
+        #break
+    Fr_seq.close()
+    Fr_score.close()
+    Fr_maxac.close()
+    #Fr_rloop.close()
+    # Save all per-window PWMs keyed by sequence name.
+    np.savez_compressed(prefix+"_maxacseq_pwm.npz",**pwmd)