Requirements
tensorflow==1.15.2
tensor2tensor==1.14
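The data-loading step below also pulls the dataset from Azure ML, so the Azure ML SDK (azureml-core) is needed as well. A minimal environment setup, assuming pip (exact builds may vary):

pip install tensorflow==1.15.2 tensor2tensor==1.14 azureml-core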
Import Packages
import os
import re
import shutil

import numpy as np
import pandas as pd
import tensorflow as tf

from azureml.core import Workspace, Dataset  # used to pull the SIG dataset below
from tensor2tensor import problems
from tensor2tensor.data_generators import problem
from tensor2tensor.data_generators import text_problems
from tensor2tensor.utils import registry
Enable TF Eager execution
tfe = tf.contrib.eager
tfe.enable_eager_execution()
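As an optional sanity check, you can confirm that eager mode is actually active before going further:

print(tf.executing_eagerly())  # should print True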
Other setup
Modes = tf.estimator.ModeKeys
#%%
Required Folder Creation
HOME_PATH = "C:/Raju/Translation_model/Translation_t2t/t2t/"
data_dir = os.path.expanduser(HOME_PATH + "data")  # This folder contains the training data
tmp_dir = os.path.expanduser(HOME_PATH + "tmp")  # This folder contains temporary data, if any
train_dir = os.path.expanduser(HOME_PATH + "train")  # This folder contains the model checkpoints
export_dir = os.path.expanduser(HOME_PATH + "export")  # This folder contains the exported model for production
translations_dir = os.path.expanduser(HOME_PATH + "translation")  # This folder contains all translated sequences
event_dir = os.path.expanduser(HOME_PATH + "event")  # This folder contains evaluation events (e.g., for BLEU scoring)
usr_dir = os.path.expanduser(HOME_PATH + "user")  # This folder contains the custom data we want to add
checkpoint_dir = os.path.expanduser(HOME_PATH + "checkpoints")  # This folder contains pretrained checkpoints
#%%
# Remove data and checkpoints left over from previous runs (ignore_errors
# keeps this from failing when the folders do not exist yet)
shutil.rmtree(data_dir, ignore_errors=True)
shutil.rmtree(train_dir, ignore_errors=True)
#%%
Creating folders
tf.io.gfile.makedirs(data_dir)
tf.io.gfile.makedirs(tmp_dir)
tf.io.gfile.makedirs(export_dir)
tf.io.gfile.makedirs(translations_dir)
tf.io.gfile.makedirs(train_dir)
tf.io.gfile.makedirs(event_dir)
tf.io.gfile.makedirs(usr_dir)
tf.io.gfile.makedirs(checkpoint_dir)
#%%
@registry.register_problem
class translationsig(text_problems.Text2TextProblem):
  """Predict the RX SIG from a standardized SIG."""

  @property
  def approx_vocab_size(self):
    return 2**14  # ~16k subword tokens

  @property
  def is_generate_per_split(self):
    # generate_data will shard the data into TRAIN and EVAL for us.
    return True

  @property
  def dataset_splits(self):
    """Splits of data to produce and number of output shards for each."""
    # 80% training data, 20% evaluation data
    return [{
        "split": problem.DatasetSplit.TRAIN,
        "shards": 8,
    }, {
        "split": problem.DatasetSplit.EVAL,
        "shards": 2,
    }]
  def generate_samples(self, data_dir, tmp_dir, dataset_split):
    del data_dir
    del tmp_dir
    del dataset_split
    # Pull the SIG dataset from Azure ML. subscription_id, resource_group and
    # workspace_name must be set to your own Azure ML workspace values.
    workspace = Workspace(subscription_id, resource_group, workspace_name)
    dataset = Dataset.get_by_name(workspace, name='Sample_DL')
    sig_data = dataset.to_pandas_dataframe()
    print(sig_data.shape)

    ## Data pre-processing
    ## Select the source and target columns and drop duplicate pairs
    sig_data = sig_data[['Standardized_SIG', 'IC+_Pharmacist SIG']]
    sig_data = sig_data.drop_duplicates()
    # Drop missing values before applying the string operations below
    sig_data = sig_data.dropna()

    ## Clean the column names so they are valid attribute names
    sig_data.columns = sig_data.columns.str.replace(' ', '_')
    sig_data.columns = sig_data.columns.str.replace('+', '', regex=False)

    ## Data cleaning: drop the trailing character/quote and surrounding whitespace
    sig_data.Standardized_SIG = sig_data.Standardized_SIG.map(lambda x: re.sub(r'.$', "", x))
    sig_data.IC_Pharmacist_SIG = sig_data.IC_Pharmacist_SIG.map(lambda x: re.sub(r'"$', "", x))
    sig_data.Standardized_SIG = sig_data.Standardized_SIG.str.strip()
    sig_data.IC_Pharmacist_SIG = sig_data.IC_Pharmacist_SIG.str.strip()

    ## Normalize casing
    sig_data.Standardized_SIG = sig_data.Standardized_SIG.str.upper()
    sig_data.IC_Pharmacist_SIG = sig_data.IC_Pharmacist_SIG.str.upper()

    ## Drop empty or punctuation-only targets
    sig_data = sig_data[sig_data.IC_Pharmacist_SIG != '.']
    sig_data = sig_data[sig_data.IC_Pharmacist_SIG != '']
    sig_data = sig_data[sig_data.IC_Pharmacist_SIG != '...']

    ## Normalize common target tokens
    sig_data.IC_Pharmacist_SIG = sig_data.IC_Pharmacist_SIG.apply(lambda x: re.sub(r'\bTAKE\b', "Tk", x))
    sig_data.IC_Pharmacist_SIG = sig_data.IC_Pharmacist_SIG.apply(lambda x: re.sub(r'\bONE\b', "1", x))
    print("Final shape: {}".format(sig_data.shape))

    for sig in range(len(sig_data)):
      yield {
          "inputs": sig_data.Standardized_SIG.iloc[sig],
          "targets": sig_data.IC_Pharmacist_SIG.iloc[sig],
      }
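The decorator registers the class under its class name, so an optional sanity check is to confirm the problem is now discoverable by string:

assert "translationsig" in problems.available()  # registered by the decorator above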
#%%
Model Name and Parameter Selection
PROBLEM = "translationsig"    # Custom ESIG translation problem
MODEL = "transformer"         # Our model
HPARAMS = "transformer_base"  # Default hyperparameters for the model
# If you have only one GPU, use transformer_big_single_gpu instead
#%%
Setup helper functions for encoding and decoding
# Note: both helpers use the global `encoders`, created after data generation below.
def encode(input_str, output_str=None):
  """Input str to features dict, ready for inference."""
  inputs = encoders["inputs"].encode(input_str) + [1]  # add EOS id
  batch_inputs = tf.reshape(inputs, [1, -1, 1])  # Make it 3D.
  return {"inputs": batch_inputs}

def decode(integers):
  """List of ints to str."""
  integers = list(np.squeeze(integers))
  if 1 in integers:
    integers = integers[:integers.index(1)]  # truncate at EOS
  return encoders["inputs"].decode(np.squeeze(integers))
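Once `encoders` exists (after the data-generation cell below has been run), the helpers can be exercised with a quick round trip; this is only an illustrative check:

# Example round trip (run after `encoders` is built below):
#   features = encode("TAKE 1 TABLET 2 TIMES DAILY")
#   print(decode(features["inputs"]))  # should reproduce the input string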
#%%
Data Generation
print('Generating data')
t2t_problem = problems.problem(PROBLEM)
t2t_problem.generate_data(data_dir, tmp_dir)
print("Data generated.")
#%%
Get the encoders from the problem
encoders = t2t_problem.feature_encoders(data_dir)
example = next(tfe.Iterator(t2t_problem.dataset(Modes.TRAIN, data_dir)))
inputs = [int(x) for x in example["inputs"].numpy()]    # Cast to ints.
targets = [int(x) for x in example["targets"].numpy()]  # Cast to ints.

# Example inputs as an int-tensor.
print("Inputs, encoded:")
print(inputs)
# Example inputs as a sentence.
print("Inputs, decoded:")
print(decode(inputs))
# Example targets as an int-tensor.
print("Targets, encoded:")
print(targets)
# Example targets as a sentence.
print("Targets, decoded:")
print(decode(targets))
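It can also be useful to confirm the learned subword vocabulary landed near the requested ~16k (optional):

print(encoders["inputs"].vocab_size)  # should be close to approx_vocab_size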
#%%
from tensor2tensor import models  # importing this module registers all T2T models
# print(problems.available())  # Show all problems
print(registry.list_models())  # Show all registered models
#%%
from tensor2tensor.utils.trainer_lib import create_hparams
Init Hparams object from T2T Problem
hparams = create_hparams(HPARAMS)
# Optionally tie hparams to the problem: create_hparams(HPARAMS, data_dir=data_dir, problem_name=PROBLEM)
print(hparams.to_json())
#%%
Make Changes to Hparams
hparams.batch_size = 500  # for text problems, T2T measures batch size in tokens, not examples
hparams.learning_rate_warmup_steps = 4500
hparams.learning_rate = 0.4
save_checkpoints_steps = 1000  # used by the run config below
print(hparams.to_json())
#%%
from tensor2tensor.utils.trainer_lib import create_run_config, create_experiment
Init Run Config for Model Training
RUN_CONFIG = create_run_config(
    model_dir=train_dir,  # Location where the model checkpoints are stored
    model_name=MODEL,
    save_checkpoints_steps=save_checkpoints_steps  # How often to save checkpoints
    # More parameters are available in this function for controlling checkpointing and other behavior.
)
# Create Tensorflow Experiment Object
tensorflow_exp_fn = create_experiment(
    run_config=RUN_CONFIG,
    hparams=hparams,
    model_name=MODEL,
    problem_name=PROBLEM,
    data_dir=data_dir,
    train_steps=400,  # Total number of training steps across all epochs
    eval_steps=100    # Number of steps to run for each evaluation
)
tensorflow_exp_fn.train_and_evaluate()
#%%
Build the Translation Model in PREDICT Mode
translate_model = registry.model(MODEL)(hparams, Modes.PREDICT)
#%%
Restore the Latest Checkpoint for Inference
# The vocab file lives in data_dir and is already loaded via the encoders above.
vocab_name = "vocab.sig_translator.16384.subwords"
vocab_file = os.path.join(data_dir, vocab_name)
# A pretrained checkpoint (e.g., "transformer_esig" under checkpoint_dir) could
# also be restored here; we use the latest checkpoint written by training above.
ckpt_path = tf.train.latest_checkpoint(train_dir)
print(ckpt_path)
#%%
def translate(inputs):
  encoded_inputs = encode(inputs)
  with tfe.restore_variables_on_create(ckpt_path):
    model_output = translate_model.infer(encoded_inputs)["outputs"]
  return decode(model_output)
#%%
inputs = "TAKE 1 TABLET 2 TIMES DAILY"
outputs = translate(inputs)
print("Inputs: %s" % inputs)
print("Outputs: %s" % outputs)