diff --git a/data_loader.py b/data_loader.py index a116cbe..10c351c 100644 --- a/data_loader.py +++ b/data_loader.py @@ -9,12 +9,15 @@ import argparse # only need to run once for synonym generator -import nltk -nltk.download('wordnet') - -# The domesticviolence and survivorsofabuse subreddits will be class 0, critical; these are personal stories, calls for help, requests for advice -# The abuseInterrupted subreddit will be class 1, noncritical; it mostly contains empty text, links to articles, general statements about abuse, etc. -# Everything else will be class 2, general/unrelated +# note: don't need this unless we augment again and it throws errors +# import nltk +# nltk.download('wordnet') + +''' +The domesticviolence and survivorsofabuse subreddits will be class 0, critical; these are personal stories, calls for help, requests for advice +The abuseInterrupted subreddit will be class 1, noncritical; it mostly contains empty text, links to articles, general statements about abuse, etc. 
+Everything else will be class 2, general/unrelated +''' CLASSES = { 'relationship_advice': 2, # 5874 samples diff --git a/gru.py b/gru.py index 63a529b..74907ff 100644 --- a/gru.py +++ b/gru.py @@ -2,12 +2,10 @@ import torch import torch.nn as nn import torch.optim as optim -from sklearn.metrics import classification_report from sklearn.model_selection import train_test_split -from tqdm import trange from data_loader import load_data -from utils import build_vocab, make_minibatch, strings_to_tensors +from utils import build_vocab, strings_to_tensors, train_model, evaluate_model class GRU(nn.Module): @@ -36,7 +34,6 @@ def forward(self, input_seq): if __name__ == "__main__": - # If there's an available GPU, lets train on it device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') @@ -77,73 +74,8 @@ def forward(self, input_seq): loss_func = nn.CrossEntropyLoss() optimizer = optim.SGD(model.parameters(), lr=learning_rate) - """ - Train the model - """ - epochs = trange(1, n_epochs + 1) - for epoch in epochs: - model.train() - - # Accumulate loss for all samples in this epoch - running_loss = 0 - num_batches = 0 - num_samples = len(train_data) - - shuffled_indices = torch.randperm(num_samples) - - # Gradient descent algorithm for each data sample - batches = trange(0, num_samples - bs, bs, leave=False) - for count in batches: - # Extract minibatches and send to device - indices = shuffled_indices[count: count + bs] - minibatch_data, minibatch_label = make_minibatch( - indices, train_data, train_labels) - minibatch_data, minibatch_label = minibatch_data.to( - device), minibatch_label.to(device) - - # Make predictions on the training data - scores = model(minibatch_data) - - # Backpropagation - loss = loss_func(scores, minibatch_label) - loss.backward() - optimizer.step() - optimizer.zero_grad() - - # Compute metrics - with torch.no_grad(): - num_batches += 1 - running_loss += loss.item() - - epochs.set_description( - f'Epoch {epoch} / {n_epochs} | Loss: 
{running_loss/num_batches}') - - """ - Evaluate the model - """ - with torch.no_grad(): - model.eval() - predicted_labels = [] - - # cycle through test set in batches - batches = trange(0, len(test_data) - bs, bs, - desc='evaluating on test set', leave=False) - for i in batches: - # extract minibatch - indices = torch.arange(i, i + bs) - minibatch_data, _ = make_minibatch(indices, test_data, test_labels) - minibatch_data = minibatch_data.to(device) - - # make and score predictions - scores = model(minibatch_data) - predicted_labels.extend(scores.argmax(dim=1).tolist()) - - # evaluate remaining samples - indices = torch.arange(len(predicted_labels), len(test_labels)) - minibatch_data, _ = make_minibatch(indices, test_data, test_labels) - minibatch_data = minibatch_data.to(device) - - scores = model(minibatch_data) - predicted_labels.extend(scores.argmax(dim=1).tolist()) - - print(classification_report(y_true=test_labels, y_pred=predicted_labels)) + train_model(model=model, n_epochs=n_epochs, train_data=train_data, train_labels=train_labels, + bs=bs, device=device, loss_func=loss_func, optimizer=optimizer) + + evaluate_model(model=model, test_data=test_data, test_labels=test_labels, + bs=bs, device=device) diff --git a/lstm.py b/lstm.py index 761a924..daf8c09 100644 --- a/lstm.py +++ b/lstm.py @@ -2,12 +2,10 @@ import torch import torch.nn as nn import torch.optim as optim -from sklearn.metrics import classification_report from sklearn.model_selection import train_test_split -from tqdm import trange from data_loader import load_data -from utils import build_vocab, make_minibatch, strings_to_tensors +from utils import build_vocab, strings_to_tensors, train_model, evaluate_model class LSTM(nn.Module): @@ -46,7 +44,6 @@ def forward(self, input_seq): if __name__ == "__main__": - # If there's an available GPU, lets train on it device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') @@ -54,9 +51,9 @@ def forward(self, input_seq): Load the training, 
validation, and test data """ posts, labels = load_data( - og_file_path='data/reddit_submissions.csv', - aug_file_path='data/synonym_augmented_reddit_submissions.csv', - include_og=True, + og_file_path='data/reddit_submissions.csv', + aug_file_path='data/synonym_augmented_reddit_submissions.csv', + include_og=True, include_aug=True) posts_train, posts_test, train_labels, test_labels = train_test_split( @@ -91,78 +88,11 @@ def forward(self, input_seq): loss_func = nn.CrossEntropyLoss() optimizer = optim.SGD(model.parameters(), lr=learning_rate) - """ - Train the model - """ - epochs = trange(1, n_epochs + 1) - for epoch in epochs: - model.train() - - # Accumulate loss for all samples in this epoch - running_loss = 0 - num_batches = 0 - num_samples = len(train_data) - - shuffled_indices = torch.randperm(num_samples) - - # Gradient descent algorithm for each data sample - batches = trange(0, num_samples - bs, bs, leave=False) - for count in batches: - # Extract minibatches and send to GPU - indices = shuffled_indices[count : count + bs] - minibatch_data, minibatch_label = make_minibatch( - indices, train_data, train_labels) - minibatch_data, minibatch_label = minibatch_data.to( - device), minibatch_label.to(device) - - # Make a predictions on the training data - scores = model(minibatch_data) - - # Backpropagation - loss = loss_func(scores, minibatch_label) - loss.backward() - optimizer.step() - optimizer.zero_grad() - - # Compute metrics - with torch.no_grad(): - num_batches += 1 - running_loss += loss.item() - - epochs.set_description( - f'Epoch {epoch}/{n_epochs} | Loss: {running_loss/num_batches}') - - """ - Evaluate the model - """ - with torch.no_grad(): - model.eval() - predicted_labels = [] - - # cycle through test set in batches - batches = trange(0, len(test_data) - bs, bs, - desc='evaluating on test set', leave=False) - for i in batches: - # extract minibatch - indices = torch.arange(i, i + bs) - minibatch_data, _ = make_minibatch(indices, test_data, 
test_labels) - minibatch_data = minibatch_data.to(device) - - # make and score predictions - scores = model(minibatch_data) - predicted_labels.extend(scores.argmax(dim=1).tolist()) - - # evaluate remaining samples - indices = torch.arange(len(predicted_labels), len(test_labels)) - minibatch_data, _ = make_minibatch(indices, test_data, test_labels) - minibatch_data = minibatch_data.to(device) - - scores = model(minibatch_data) - predicted_labels.extend(scores.argmax(dim=1).tolist()) - - print(classification_report(y_true=test_labels, y_pred=predicted_labels)) - + train_model(model=model, n_epochs=n_epochs, train_data=train_data, train_labels=train_labels, + bs=bs, device=device, loss_func=loss_func, optimizer=optimizer) + evaluate_model(model=model, test_data=test_data, test_labels=test_labels, + bs=bs, device=device) """ Output: diff --git a/rnn.py b/rnn.py index f46e215..c3b020b 100644 --- a/rnn.py +++ b/rnn.py @@ -2,12 +2,10 @@ import torch import torch.nn as nn import torch.optim as optim -from sklearn.metrics import classification_report from sklearn.model_selection import train_test_split -from tqdm import trange from data_loader import load_data -from utils import build_vocab, make_minibatch, strings_to_tensors +from utils import build_vocab, strings_to_tensors, train_model, evaluate_model class RNN(nn.Module): @@ -38,7 +36,6 @@ def forward(self, input_seq): if __name__ == "__main__": - # If there's an available GPU, lets train on it device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') @@ -79,73 +76,8 @@ def forward(self, input_seq): loss_func = nn.CrossEntropyLoss() optimizer = optim.SGD(model.parameters(), lr=learning_rate) - """ - Train the model - """ - epochs = trange(1, n_epochs + 1) - for epoch in epochs: - model.train() - - # Accumulate loss for all samples in this epoch - running_loss = 0 - num_batches = 0 - num_samples = len(train_data) - - shuffled_indices = torch.randperm(num_samples) - - # Gradient descent algorithm for each data 
sample - batches = trange(0, num_samples - bs, bs, leave=False) - for count in batches: - # Extract minibatches and send to device - indices = shuffled_indices[count: count + bs] - minibatch_data, minibatch_label = make_minibatch( - indices, train_data, train_labels) - minibatch_data, minibatch_label = minibatch_data.to( - device), minibatch_label.to(device) - - # Make predictions on the training data - scores = model(minibatch_data) - - # Backpropagation - loss = loss_func(scores, minibatch_label) - loss.backward() - optimizer.step() - optimizer.zero_grad() - - # Compute metrics - with torch.no_grad(): - num_batches += 1 - running_loss += loss.item() - - epochs.set_description( - f'Epoch {epoch} / {n_epochs} | Loss: {running_loss/num_batches}') - - """ - Evaluate the model - """ - with torch.no_grad(): - model.eval() - predicted_labels = [] - - # cycle through test set in batches - batches = trange(0, len(test_data) - bs, bs, - desc='evaluating on test set', leave=False) - for i in batches: - # extract minibatch - indices = torch.arange(i, i + bs) - minibatch_data, _ = make_minibatch(indices, test_data, test_labels) - minibatch_data = minibatch_data.to(device) - - # make and score predictions - scores = model(minibatch_data) - predicted_labels.extend(scores.argmax(dim=1).tolist()) - - # evaluate remaining samples - indices = torch.arange(len(predicted_labels), len(test_labels)) - minibatch_data, _ = make_minibatch(indices, test_data, test_labels) - minibatch_data = minibatch_data.to(device) - - scores = model(minibatch_data) - predicted_labels.extend(scores.argmax(dim=1).tolist()) - - print(classification_report(y_true=test_labels, y_pred=predicted_labels)) + train_model(model=model, n_epochs=n_epochs, train_data=train_data, train_labels=train_labels, + bs=bs, device=device, loss_func=loss_func, optimizer=optimizer) + + evaluate_model(model=model, test_data=test_data, test_labels=test_labels, + bs=bs, device=device) diff --git a/utils.py b/utils.py index 
6309904..b285c20 100644 --- a/utils.py +++ b/utils.py @@ -1,5 +1,7 @@ import torch import torch.nn as nn +from sklearn.metrics import classification_report +from tqdm import trange def build_vocab(posts): @@ -52,3 +54,72 @@ def make_minibatch(indices, data, label): minibatch_label = torch.stack(minibatch_label, dim=0) return minibatch_data, minibatch_label + + +def train_model(model, n_epochs, train_data, train_labels, bs, device, loss_func, optimizer): + epochs = trange(1, n_epochs + 1) + for epoch in epochs: + model.train() + + # Accumulate loss for all samples in this epoch + running_loss = 0 + num_batches = 0 + num_samples = len(train_data) + + shuffled_indices = torch.randperm(num_samples) + + # Gradient descent algorithm for each data sample + batches = trange(0, num_samples - bs, bs, leave=False) + for count in batches: + # Extract minibatches and send to device + indices = shuffled_indices[count: count + bs] + minibatch_data, minibatch_label = make_minibatch( + indices, train_data, train_labels) + minibatch_data, minibatch_label = minibatch_data.to( + device), minibatch_label.to(device) + + # Make predictions on the training data + scores = model(minibatch_data) + + # Backpropagation + loss = loss_func(scores, minibatch_label) + loss.backward() + optimizer.step() + optimizer.zero_grad() + + # Compute metrics + with torch.no_grad(): + num_batches += 1 + running_loss += loss.item() + + epochs.set_description( + f'Epoch {epoch}/{n_epochs} | Loss: {running_loss / num_batches}') + + +def evaluate_model(model, test_data, test_labels, bs, device): + with torch.no_grad(): + model.eval() + predicted_labels = [] + + # cycle through test set in batches + batches = trange(0, len(test_data) - bs, bs, + desc='evaluating on test set', leave=False) + for i in batches: + # extract minibatch + indices = torch.arange(i, i + bs) + minibatch_data, _ = make_minibatch(indices, test_data, test_labels) + minibatch_data = minibatch_data.to(device) + + # make and score predictions + 
scores = model(minibatch_data) + predicted_labels.extend(scores.argmax(dim=1).tolist()) + + # evaluate remaining samples + indices = torch.arange(len(predicted_labels), len(test_labels)) + minibatch_data, _ = make_minibatch(indices, test_data, test_labels) + minibatch_data = minibatch_data.to(device) + + scores = model(minibatch_data) + predicted_labels.extend(scores.argmax(dim=1).tolist()) + + print(classification_report(y_true=test_labels, y_pred=predicted_labels))