Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 9 additions & 6 deletions data_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,15 @@
import argparse

# only need to run once for synonym generator
import nltk
nltk.download('wordnet')

# The domesticviolence and survivorsofabuse subreddits will be class 0, critical; these are personal stories, calls for help, requests for advice
# The abuseInterrupted subreddit will be class 1, noncritical; it mostly contains empty text, links to articles, general statements about abuse, etc.
# Everything else will be class 2, general/unrelated
# note: don't need this unless we augment again and it throws errors
# import nltk
# nltk.download('wordnet')

'''
The domesticviolence and survivorsofabuse subreddits will be class 0, critical; these are personal stories, calls for help, requests for advice
The abuseInterrupted subreddit will be class 1, noncritical; it mostly contains empty text, links to articles, general statements about abuse, etc.
Everything else will be class 2, general/unrelated
'''

CLASSES = {
'relationship_advice': 2, # 5874 samples
Expand Down
80 changes: 6 additions & 74 deletions gru.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,10 @@
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from tqdm import trange

from data_loader import load_data
from utils import build_vocab, make_minibatch, strings_to_tensors
from utils import build_vocab, strings_to_tensors, train_model, evaluate_model


class GRU(nn.Module):
Expand Down Expand Up @@ -36,7 +34,6 @@ def forward(self, input_seq):


if __name__ == "__main__":

    # If there's an available GPU, let's train on it
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Expand Down Expand Up @@ -77,73 +74,8 @@ def forward(self, input_seq):
loss_func = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

"""
Train the model
"""
epochs = trange(1, n_epochs + 1)
for epoch in epochs:
model.train()

# Accumulate loss for all samples in this epoch
running_loss = 0
num_batches = 0
num_samples = len(train_data)

shuffled_indices = torch.randperm(num_samples)

# Gradient descent algorithm for each data sample
batches = trange(0, num_samples - bs, bs, leave=False)
for count in batches:
# Extract minibatches and send to device
indices = shuffled_indices[count: count + bs]
minibatch_data, minibatch_label = make_minibatch(
indices, train_data, train_labels)
minibatch_data, minibatch_label = minibatch_data.to(
device), minibatch_label.to(device)

# Make predictions on the training data
scores = model(minibatch_data)

# Backpropagation
loss = loss_func(scores, minibatch_label)
loss.backward()
optimizer.step()
optimizer.zero_grad()

# Compute metrics
with torch.no_grad():
num_batches += 1
running_loss += loss.item()

epochs.set_description(
f'Epoch {epoch} / {n_epochs} | Loss: {running_loss/num_batches}')

"""
Evaluate the model
"""
with torch.no_grad():
model.eval()
predicted_labels = []

# cycle through test set in batches
batches = trange(0, len(test_data) - bs, bs,
desc='evaluating on test set', leave=False)
for i in batches:
# extract minibatch
indices = torch.arange(i, i + bs)
minibatch_data, _ = make_minibatch(indices, test_data, test_labels)
minibatch_data = minibatch_data.to(device)

# make and score predictions
scores = model(minibatch_data)
predicted_labels.extend(scores.argmax(dim=1).tolist())

# evaluate remaining samples
indices = torch.arange(len(predicted_labels), len(test_labels))
minibatch_data, _ = make_minibatch(indices, test_data, test_labels)
minibatch_data = minibatch_data.to(device)

scores = model(minibatch_data)
predicted_labels.extend(scores.argmax(dim=1).tolist())

print(classification_report(y_true=test_labels, y_pred=predicted_labels))
train_model(model=model, n_epochs=n_epochs, train_data=train_data, train_labels=train_labels,
bs=bs, device=device, loss_func=loss_func, optimizer=optimizer)

evaluate_model(model=model, test_data=test_data, test_labels=test_labels,
bs=bs, device=device)
86 changes: 8 additions & 78 deletions lstm.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,10 @@
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from tqdm import trange

from data_loader import load_data
from utils import build_vocab, make_minibatch, strings_to_tensors
from utils import build_vocab, strings_to_tensors, train_model, evaluate_model


class LSTM(nn.Module):
Expand Down Expand Up @@ -46,17 +44,16 @@ def forward(self, input_seq):


if __name__ == "__main__":

    # If there's an available GPU, let's train on it
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

"""
Load the training, validation, and test data
"""
posts, labels = load_data(
og_file_path='data/reddit_submissions.csv',
aug_file_path='data/synonym_augmented_reddit_submissions.csv',
include_og=True,
og_file_path='data/reddit_submissions.csv',
aug_file_path='data/synonym_augmented_reddit_submissions.csv',
include_og=True,
include_aug=True)

posts_train, posts_test, train_labels, test_labels = train_test_split(
Expand Down Expand Up @@ -91,78 +88,11 @@ def forward(self, input_seq):
loss_func = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

"""
Train the model
"""
epochs = trange(1, n_epochs + 1)
for epoch in epochs:
model.train()

# Accumulate loss for all samples in this epoch
running_loss = 0
num_batches = 0
num_samples = len(train_data)

shuffled_indices = torch.randperm(num_samples)

# Gradient descent algorithm for each data sample
batches = trange(0, num_samples - bs, bs, leave=False)
for count in batches:
# Extract minibatches and send to GPU
indices = shuffled_indices[count : count + bs]
minibatch_data, minibatch_label = make_minibatch(
indices, train_data, train_labels)
minibatch_data, minibatch_label = minibatch_data.to(
device), minibatch_label.to(device)

            # Make predictions on the training data
scores = model(minibatch_data)

# Backpropagation
loss = loss_func(scores, minibatch_label)
loss.backward()
optimizer.step()
optimizer.zero_grad()

# Compute metrics
with torch.no_grad():
num_batches += 1
running_loss += loss.item()

epochs.set_description(
f'Epoch {epoch}/{n_epochs} | Loss: {running_loss/num_batches}')

"""
Evaluate the model
"""
with torch.no_grad():
model.eval()
predicted_labels = []

# cycle through test set in batches
batches = trange(0, len(test_data) - bs, bs,
desc='evaluating on test set', leave=False)
for i in batches:
# extract minibatch
indices = torch.arange(i, i + bs)
minibatch_data, _ = make_minibatch(indices, test_data, test_labels)
minibatch_data = minibatch_data.to(device)

# make and score predictions
scores = model(minibatch_data)
predicted_labels.extend(scores.argmax(dim=1).tolist())

# evaluate remaining samples
indices = torch.arange(len(predicted_labels), len(test_labels))
minibatch_data, _ = make_minibatch(indices, test_data, test_labels)
minibatch_data = minibatch_data.to(device)

scores = model(minibatch_data)
predicted_labels.extend(scores.argmax(dim=1).tolist())

print(classification_report(y_true=test_labels, y_pred=predicted_labels))

train_model(model=model, n_epochs=n_epochs, train_data=train_data, train_labels=train_labels,
bs=bs, device=device, loss_func=loss_func, optimizer=optimizer)

evaluate_model(model=model, test_data=test_data, test_labels=test_labels,
bs=bs, device=device)

"""
Output:
Expand Down
80 changes: 6 additions & 74 deletions rnn.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,10 @@
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from tqdm import trange

from data_loader import load_data
from utils import build_vocab, make_minibatch, strings_to_tensors
from utils import build_vocab, strings_to_tensors, train_model, evaluate_model


class RNN(nn.Module):
Expand Down Expand Up @@ -38,7 +36,6 @@ def forward(self, input_seq):


if __name__ == "__main__":

    # If there's an available GPU, let's train on it
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Expand Down Expand Up @@ -79,73 +76,8 @@ def forward(self, input_seq):
loss_func = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

"""
Train the model
"""
epochs = trange(1, n_epochs + 1)
for epoch in epochs:
model.train()

# Accumulate loss for all samples in this epoch
running_loss = 0
num_batches = 0
num_samples = len(train_data)

shuffled_indices = torch.randperm(num_samples)

# Gradient descent algorithm for each data sample
batches = trange(0, num_samples - bs, bs, leave=False)
for count in batches:
# Extract minibatches and send to device
indices = shuffled_indices[count: count + bs]
minibatch_data, minibatch_label = make_minibatch(
indices, train_data, train_labels)
minibatch_data, minibatch_label = minibatch_data.to(
device), minibatch_label.to(device)

# Make predictions on the training data
scores = model(minibatch_data)

# Backpropagation
loss = loss_func(scores, minibatch_label)
loss.backward()
optimizer.step()
optimizer.zero_grad()

# Compute metrics
with torch.no_grad():
num_batches += 1
running_loss += loss.item()

epochs.set_description(
f'Epoch {epoch} / {n_epochs} | Loss: {running_loss/num_batches}')

"""
Evaluate the model
"""
with torch.no_grad():
model.eval()
predicted_labels = []

# cycle through test set in batches
batches = trange(0, len(test_data) - bs, bs,
desc='evaluating on test set', leave=False)
for i in batches:
# extract minibatch
indices = torch.arange(i, i + bs)
minibatch_data, _ = make_minibatch(indices, test_data, test_labels)
minibatch_data = minibatch_data.to(device)

# make and score predictions
scores = model(minibatch_data)
predicted_labels.extend(scores.argmax(dim=1).tolist())

# evaluate remaining samples
indices = torch.arange(len(predicted_labels), len(test_labels))
minibatch_data, _ = make_minibatch(indices, test_data, test_labels)
minibatch_data = minibatch_data.to(device)

scores = model(minibatch_data)
predicted_labels.extend(scores.argmax(dim=1).tolist())

print(classification_report(y_true=test_labels, y_pred=predicted_labels))
train_model(model=model, n_epochs=n_epochs, train_data=train_data, train_labels=train_labels,
bs=bs, device=device, loss_func=loss_func, optimizer=optimizer)

evaluate_model(model=model, test_data=test_data, test_labels=test_labels,
bs=bs, device=device)
Loading