-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathembed_preprocessing.py
More file actions
101 lines (80 loc) · 2.85 KB
/
embed_preprocessing.py
File metadata and controls
101 lines (80 loc) · 2.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 26 22:48:34 2015
@author: ameasure
"""
import numpy as np
import re
import cPickle as pickle
from gensim.models import Word2Vec
from msha_extractor import get_data
# Default location of the pretrained GoogleNews word2vec binary. The path is
# machine-specific, so callers on other machines should pass their own ``path``.
GOOGLE_NEWS_VECTORS_PATH = r'C:\Users\ameasure\Desktop\Programming Projects\theano_test\GoogleNews-vectors-negative300.bin'


def get_model(path=GOOGLE_NEWS_VECTORS_PATH):
    """Load a pretrained word2vec model from a binary word2vec file.

    Input: path to the binary word2vec file (defaults to the original
           hard-coded GoogleNews location)
    Output: whatever ``Word2Vec.load_word2vec_format(path, binary=True)``
            returns; the rest of this module only indexes it by word
    """
    # NOTE(review): newer gensim releases expose this loader on KeyedVectors
    # instead of Word2Vec -- confirm against the installed gensim version.
    return Word2Vec.load_word2vec_format(path, binary=True)
def get_simple_model(path='simple_model.pi'):
    """Load a pickled simple model from disk.

    Input: path of the pickle file (defaults to 'simple_model.pi', the file
           written by make_and_save_simple_model)
    Output: the unpickled object
    """
    # NOTE: unpickling can execute arbitrary code; only load files that were
    # produced locally by a trusted process.
    # The context manager guarantees the handle is closed even if loading fails.
    with open(path, 'rb') as model_file:
        return pickle.load(model_file)
def get_vocabulary(raw):
    """Collect the distinct tokens found in the 'NARRATIVE' field of every record.

    Input: object exposing ``to_dict(orient='records')`` whose records carry a
           'NARRATIVE' entry (presumably a pandas DataFrame -- confirm with caller)
    Output: set of every token produced by ``tokenize`` across all narratives
    """
    seen = set()
    for record in raw.to_dict(orient='records'):
        # set.update adds every token of this narrative in one call.
        seen.update(tokenize(record['NARRATIVE']))
    return seen
def make_simple_model(model, vocabulary):
    """Build a plain dict mapping each vocabulary word to an embedding vector.

    Input: model      -- word -> vector lookup that raises KeyError for
                         unknown words
           vocabulary -- iterable of words to include
    Output: dict with one entry per vocabulary word plus a 'BLANK_EMBEDDING'
            entry holding an all-zero float32 vector

    Words missing from ``model`` get a random embedding from
    ``get_random_embedding`` and a message is printed for each one.
    """
    simple_model = {}
    for word in vocabulary:
        try:
            vector = model[word]
        except KeyError:
            # Single-argument print(...) is valid in both Python 2 and 3.
            print('KeyError on %s, using random embedding instead' % word)
            vector = get_random_embedding()
        simple_model[word] = vector
    # Zero vector that vectorize() uses as padding. It is sized from EMBED_SIZE
    # so it cannot drift from the width used by get_random_embedding().
    simple_model['BLANK_EMBEDDING'] = np.zeros(EMBED_SIZE, dtype=np.float32)
    return simple_model
def make_and_save_simple_model():
    """Build the simple model from the training data and pickle it to disk.

    Reads the training data via ``get_data``, derives its vocabulary, looks
    each word up in the full word2vec model and writes the resulting dict to
    'simple_model.pi' in the current working directory. Returns nothing.
    """
    # n_test=0 with a very large n_train: presumably this requests every
    # available row as training data -- TODO confirm against get_data.
    train, _ = get_data(n_train=100000000, n_test=0)
    vocabulary = get_vocabulary(train)
    model = get_model()
    simple_model = make_simple_model(model=model, vocabulary=vocabulary)
    # Close (and therefore flush) the file deterministically instead of
    # relying on garbage collection of an anonymous handle.
    with open('simple_model.pi', 'wb') as model_file:
        pickle.dump(simple_model, model_file)
# Unicode-aware pattern matching maximal runs of word characters.
TOKEN_PATTERN = re.compile(r"(?u)\b\w+\b")


def tokenize(document):
    """Split ``document`` into the word tokens matched by TOKEN_PATTERN.

    Input: text to split
    Output: list of matched tokens in order of appearance (empty list when
            nothing matches)
    """
    return [match.group() for match in TOKEN_PATTERN.finditer(document)]
def vectorize(raw, model, pad_length=98):
    """Turn each record's 'NARRATIVE' text into one flat float32 embedding row.

    Input: raw        -- object exposing ``to_dict(orient='records')`` whose
                         records carry a 'NARRATIVE' entry (presumably a
                         pandas DataFrame -- confirm with caller)
           model      -- word -> vector lookup that also contains a
                         'BLANK_EMBEDDING' entry used for padding
           pad_length -- minimum number of word vectors per document; shorter
                         documents are padded up to it (default 98, the value
                         that used to be hard-coded)
    Output: 2-D array with one row per record, each row being the record's
            word vectors concatenated end to end

    Raises KeyError (from the lookup) when a token is missing from ``model``.
    Documents with more than ``pad_length`` tokens are NOT truncated, so a
    mix of such documents makes the rows unequal and ``np.vstack`` fails.
    """
    embedded_documents = []
    for row in raw.to_dict(orient='records'):
        vectors = [model[word] for word in tokenize(row['NARRATIVE'])]
        n_missing = pad_length - len(vectors)
        if n_missing > 0:
            # The blank vector is only looked up when padding is needed.
            vectors.extend([model['BLANK_EMBEDDING']] * n_missing)
        embedded_documents.append(np.hstack(vectors).astype(np.float32))
    return np.vstack(embedded_documents)
# Length of one embedding vector, and the mean / standard deviation of the
# normal distribution that random embeddings are drawn from (presumably
# measured on the pretrained vectors -- TODO confirm).
EMBED_SIZE = 300
EMBED_MEAN = -0.0017837381
EMBED_STD = 0.057699453


def get_random_embedding():
    """Draw one random embedding vector.

    Output: 1-D array of EMBED_SIZE samples from a normal distribution with
            mean EMBED_MEAN and standard deviation EMBED_STD
    """
    # Positional order of np.random.normal is (loc, scale, size).
    return np.random.normal(EMBED_MEAN, EMBED_STD, EMBED_SIZE)
"""
Input: tokenizer word index
Output: numpy array of shape (input_dim, output_dim) = (vocab_size, embedding_size)
"""
def get_initial_embeddings(word_index):
    """Build an initial embedding matrix for the words of a tokenizer word index.

    Input: tokenizer word index (dict mapping word -> integer index)
    Output: numpy array of shape (vocab_size, embedding_size) with one row
            per word, rows ordered by ascending index

    Words absent from the saved simple model receive a random embedding; the
    number of such words is printed once at the end.
    """
    model = get_simple_model()
    initial_embedding = []
    n_random = 0
    # NOTE(review): rows are positional (the i-th row belongs to the word with
    # the i-th smallest index). If indices start at 1, row 0 is index 1 --
    # confirm the consumer accounts for that offset.
    for word, _ in sorted(word_index.items(), key=lambda item: item[1]):
        try:
            embedding = model[word]
        except KeyError:
            n_random += 1
            embedding = get_random_embedding()
        initial_embedding.append(embedding)
    # Single-argument print(...) is valid in both Python 2 and 3.
    print('%d words received random embeddings because previously unseen' % n_random)
    return np.array(initial_embedding)