-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathemb_modulus.py
More file actions
166 lines (122 loc) · 4.72 KB
/
Copy pathemb_modulus.py
File metadata and controls
166 lines (122 loc) · 4.72 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
import tensorflow.contrib.layers as lays
import multiprocessing as mp
import tensorflow as tf
import pandas as pd
import numpy as np
from tqdm import tqdm
import pyemblib
import scipy
import time
import sys
import os
'''
emb_modulus.py
Compute and print the average vector norm given the location of a
pretrained embedding.
'''
#========1=========2=========3=========4=========5=========6=========7==
# RETURNS: a list of the script arguments
def parse_args():
    """Collect the script's command-line arguments.

    Returns:
        A one-element list ``[emb_path]`` where ``emb_path`` is the first
        positional argument (``sys.argv[1]``). Any further arguments are
        ignored.

    Raises:
        SystemExit: if no embedding path was given. The usage message is
            passed to ``sys.exit`` so it is reported instead of a bare
            ``IndexError`` traceback.
    """
    if len(sys.argv) < 2:
        # A string argument makes sys.exit print it to stderr and use a
        # non-zero exit status.
        sys.exit("usage: python emb_modulus.py <emb_path>")
    emb_path = sys.argv[1]
    return [emb_path]
#========1=========2=========3=========4=========5=========6=========7==
def check_valid_dir(some_dir):
    """Abort the program unless ``some_dir`` is an existing directory.

    Args:
        some_dir: path tested with ``os.path.isdir``.

    Returns:
        None when the directory exists.

    Raises:
        SystemExit: with exit status 1, after printing an error banner,
            when ``some_dir`` is not a directory.
    """
    if os.path.isdir(some_dir):
        return

    banner = "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
    # The message is German for "This is an invalid directory".
    lines = (
        banner, banner, banner,
        "",
        "DIES IST EIN UNGÜLTIGES VERZEICHNIS!!!!",
        "",
        banner, banner, banner,
    )
    for line in lines:
        print(line)
    # sys.exit rather than the site-module exit(): it is always available
    # and lets us report failure with a non-zero status.
    sys.exit(1)
#========1=========2=========3=========4=========5=========6=========7==
def check_valid_file(some_file):
    """Abort the program unless ``some_file`` is an existing regular file.

    Args:
        some_file: path tested with ``os.path.isfile``.

    Returns:
        None when the file exists.

    Raises:
        SystemExit: with exit status 1, after printing an error banner,
            when ``some_file`` is not a file.
    """
    if os.path.isfile(some_file):
        return

    banner = "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
    # The message is German for "This is not a valid location for files".
    lines = (
        banner, banner, banner,
        "",
        "DIES IST KEIN GÜLTIGER SPEICHERORT FÜR DATEIEN!!!!",
        "",
        banner, banner, banner,
    )
    for line in lines:
        print(line)
    # sys.exit rather than the site-module exit(): it is always available
    # and lets us report failure with a non-zero status.
    sys.exit(1)
#========1=========2=========3=========4=========5=========6=========7==
# RETURNS: a tuple of the vectors matrix (numpy array) and the token labels (pandas Series)
def process_embedding(emb_path):
    """Load a pretrained embedding and split it into vectors and labels.

    The reader mode is picked from the last character of ``emb_path``:
    a path ending in ``n`` is read as a binary embedding and one ending
    in ``t`` as a text embedding (presumably ``.bin`` / ``.txt`` files --
    confirm against the embeddings actually used).

    Args:
        emb_path: path to the embedding file, as a string.

    Returns:
        A tuple ``(vectors_matrix, labels)``. ``vectors_matrix`` is the
        numpy array obtained from a DataFrame built from the embedding's
        values; ``labels`` is the pandas Series holding the embedding's
        keys (the ``"index"`` column created by ``reset_index``).

    Raises:
        SystemExit: with exit status 1 if ``emb_path`` ends in neither
            ``n`` nor ``t`` (including an empty path).
    """
    print("Preprocessing. ")

    # Read the embedding as a dict-like object whose keys are the tokens
    # and whose values are the vector components. endswith() is used
    # instead of indexing the last character so an empty path is reported
    # as unsupported rather than raising IndexError.
    if emb_path.endswith('n'):
        embedding = pyemblib.read(emb_path, mode=pyemblib.Mode.Binary)
    elif emb_path.endswith('t'):
        embedding = pyemblib.read(emb_path, mode=pyemblib.Mode.Text)
    else:
        print("Unsupported embedding format. ")
        # Non-zero status so callers and shells can detect the failure.
        sys.exit(1)

    # "words_with_friends" is the column label for the vectors. After
    # reset_index() the tokens live in the "index" column and each vector
    # sits in a single cell of the "words_with_friends" column.
    emb_df = pd.Series(embedding, name="words_with_friends").reset_index()

    # Expand the per-row vectors into one column per component.
    emb_matrix = emb_df.words_with_friends.values.tolist()
    vectors_df = pd.DataFrame(emb_matrix, index=emb_df.index)

    # DataFrame.as_matrix() was removed in pandas 1.0; .values yields the
    # same ndarray and is available in old and new pandas alike.
    vectors_matrix = vectors_df.values

    return vectors_matrix, emb_df.loc[:, "index"]
#========1=========2=========3=========4=========5=========6=========7==
def compute_modulus(vectors_matrix):
    """Return the average norm of the rows of ``vectors_matrix``.

    Args:
        vectors_matrix: 2-D array; the norm is taken along axis 1, i.e.
            one norm per row.

    Returns:
        The ``np.average`` of the per-row norms. The per-row norms are
        also printed, prefixed with ``norm_array:``.
    """
    row_norms = np.linalg.norm(vectors_matrix, axis=1)
    print("norm_array:", row_norms)
    return np.average(row_norms)
#========1=========2=========3=========4=========5=========6=========7==
def runflow(emb_path, unit_norm=True):
    """Load an embedding and print the average norm of its vectors.

    Args:
        emb_path: path to the embedding file; validated with
            ``check_valid_file`` before loading.
        unit_norm: when True (the default, matching the previous
            hard-coded behaviour) every vector is scaled to unit length
            before the average norm is computed.
            NOTE(review): with unit norming enabled the reported average
            is ~1.0 by construction; pass ``unit_norm=False`` to measure
            the norms of the embedding as stored.

    Returns:
        None. The matrix shape and the average norm are printed.
    """
    # PREPROCESSING
    check_valid_file(emb_path)
    vectors_matrix, _ = process_embedding(emb_path)

    print("Shape of embedding matrix: ", vectors_matrix.shape)

    if unit_norm:
        print("Unit norming the embedding. ")
        norms_matrix = np.linalg.norm(vectors_matrix, axis=1)
        # All-zero rows have norm 0; dividing by 1 instead leaves them as
        # zero vectors rather than producing NaNs that would poison the
        # average.
        norms_matrix[norms_matrix == 0] = 1
        vectors_matrix = vectors_matrix / np.expand_dims(norms_matrix, -1)

    modulus = compute_modulus(vectors_matrix)
    print("The average vector norm is: ", modulus)
    return
#========1=========2=========3=========4=========5=========6=========7==
if __name__ == "__main__":
    # Script entry point: runs only when the file is executed directly,
    # not when it is imported. The embedding path is the first (and only)
    # element of the list returned by parse_args().
    emb_path = parse_args()[0]
    runflow(emb_path)