-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathPerplexity.py
More file actions
117 lines (60 loc) · 2.24 KB
/
Perplexity.py
File metadata and controls
117 lines (60 loc) · 2.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
__author__ = 'greglakomski'
# Calculates the perplexity of a topic model from its phi and theta matrices.
# NOTE: The first line of the wordmap file has to be deleted before running this script!
import csv
from numpy import genfromtxt
# input the phi data as array
from math import log, exp
# Input locations. The names are kept from the original script.
#   filepathin  - topic-by-word matrix (phi), one topic per row
#   filepathin2 - word map, one word per row (word is the first column)
#   filepathin3 - document-by-topic matrix (theta), one document per row
#   filepathin4 - corpus: first line starts with the document count, then
#                 one space-separated document per line
filepathin = "/Users/greglakomski/Desktop/GibbsLDA++-0.2/models/Medicare/EightyClustersStop/model-final.phi"
filepathin2 = "/Users/greglakomski/Desktop/GibbsLDA++-0.2/models/Medicare/EightyClustersStop/wordmap.txt"
filepathin3 = "/Users/greglakomski/Desktop/GibbsLDA++-0.2/models/Medicare/EightyClustersStop/model-final.theta"
filepathin4 = "/Users/greglakomski/Desktop/GibbsLDA++-0.2/models/Medicare/code_array4.csv"


def build_word_index(words):
    """Return a dict mapping each word to the position of its first occurrence.

    This replaces repeated ``list.index`` scans (linear per lookup) with a
    constant-time dictionary lookup while keeping the same "first match wins"
    result.
    """
    index = {}
    for position, word in enumerate(words):
        index.setdefault(word, position)
    return index


def load_word_index(path):
    """Read the word map at *path* and return ``{word: row position}``.

    The first line of the file is skipped and the first space-separated field
    of every following non-empty row is taken as the word.

    NOTE(review): the header comment of this script also asks for the first
    line of the word map to be deleted by hand. If both happen, the first word
    is dropped and every position shifts by one - confirm which is intended.
    NOTE(review): row position is used as the phi column for the word;
    presumably this matches the ids in the word map - verify against the file.
    """
    with open(path) as handle:
        reader = csv.reader(handle, delimiter=' ')
        next(reader, None)  # skip the first line; tolerate an empty file
        return build_word_index(row[0] for row in reader if row)


def split_corpus(lines):
    """Split the corpus file's lines into ``(num_docs, document_lines)``.

    ``lines[0]`` must start with the number of documents. The documents are
    the next ``num_docs`` lines, i.e. ``lines[1]`` through ``lines[num_docs]``
    inclusive (the original loop stopped one line early and dropped the last
    document).

    Raises:
        ValueError: if *lines* is empty or its first line has no count.
    """
    if not lines or not lines[0].split():
        raise ValueError("corpus file is empty or has no document count on its first line")
    num_docs = int(lines[0].split()[0])
    return num_docs, lines[1:num_docs + 1]


def document_log_likelihood(tokens, word_index, phi, theta_row):
    """Return ``sum_w log(sum_k phi[k, w] * theta_row[k])`` over *tokens*.

    Logs are summed per token instead of taking the log of one big product:
    the product of many probabilities underflows to 0.0 for long documents,
    which made ``log`` fail.

    Args:
        tokens: the words of one document, duplicates included.
        word_index: dict mapping a word to its column in *phi*.
        phi: 2-D numpy array, topics by words.
        theta_row: 1-D numpy array with this document's topic weights.

    Raises:
        ValueError: if a token is missing from *word_index*, or if a token's
            probability is zero (math domain error from ``log``).
    """
    num_topics = len(theta_row)
    total = 0.0
    for token in tokens:
        try:
            column = word_index[token]
        except KeyError:
            raise ValueError("%r is not in the word map" % (token,))
        # Only the first num_topics rows of phi take part, as in the
        # original explicit loop over the theta columns.
        total += log(phi[:num_topics, column].dot(theta_row))
    return total


def corpus_log_likelihood(doc_lines, word_index, phi, theta):
    """Return ``(sum_log, sum_words)`` accumulated over all documents.

    Document ``i`` of *doc_lines* is scored against row ``i`` of *theta*.
    Lines are split on any whitespace so trailing ``\\n`` / ``\\r\\n`` never
    ends up glued to the last word. ``sum_words`` counts tokens (not distinct
    words) so that it matches the number of factors in the likelihood.
    """
    sum_log = 0.0
    sum_words = 0
    for doc_number, line in enumerate(doc_lines):
        tokens = line.split()
        sum_words += len(tokens)
        sum_log += document_log_likelihood(tokens, word_index, phi, theta[doc_number])
    return sum_log, sum_words


def perplexity(sum_log, sum_words):
    """Return ``exp(-sum_log / sum_words)``.

    Raises:
        ValueError: if *sum_words* is zero (no words to average over).
    """
    if sum_words == 0:
        raise ValueError("cannot compute perplexity of a corpus with no words")
    return exp(-sum_log / sum_words)


def main():
    """Load the model and corpus files and print the corpus perplexity."""
    my_phi = genfromtxt(filepathin, delimiter=' ')
    print(my_phi.shape)

    word_index = load_word_index(filepathin2)

    my_theta = genfromtxt(filepathin3, delimiter=' ')
    print(my_theta.shape)

    # Read the corpus once instead of re-opening it for every document.
    with open(filepathin4) as f4:
        num_docs, doc_lines = split_corpus(f4.readlines())
    print('num docs', num_docs)

    sum_log, sum_words = corpus_log_likelihood(doc_lines, word_index, my_phi, my_theta)
    print('sum_log', sum_log)
    print('num words', sum_words)
    print(perplexity(sum_log, sum_words))


if __name__ == "__main__":
    main()