parsejson.py
#!/usr/bin/python
# Parse a JSON file of articles: tokenize each abstract, filter stopwords,
# and maintain a vocabulary file (vocab.txt).
try:
    import json
except ImportError:
    import simplejson as json  # fallback for old Pythons without stdlib json
import re
import string
import os.path
import sys
from pprint import pprint
#import nltk  # only needed by tokenize_words() below; imported lazily there
#from nltk.corpus import stopwords
#from nltk import word_tokenize, wordpunct_tokenize
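# Usage (assuming a BioASQ-style JSON file whose top level holds an
# 'articles' list, each article carrying 'pmid', 'abstractText' and
# 'meshMajor' fields, as accessed in the main program below):
#   ./parsejson.py BioASQ-SampleDataA.json-2.txt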
# def split_line(text):
#     # split the text
#     words = text.split()
#     # for each word in the line:
#     for word in words:
#         # prints each word on a line
#         # print(word)
def write_vocab(filename, hash_array):
    # append every key of hash_array to filename, one word per line
    filehandle = open(filename, "a")
    for wkey in hash_array.keys():
        filehandle.write(wkey + "\n")
    filehandle.close()
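# Example (hypothetical): write_vocab('vocab.txt', {'gene': 0, 'cell': 0})
# appends the lines 'gene' and 'cell' (in arbitrary dict order) to vocab.txt.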
def filetohash(filename):
    # read a file of one word per line into a dict mapping word -> 0
    filehandle = open(filename, "r")
    hash_array = dict()
    for line in filehandle:
        hash_array[line.strip()] = 0
    filehandle.close()
    return hash_array
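# Example: if stopwords.txt contains the two lines 'the' and 'of', then
# filetohash('stopwords.txt') returns {'the': 0, 'of': 0}.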
def stripstopwords_punctuations(t1):
    # lowercase the text, then strip punctuation and digit characters
    punctuations = '''!()-[]{}1234567890;:'"\,<>./?@#$%^&*_~'''
    text = t1.lower()
    # text = ' '.join([word for word in text.split() if word not in (nltk.corpus.stopwords.words('english'))])
    # drop tokens that appear verbatim in the punctuation string (e.g. a lone ',')
    text = ' '.join([word for word in text.split() if word not in punctuations])
    text = text.encode('ascii', 'replace')
    # Python 2 str.translate: delete every character listed in punctuations
    text = text.translate(None, punctuations)
    return text
    # earlier attempts, kept for reference:
    # from nltk.tokenize import RegexpTokenizer
    # tokenizer = RegexpTokenizer(r'\w')
    # t2 = tokenizer.tokenize(text)
    # exclude = string.punctuation
    # s = ''.join([ch for ch in text if ch not in exclude])
    # s = split_line(s)
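# Example: stripstopwords_punctuations("Genes, e.g. BRCA1!") returns
# 'genes eg brca' (commas, periods, digits and '!' are all deleted).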
def tokenize_words(text):
    # tokenize text with nltk, count token frequencies, and append any
    # token not already in the global vocabulary hash to vocab.txt
    import nltk  # imported here so the rest of the script runs without nltk
    s = nltk.word_tokenize(text)
    d = {}
    for token in s:
        if token in d:
            d[token] = d[token] + 1
        else:
            d[token] = 1
        if token not in hash_arr:
            hash_arr[token] = 0  # remember it, so duplicates are never re-written
            filehandle_wc = open("vocab.txt", "a")
            filehandle_wc.write(token + "\n")
            filehandle_wc.close()
    return d
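# Example (assuming nltk's default tokenizer): tokenize_words("the cat and the hat")
# returns {'the': 2, 'cat': 1, 'and': 1, 'hat': 1} and appends any tokens not
# already in hash_arr to vocab.txt.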
# Main Program
#file = '/Users/Deb/Downloads/BioASQ-SampleDataA.json-2.txt'
file = sys.argv[1]
try:
    data_file = open(file, "r")
    data = json.load(data_file)
    data_file.close()
except IOError:
    print "Error: file not found"
    sys.exit(1)  # 'data' is undefined past this point, so bail out
nObjects = len(data['articles'])
print nObjects
# Load the existing vocabulary, if any
if os.path.exists('vocab.txt'):
    hash_arr = filetohash('vocab.txt')
else:
    hash_arr = {}
# Load the stopword list (one word per line)
stopwords = filetohash('stopwords.txt')
object_hash_tokenized = []
for fileno in range(nObjects):
    # pprint(data['articles'][0]['pmid'])
    # pprint(data['articles'][0]['abstractText'])
    # pprint(data['articles'][0]['meshMajor'])
    t1 = data['articles'][fileno]['abstractText']
    print fileno
    tokenized_hash = dict()
    # split the abstract on runs of non-word characters
    tokenized = re.split('\W+', t1.lower())
    # filter stopwords with a list comprehension; calling list.remove()
    # while iterating over the same list would skip elements
    tokenized = [token for token in tokenized if token not in stopwords]
    for token in tokenized:
        tokenized_hash[token] = 0
    print tokenized_hash
    #object_hash_tokenized.extend(tokenize_words(t1))
    # print d
    # write_vocab('vocab.txt', t1_hash_arr)
    # hash_arr = filetohash('vocab.txt')
#pprint(object_hash_tokenized)