forked from pcyin/NL2code
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathnlparser.py
More file actions
89 lines (59 loc) · 2.69 KB
/
nlparser.py
File metadata and controls
89 lines (59 loc) · 2.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# This file is a wrapper around the Stanford parser.
# The code is based on the template from https://stackoverflow.com/questions/13883277/stanford-parser-and-nltk
import os
from nltk.parse import stanford
from nltk.translate.ribes_score import position_of_ngram
import nltk
# Set both Stanford locations to the current working directory in one step.
# NOTE(review): presumably nltk.parse.stanford reads these variables to find
# the parser and model jars -- confirm against the NLTK version in use.
os.environ.update({"STANFORD_PARSER": "./", "STANFORD_MODELS": "./"})
class QueryParser:
    """Wrapper around NLTK's Stanford parser that extracts phrases and POS tags.

    One instance owns one ``StanfordParser``; build the instance once and
    reuse it for every batch of sentences.
    """

    # Class-level placeholder kept for backward compatibility; __init__
    # replaces it with a real StanfordParser on every instance.
    parser = ""

    def __init__(self):
        """Create the StanfordParser used by this instance.

        The model path is relative to the current working directory.
        """
        self.parser = stanford.StanfordParser(model_path="./englishPCFG.ser.gz")

    @staticmethod
    def recurse(s, d, sentence=None, offset=None):
        """Collect the outermost non-NP/VP phrases found under ``s`` into ``d``.

        The tree is walked top-down. A node whose label ends in "P" but is
        neither "NP" nor "VP" is treated as a phrase: it is recorded (when
        ``sentence`` is given) and its children are not visited. Any other
        node is descended into.

        NOTE(review): the suffix test matches every label ending in "P",
        which would also include Penn Treebank POS tags such as NNP or PRP --
        confirm that this is intended.

        Args:
            s: The node to inspect. Anything that is not an ``nltk.Tree``
                (for example a leaf string) is ignored.
            d: Dict that receives the results; it is mutated and returned.
                Keys are ``(start, end, phrase_text)`` with inclusive token
                indices, values are the phrase label. Text and label are
                ASCII-encoded with unencodable characters dropped.
            sentence: The space-joined sentence the tree belongs to. When it
                is None, matching phrases are skipped without being recorded.
            offset: Index of the first leaf of ``s`` within the sentence.
                When given, phrase positions are derived from it, so a phrase
                that occurs several times gets its own position each time.
                When None (the default, matching the previous behaviour), the
                position is found by searching for the phrase text with
                ``position_of_ngram``, which resolves a repeated phrase to a
                single match (presumably the first one -- confirm in NLTK).

        Returns:
            The same dict ``d`` that was passed in.
        """
        if not isinstance(s, nltk.Tree):
            return d

        label = s.label()
        if label != "NP" and label != "VP" and label.endswith("P"):
            if sentence is not None:
                words = s.leaves()
                sent_part = " ".join(words)
                if offset is None:
                    # Legacy lookup by text; see the docstring for its limits.
                    start = position_of_ngram(
                        tuple(sent_part.split(" ")), sentence.split(" "))
                else:
                    start = offset
                key = (start, start + len(words) - 1,
                       sent_part.encode("ascii", "ignore"))
                d[key] = label.encode("ascii", "ignore")
            # A phrase was found: do not look for phrases nested inside it.
            return d

        # Not an interesting phrase; look at the children instead. The static
        # method is called on the class directly: instantiating QueryParser
        # here would build a new StanfordParser for every visited node.
        child_offset = offset
        for t in s:
            d = QueryParser.recurse(t, d, sentence, child_offset)
            if child_offset is not None:
                # Advance past the leaves covered by this child; a child that
                # is not a Tree is itself a single leaf.
                if isinstance(t, nltk.Tree):
                    child_offset += len(t.leaves())
                else:
                    child_offset += 1
        return d

    def deep_phrases(self, sents):
        """Parse ``sents`` and return phrases and POS tags for every parse tree.

        Args:
            sents: A list of raw sentence strings, passed unchanged to the
                parser's ``raw_parse_sents``.

        Returns:
            A list with one ``(phrases, postags)`` tuple per parse tree.
            ``phrases`` is the dict built by ``recurse``; ``postags`` is a
            list of ``(word, tag)`` pairs taken from the tree's ``pos()``,
            ASCII-encoded with unencodable characters dropped.
        """
        phrase_pos = []
        for sentence in self.parser.raw_parse_sents(sents):
            for s in sentence:
                sent_str = " ".join(s.leaves())
                # Start at leaf offset 0 so positions follow the tree layout
                # instead of a text search.
                d = QueryParser.recurse(s, {}, sent_str, 0)
                # A real list (not a lazy map object) so callers can index it
                # under both Python 2 and Python 3.
                postags = [
                    (word.encode("ascii", "ignore"),
                     tag.encode("ascii", "ignore"))
                    for word, tag in s.pos()
                ]
                phrase_pos.append((d, postags))
        return phrase_pos
if __name__ == "__main__":
    # Manual smoke test: parse a sample query and show the POS tags of the
    # first parse tree. Requires the Stanford parser files that QueryParser
    # loads from the current directory.
    qparser = QueryParser()
    s = " key wathari_l into the dict. hello world. bye world"
    # Call form of print with a single argument behaves the same under
    # Python 2 and keeps the module importable under Python 3.
    print(qparser.deep_phrases([s])[0][1])