-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathCCNode.py
More file actions
218 lines (174 loc) · 6.1 KB
/
CCNode.py
File metadata and controls
218 lines (174 loc) · 6.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
from utils_gen import *
from collections import OrderedDict
from utiils_span import *
class CCNode:
    """
    CCNode is similar to Openie6.metric.Coordination

    This class defines the nodes (ccnodes) of a CCTree. Think of a CCNode by
    its __str__. For example, a __str__ for a CCNode might be (2, 5)6(7,
    23). (2, 5) is its left span `span_pair[0]`, 6 is its `ccloc` (cc
    location) and (7, 23) is its right span `span_pair[1]`. The CCNode's
    ccloc is always located between but outside the range of its left and
    right spans.

    A span is a tuple (i,j), where i is position of first token/word and j-1
    is the position (i.e., location, loc) of last token/word. Hence,
    span(5, 8) covers range(5, 8) = (5, 6, 7).

    self.span_pair is a list of 2 spans.

    e.g. osent = "He ate apples and oranges ."
    self.ccloc = 3
    self.osent_words = ["He", "ate", "apples", "and", "oranges", "."]
    self.span_pair=[(0,3), (4,5)]

    Note that the spans in self.span_pair exclude self.ccloc

    span is similar to Openie6.conjunct

    loc= location of a word relative to self.osent_words

    SentenceAx uses NLTK both to tokenize sentences into words (see
    utils_gen.get_words()), and to find the POS of each token/word. A
    token/word may be a punctuation mark. Openie6 mixes NLTK and Spacy (bad!)

    Instances compare equal (see __eq__) when their ccloc and both spans
    agree. Because the class defines __eq__ and its attributes are plain
    mutable attributes, instances are deliberately unhashable.

    Attributes
    ----------
    ccloc: int
        location of cc (coordinating conjunction) (see FANBOYS). 1-1 map
        between cclocs and CCNodes
    depth: int
        0-based position in CCTree. depth is just a label for distinguishing
        between CCNodes. Not used for anything.
    osent_words: list[str]
        list of words in osent (original sentence)
    span_pair: list[tuple[int, int]]
        a list of 2 spans, left and right, of self. spans exclude location
        `ccloc`
    spanned_locs: list[int]
        locs that are within a span. Computed once, in the constructor,
        by get_spanned_locs().
    """

    # Defining __eq__ without __hash__ already makes instances unhashable;
    # state it explicitly so that the intent is visible to readers.
    __hash__ = None

    def __init__(self,
                 ccloc,
                 depth,
                 osent_words,
                 span_pair):
        """
        Constructor

        Parameters
        ----------
        ccloc: int
        depth: int
        osent_words: list[str]
        span_pair: list[tuple[int,int]]
        """
        self.ccloc = ccloc
        self.depth = depth
        self.osent_words = osent_words
        self.span_pair = span_pair
        # snapshot of the spanned locs at construction time
        self.spanned_locs = self.get_spanned_locs()

    def check_self(self):
        """
        This method checks that the left and right spans don't overlap and
        that self.ccloc is between the 2 spans but not in their ranges.

        Raises
        ------
        AssertionError
            if any of the checks fails

        Returns
        -------
        None
        """
        # each span must be non-empty, and must start at or after the end
        # of the span that precedes it
        last_b = -1
        for a, b in self.span_pair:
            assert a < b, f"span ({a}, {b}) is empty or inverted"
            assert last_b <= a, \
                f"span ({a}, {b}) starts before previous span end {last_b}"
            last_b = b

        # ccloc must not be one of the spanned locs
        assert self.ccloc not in self.get_spanned_locs(), \
            f"ccloc={self.ccloc} is inside span_pair={self.span_pair}"

        # ccloc must lie between the first loc of the left span and the
        # last loc of the right span
        min0 = self.span_pair[0][0]
        max0 = self.span_pair[1][1] - 1
        assert min0 <= self.ccloc <= max0, \
            f"min0={min0}, ccloc={self.ccloc}, max0={max0}"

    def is_parent(self, child):
        """
        similar to Openie6.data.is_parent()

        Returns True iff self is a parent of ccnode `child`.

        A CCNode `child` is a child of a CCNode `parent` if the child's left
        span, ccloc and right span are all fully contained within a single
        span (either the left or the right one) of the parent ccnode.

        Parameters
        ----------
        child: CCNode

        Returns
        -------
        bool
        """
        # first and last locs covered by the child (both inclusive)
        ch_min = child.span_pair[0][0]
        ch_max = child.span_pair[1][1] - 1
        # self is parent iff at least one span in self.span_pair contains
        # the whole extent of the child
        return any(span[0] <= ch_min and ch_max <= span[1] - 1
                   for span in self.span_pair)

    def is_child(self, parent):
        """
        Returns True iff self is a child of ccnode `parent`.

        Parameters
        ----------
        parent: CCNode

        Returns
        -------
        bool
        """
        return parent.is_parent(self)

    def get_spanned_locs(self):
        """
        This method returns all locations in the ranges of the left and
        right spans. Locations that are >= len(self.osent_words) are
        dropped.

        Returns
        -------
        list[int]
            sorted in increasing order
        """
        num_words = len(self.osent_words)
        spanned_locs = []
        for span in self.span_pair:
            spanned_locs.extend(i for i in range(*span) if i < num_words)
        return sorted(spanned_locs)

    def get_spanned_unbreakable_loc_to_word(self):
        """
        similar to Openie6.data.remove_unbreakable_conjuncts()

        This method returns a dictionary mapping the location of each
        spanned unbreakable word to the word. A word is unbreakable if its
        lower-cased form is in SAX_UNBREAKABLE_WORDS. The spanned locations
        are taken from self.spanned_locs.

        Used in CCTree.remove_bad_ccnodes()

        Returns
        -------
        OrderedDict[int, str]
            loc -> word, in order of increasing loc
        """
        # build the set once so that each membership test is O(1)
        spanned = set(self.spanned_locs)
        spanned_unbreakable_loc_to_word = OrderedDict()
        for i, word in enumerate(self.osent_words):
            if word.lower() in SAX_UNBREAKABLE_WORDS and i in spanned:
                spanned_unbreakable_loc_to_word[i] = word
        return spanned_unbreakable_loc_to_word

    def __eq__(self, node):
        """
        This method defines equality of 2 CCNode instances. Two CCNodes are
        equal iff they have the same ccloc, the same left span and the same
        right span. depth and osent_words are not compared.

        Parameters
        ----------
        node: CCNode

        Returns
        -------
        bool
            NotImplemented is returned if `node` is not a CCNode, so that
            e.g. `ccnode == None` evaluates to False instead of raising
            an AttributeError.
        """
        if not isinstance(node, CCNode):
            return NotImplemented
        return self.ccloc == node.ccloc and \
            self.span_pair[0] == node.span_pair[0] and \
            self.span_pair[1] == node.span_pair[1]

    def __str__(self):
        """
        Returns a string containing self.span_pair and self.ccloc, in the
        form <left span><ccloc><right span>, e.g. "(0, 3)3(4, 5)".

        Returns
        -------
        str
        """
        return str(self.span_pair[0]) + str(self.ccloc) + str(
            self.span_pair[1])