Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ def add_keyword_corpus(self, keyword, sentences):
self.sentences = list(set(self.sentences))

else:

self.kc[keyword] = set(sentences)
self.sentences.extend(sentences)
self.sentences = list(set(self.sentences))
Expand Down
2 changes: 1 addition & 1 deletion preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ def __init__(self, keywords, case_sensitive=False, worker=3):
# self.kc[keyword] = set(sentences)


def _create(self, keywords, sentences, chunksize=256):
def _create(self, keywords, sentences, chunksize=5000):

sentences_chunk = []
partition_size = chunksize // self.corpus_worker
Expand Down
40 changes: 40 additions & 0 deletions sec2vec_decorator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from functools import wraps

try:
    from collections.abc import Iterator
except ImportError:  # interpreters that predate collections.abc
    from collections import Iterator

def assert_sentences(f):
    """Decorate a method so its ``sentences`` argument has a uniform shape.

    The wrapped method is called as
    ``f(self, k, sentences, corpus_file, *args, **kwargs)`` where
    ``sentences`` has been normalized as follows:

    * a single ``str`` is wrapped into a one-element list;
    * an ``Iterator`` is forwarded untouched (it is not consumed here);
    * a ``list`` is rebuilt so that every element is a ``str``; elements
      that are not already ``str`` are joined with single spaces;
    * anything else (including ``None``) is forwarded unchanged.

    :param f: the method to wrap.
    :return: the wrapping function, carrying ``f``'s name and docstring.
    """

    @wraps(f)
    def inner(self, k=None, sentences=None, corpus_file=None, *args, **kwargs):
        # k:           keyword or list of keywords (str or list of str)
        # sentences:   str, list of str, or list of token sequences
        # corpus_file: path of a corpus file (str)

        if isinstance(sentences, str):
            # A lone sentence becomes a one-element list.
            normalized = [sentences]

        elif isinstance(sentences, Iterator):
            # Leave lazy streams alone so they are not exhausted here.
            normalized = sentences

        elif isinstance(sentences, list):
            # Tokenized sentences are flattened back into plain strings.
            normalized = [
                s if isinstance(s, str) else ' '.join(s)
                for s in sentences
            ]

        else:
            normalized = sentences

        return f(self, k, normalized, corpus_file, *args, **kwargs)

    return inner