import functools
# `collections.Iterator` was deprecated in 3.3 and removed in Python 3.10;
# the abstract base classes live in `collections.abc`.
from collections.abc import Iterator


def assert_sentences(f):
    '''
    Decorator that normalizes the `sentences` argument before calling *f*.

    Whatever form the caller passes (a single string, a lazy iterator, or a
    list mixing plain strings and pre-tokenized token lists), *f* always
    receives one of: None (or any other unrecognized value, forwarded as-is),
    an iterator (passed through untouched), or a flat list of strings.
    '''

    @functools.wraps(f)  # preserve the wrapped function's name and docstring
    def inner(self, k=None, sentences=None, corpus_file=None, *args):
        '''
        :param k: keyword or list of keywords
        :type k: str or list of str
        :param sentences: input sentences for training
        :type sentences: str, list of str, or list of token lists
        :param corpus_file: path to a corpus file
        :type corpus_file: str
        '''
        # A single string becomes a one-element list.
        if isinstance(sentences, str):
            return f(self, k, [sentences], corpus_file, *args)

        # Lazy iterators are forwarded untouched — consuming them here
        # would exhaust them before *f* sees the data.
        if isinstance(sentences, Iterator):
            return f(self, k, sentences, corpus_file, *args)

        # A list may mix strings and pre-tokenized sentences; join the
        # token lists so *f* receives a flat list of strings.
        if isinstance(sentences, list):
            normalized = [
                s if isinstance(s, str) else ' '.join(s)
                for s in sentences
            ]
            return f(self, k, normalized, corpus_file, *args)

        # Anything else (including None) is passed through as-is.
        return f(self, k, sentences, corpus_file, *args)

    return inner