From 0dc9b3dae6635f4c7a10ae6ca49b29da96ac3e39 Mon Sep 17 00:00:00 2001 From: hannahxchen Date: Fri, 30 Nov 2018 20:23:40 +0800 Subject: [PATCH 1/5] : merge conflicts --- embedding.py | 2 +- preprocessing.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/embedding.py b/embedding.py index ca17229..a1e1a72 100755 --- a/embedding.py +++ b/embedding.py @@ -114,7 +114,7 @@ def _cal_kv(self): # 20181130 LIN, Y.D. Move from KeywordCorpusFactory def add_keyword_corpus(self, keyword, sentences): - if instance(sentences, list): + if isinstance(sentences, list): if keyword in self.kc: diff --git a/preprocessing.py b/preprocessing.py index e03aec5..34d7d3a 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -151,7 +151,7 @@ def __init__(self, keywords, case_sensitive=False, worker=3): # self.kc[keyword] = set(sentences) - def _create(self, keywords, sentences, chunksize=256): + def _create(self, keywords, sentences, chunksize=5000): sentences_chunk = [] partition_size = chunksize // self.corpus_worker From 68160a9a3cfc13b07be6799fa52361329b6d0884 Mon Sep 17 00:00:00 2001 From: hannahxchen Date: Sat, 1 Dec 2018 13:22:10 +0800 Subject: [PATCH 2/5] : merge conflicts --- embedding.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/embedding.py b/embedding.py index 3f8eb1f..861474e 100755 --- a/embedding.py +++ b/embedding.py @@ -91,24 +91,20 @@ def _cal_kv(self): # 20181130 LIN, Y.D. Move from KeywordCorpusFactory def add_keyword_corpus(self, keyword, sentences): -<<<<<<< HEAD -======= #20181130 - print(len(self.kc)) ->>>>>>> upstream/master if isinstance(sentences, list): if keyword in self.kc: for s in sentences: - self.kc[keyword].add(s) - self.sentences.extend(sentences) - self.sentences = list(set(self.sentences)) + self.kc[keyword].add(s) + self.sentences.extend(sentences) + self.sentences = list(set(self.sentences)) else: - print(self.sentences) + self.kc[keyword] = set(sentences) self.sentences.extend(sentences) self.sentences = list(set(self.sentences)) From 6ccd48da8a5143a8688a0722d226f6d62b5b135c Mon Sep 17 00:00:00 2001 From: hannahxchen Date: Sun, 2 Dec 2018 14:38:31 +0800 Subject: [PATCH 3/5] : modify decorator.py to sec2vec_decorator.py --- sec2vec_decorator.py | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 sec2vec_decorator.py diff --git a/sec2vec_decorator.py b/sec2vec_decorator.py new file mode 100644 index 0000000..dca5edf --- /dev/null +++ b/sec2vec_decorator.py @@ -0,0 +1,40 @@ +from collections import Iterator + +def assert_sentences(f): + ''' + Normalize the input sentences' data structure + ''' + + def inner(self, k=None, sentences=None, corpus_file=None, *args): + # def inner(self, k=None, sentences=None, *args): + ''' + :param k: keywords or keyword + :type k: str or list of str + :param sentences: input sentences for training. + :type sentences: str, list of str or list of listed tokens + :param corpus_file: path for corpus + :type corpus_file: str + ''' + + if isinstance(sentences, str): + return f(self, k, [sentences], corpus_file, *args) + + elif isinstance(sentences, Iterator): + return f(self, k, sentences, corpus_file, *args) + + elif isinstance(sentences, list): + + _sentences = [] + + for s in sentences: + + if isinstance(s, str): + _sentences.append(s) + else: + _sentences.append(' '.join(s)) + + return f(self, k, _sentences, corpus_file, *args) + + return f(self, k, sentences, corpus_file, *args) + + return inner \ No newline at end of file From 90fe488c4960824d6f7a25219a0269321622dfbd Mon Sep 17 00:00:00 2001 From: hannahxchen Date: Sun, 2 Dec 2018 14:42:01 +0800 Subject: [PATCH 4/5] Revert ": modify decorator.py to sec2vec_decorator.py" This reverts commit 6ccd48da8a5143a8688a0722d226f6d62b5b135c. --- sec2vec_decorator.py | 40 ---------------------------------------- 1 file changed, 40 deletions(-) delete mode 100644 sec2vec_decorator.py diff --git a/sec2vec_decorator.py b/sec2vec_decorator.py deleted file mode 100644 index dca5edf..0000000 --- a/sec2vec_decorator.py +++ /dev/null @@ -1,40 +0,0 @@ -from collections import Iterator - -def assert_sentences(f): - ''' - Normalize the input sentences' data structure - ''' - - def inner(self, k=None, sentences=None, corpus_file=None, *args): - # def inner(self, k=None, sentences=None, *args): - ''' - :param k: keywords or keyword - :type k: str or list of str - :param sentences: input sentences for training. - :type sentences: str, list of str or list of listed tokens - :param corpus_file: path for corpus - :type corpus_file: str - ''' - - if isinstance(sentences, str): - return f(self, k, [sentences], corpus_file, *args) - - elif isinstance(sentences, Iterator): - return f(self, k, sentences, corpus_file, *args) - - elif isinstance(sentences, list): - - _sentences = [] - - for s in sentences: - - if isinstance(s, str): - _sentences.append(s) - else: - _sentences.append(' '.join(s)) - - return f(self, k, _sentences, corpus_file, *args) - - return f(self, k, sentences, corpus_file, *args) - - return inner \ No newline at end of file From 918f9fb8bb8d78005674da1c58cb24f48862624e Mon Sep 17 00:00:00 2001 From: hannahxchen Date: Sun, 2 Dec 2018 14:43:58 +0800 Subject: [PATCH 5/5] : modify decorator.py to sec2vec_decorator.py --- sec2vec_decorator.py | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 sec2vec_decorator.py diff --git a/sec2vec_decorator.py b/sec2vec_decorator.py new file mode 100644 index 0000000..dca5edf --- /dev/null +++ b/sec2vec_decorator.py @@ -0,0 +1,40 @@ +from collections import Iterator + +def assert_sentences(f): + ''' + Normalize the input sentences' data structure + ''' + + def inner(self, k=None, sentences=None, corpus_file=None, *args): + # def inner(self, k=None, sentences=None, *args): + ''' + :param k: keywords or keyword + :type k: str or list of str + :param sentences: input sentences for training. + :type sentences: str, list of str or list of listed tokens + :param corpus_file: path for corpus + :type corpus_file: str + ''' + + if isinstance(sentences, str): + return f(self, k, [sentences], corpus_file, *args) + + elif isinstance(sentences, Iterator): + return f(self, k, sentences, corpus_file, *args) + + elif isinstance(sentences, list): + + _sentences = [] + + for s in sentences: + + if isinstance(s, str): + _sentences.append(s) + else: + _sentences.append(' '.join(s)) + + return f(self, k, _sentences, corpus_file, *args) + + return f(self, k, sentences, corpus_file, *args) + + return inner \ No newline at end of file