# count_classifier.py
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

n_classes = 6  # number of emotion classes (see build_fuzzy_lexicon for the index layout)


def l1_normalize(vector):
    """Scale a vector so that its entries sum to 1; leave an all-zeros vector unchanged."""
    norm = np.sum(vector)
    if norm == 0:
        return vector  # zeros vector
    return vector / norm
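
# For example (illustrative values):
#   l1_normalize(np.array([2.0, 0.0, 2.0])) -> array([0.5, 0. , 0.5])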


def read_emo_lemma(aline):
    """
    Splits a line into lemma l, emotion e, and l(e).

    l(e) := 1 if lemma l has emotion e according to the lexicon
    l(e) := 0 otherwise
    """
    split = aline.split()
    return split[0], split[1], int(split[2])
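
# For example, a lexicon row of the form "abandon fear 1" (whitespace-separated,
# as in NRC EmoLex-style files; the exact layout is an assumption here) yields:
#   read_emo_lemma("abandon fear 1") -> ('abandon', 'fear', 1)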


def build_fuzzy_lexicon(lexicon_path):
    """
    Based on the emotion lexicon, create a mapping from an emotion word to
    its label probability distribution.
    """
    lexicon = dict()
    with open(lexicon_path, 'r') as f:
        emo_idx = 0  # anger: 0, disgust: 1, fear: 2, joy: 3, sadness: 4, surprise: 5
        for line in f:
            lemma, emotion, has_emotion = read_emo_lemma(line)
            if emotion == 'anger':  # first row for a lemma, i.e. lemma not yet in lexicon
                lexicon[lemma] = np.zeros(shape=(n_classes,))
            if emotion in ('anticipation', 'trust', 'positive', 'negative'):
                # Skip the two sentiment rows and the two emotions outside the
                # six target classes; otherwise emo_idx would wrap around and
                # overwrite earlier entries.
                continue
            lexicon[lemma][emo_idx] = has_emotion
            if emo_idx < n_classes - 1:
                emo_idx += 1
            else:
                # normalize: emotion-label probabilities for a lemma should sum up to 1
                lexicon[lemma] = l1_normalize(lexicon[lemma])
                # reset index - the next line starts a new lemma
                emo_idx = 0
    return lexicon
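
# Illustrative only: assuming NRC EmoLex-style input rows, a word marked for
# fear and sadness alone would map to a uniform split over those two classes:
#   lex = build_fuzzy_lexicon('emolex.txt')   # hypothetical path
#   lex['someword'] -> array([0. , 0. , 0.5, 0. , 0.5, 0. ])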


def classify(corpus_path, lexicon_path):
    """
    Return one label probability distribution per text in the corpus.
    """
    # Create mapping: emotion word -> label probability distribution
    prob_lexicon = build_fuzzy_lexicon(lexicon_path)

    print('Read and tokenize corpus.')
    texts = []
    with open(corpus_path, 'r') as f:
        for line in f:
            # Drop the fixed-width ID prefix (first 20 characters) and keep
            # only the text before the '\t:: ' separator.
            line_split = line[20:].split(sep='\t:: ')
            texts.append(line_split[0].strip())
    print('Found %s texts.' % len(texts))

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)  # one sequence of token ids per text input
    max_seq_len = np.max([len(s) for s in sequences])
    sequences = pad_sequences(sequences, max_seq_len, padding='post')

    # Dictionary mapping an index to the word it represents in the corpus
    # (invert the word -> index mapping, which is bijective).
    index_to_word = {i: w for w, i in tokenizer.word_index.items()}

    # Label probability distributions for the sequences, shape=(len(sequences), n_classes)
    fuzzy_labels = []
    print('Label the texts.')
    for seq in sequences:
        seq_labels = np.zeros(shape=(max_seq_len, n_classes))
        j = 0  # index of token in a sequence (different from token_id)
        for token_id in seq:
            if token_id == 0:  # we reached the padding zeros
                break
            token = index_to_word[token_id]
            if token in prob_lexicon:
                seq_labels[j] += prob_lexicon[token]
            j += 1
        # Sum the per-token distributions and renormalize to get the text label.
        labels = l1_normalize(np.sum(seq_labels, 0))
        fuzzy_labels.append(labels)
    return fuzzy_labels
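

# Minimal usage sketch; the paths below are placeholders, not part of the
# original module.
if __name__ == '__main__':
    fuzzy = classify('corpus.txt', 'emotion_lexicon.txt')  # hypothetical paths
    for dist in fuzzy[:5]:
        print(dist)  # one n_classes-dim probability distribution per text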