Log2Vec/log2vec.py at master · NetManAIOps/Log2Vec · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import argparse
import os
import gensim
import numpy as np

def cos(vector1, vector2):
    '''
    Return the cosine similarity between two numpy vectors.

    vector1, vector2: numpy arrays of the same shape (they are multiplied
        element-wise, so plain Python lists are not supported).
    return: sum(vector1 * vector2) / (||vector1|| * ||vector2||), or 0.0
        when either vector has zero norm.
    '''
    norm_product = np.linalg.norm(vector1) * np.linalg.norm(vector2)
    # A zero-norm vector has no direction; dividing would yield nan (and a
    # RuntimeWarning) that silently contaminates any later averaging.
    if norm_product == 0:
        return 0.0
    return float(np.sum(vector1 * vector2)) / norm_product

def load_model(filename, is_binary=False):
    '''
    Load vectors stored in word2vec format through gensim.

    filename: path of the word2vec-format file to read.
    is_binary: forwarded to gensim as ``binary``; False means the text format.
    return: the object returned by KeyedVectors.load_word2vec_format.
    '''
    return gensim.models.KeyedVectors.load_word2vec_format(
        filename,
        binary=is_binary,
    )

def getLogVector(para):
    '''
    Build one sentence vector per log template by averaging the word vectors
    of its words, and save the vectors to a file in word2vec text format.
    Each vector is keyed by its class number, which starts at 1 and follows
    the line order of the template file.

    para: dict with the keys
        'template_file'        - path of the file holding one template per line
        'word_model'           - path of the word2vec-format word vectors
        'dimension'            - length of the vectors to build
        'template_vector_file' - path of the output file
        'oov_vector'           - path of word2vec-format vectors for
                                 out-of-vocabulary words, or a falsy value
                                 when there is none
    return: (template_to_index, index_to_template, template_to_vector)
        If the same template text occurs on several lines, the two dicts keyed
        by template keep the last occurrence.
    raises: Exception when an OOV model is configured and a word is found in
        neither the word model nor the OOV model.
    '''
    template_file = para['template_file']
    model = load_model(para['word_model'])
    dimension = para['dimension']
    template_vector_file = para['template_vector_file']
    oov_vector = load_model(para['oov_vector']) if para['oov_vector'] else None

    template_to_index = {}
    index_to_template = {}
    template_to_vector = {}

    # First pass: the word2vec header needs the number of entries up front.
    with open(template_file) as IN:
        template_num = sum(1 for _ in IN)

    # The context managers guarantee both files are closed even when the
    # "not in w2v and oov" exception below aborts the run.
    with open(template_vector_file, 'w') as f, open(template_file) as IN:
        # word2vec model format: the first line is "<entry count> <dimension>"
        f.write(str(template_num) + ' ' + str(dimension) + '\n')
        for index, line in enumerate(IN, 1):
            template = line.strip()
            words = template.split()
            log_length = len(words)
            cur_vector = np.zeros(dimension)
            for word in words:
                if word in model:
                    cur_vector += model[word]
                elif oov_vector is not None:
                    # Test for presence explicitly: a loaded model object may
                    # define its own truthiness (e.g. via its length).
                    if word in oov_vector:
                        cur_vector += oov_vector[word]
                    else:
                        raise Exception(word + " not in w2v and oov")
                else:
                    # No OOV model: unknown words are left out of the average.
                    log_length -= 1
            # A blank template, or one whose words are all unknown, has nothing
            # to average; keep the zero vector instead of dividing by zero and
            # writing nan values into the output.
            if log_length > 0:
                cur_vector /= log_length
            template_to_vector[template] = cur_vector
            template_to_index[template] = str(index)
            index_to_template[index] = template
            f.write(str(index) + ''.join(' ' + str(v) for v in cur_vector) + '\n')
    return (template_to_index, index_to_template, template_to_vector)

def _vocab_size(keyed_vectors):
    '''Return the number of entries of a gensim KeyedVectors (3.x or 4.x API).'''
    # gensim 4 replaced the `vocab` dict with `key_to_index`; reading `vocab`
    # there raises AttributeError, so probe for the newer attribute first.
    key_to_index = getattr(keyed_vectors, 'key_to_index', None)
    if key_to_index is not None:
        return len(key_to_index)
    return len(keyed_vectors.vocab)


def evaluate(output_original_path, output_withOut_path, output_oov_path):
    '''
    Compare the three template-vector files entry by entry.

    For every key "1".."N" (N = number of entries in the original file) one
    line is written to 'log_similarity.txt':
        <key> <cos(original, withOut)> <cos(original, oov)>
    The result file is created in the directory that holds
    output_original_path.

    output_original_path: word2vec text file with the reference vectors.
    output_withOut_path: word2vec text file compared in the first column.
    output_oov_path: word2vec text file compared in the second column.
    return: path of the written similarity file.
    '''
    load = gensim.models.KeyedVectors.load_word2vec_format
    original_log2vec = load(output_original_path, binary=False)
    withOut_log2vec = load(output_withOut_path, binary=False)
    oov_log2vec = load(output_oov_path, binary=False)

    # Derive the directory from an argument rather than from a module-level
    # global, so the function also works when this module is imported.
    output_file = os.path.join(os.path.dirname(output_original_path), 'log_similarity.txt')
    with open(output_file, 'w') as ofile:
        for index in range(1, _vocab_size(original_log2vec) + 1):
            key = str(index)
            original_vec = original_log2vec[key]
            withOut_vec = withOut_log2vec[key]
            oov_vec = oov_log2vec[key]
            ofile.write(key + ' ')
            ofile.write(str(cos(original_vec, withOut_vec)) + ' ')
            ofile.write(str(cos(original_vec, oov_vec)) + '\n')
    return output_file

def statistics(similarity_result, output_path):
    '''
    Average the similarity columns of a similarity file and write the scores.

    similarity_result: path of a text file whose lines are whitespace
        separated; the last field is the OOV similarity and the field before
        it is the "without" similarity. Blank lines are ignored.
    output_path: path of the score file to write. It receives two lines:
        'oov score: <mean>' and 'without score: <mean>'.
    raises: ValueError when the file has no data lines (checked before the
        score file is opened, so an existing score file is not truncated) or
        when a field is not a number.
    '''
    count = 0
    simi_oov_sum = 0
    simi_without_sum = 0
    with open(similarity_result, 'r') as ifile:
        for line in ifile:
            fields = line.split()
            if not fields:
                # A blank or trailing empty line carries no scores.
                continue
            simi_oov_sum += float(fields[-1])
            simi_without_sum += float(fields[-2])
            count += 1

    if count == 0:
        raise ValueError('no similarity rows found in ' + str(similarity_result))

    with open(output_path, 'w') as ofile:
        ofile.write('oov score: ' + str(simi_oov_sum / count) + '\n')
        ofile.write('without score: ' + str(simi_without_sum / count) + '\n')

if __name__ == '__main__':
    # Command-line driver: build three sets of template vectors below
    # <input_directory>/<log type>/ and score how close the two "changed log"
    # variants stay to the original one.
    parser = argparse.ArgumentParser()
    # Both options feed os.path calls directly, so a missing one would only
    # surface as an obscure TypeError; let argparse report it instead.
    parser.add_argument('-i', required=True, help='input_directory')
    parser.add_argument('-t', required=True, help='log type')
    args = parser.parse_args()
    log_type = args.t
    iopath = os.path.abspath(args.i)
    iopath = os.path.join(iopath, log_type)

    # Input files expected inside the working directory.
    word_model_path = os.path.join(iopath, 'embedding.model')
    oov_vector_path = os.path.join(iopath, 'oov.vector')
    processed_log = os.path.join(iopath, 'without_variables.log')
    generate_file_path = os.path.join(iopath, 'changed_log')
    changed_log = os.path.join(generate_file_path, "without_variables.log")

    # Output files.
    output_path = os.path.join(iopath, 'log2vec')
    if not os.path.exists(output_path):
        os.mkdir(output_path)
    output_original_path = os.path.join(output_path, 'original_log.vector')
    output_withOut_path = os.path.join(output_path, 'removeWord_log.vector')
    output_oov_path = os.path.join(output_path, 'oov_log.vector')

    # Vector length shared by all three runs.
    # NOTE(review): presumably this must equal the dimensionality of
    # embedding.model and oov.vector - confirm.
    dimension = 32

    # Original templates, no OOV vectors.
    para_original_log = {
        'template_file': processed_log,
        'word_model': word_model_path,
        'dimension': dimension,
        'template_vector_file': output_original_path,
        'oov_vector': None,
    }

    # Changed templates, unknown words dropped from the average.
    para_withOut_log = {
        'template_file': changed_log,
        'word_model': word_model_path,
        'dimension': dimension,
        'template_vector_file': output_withOut_path,
        'oov_vector': None,
    }

    # Changed templates, unknown words looked up in the OOV vectors.
    para_oov_log = {
        'template_file': changed_log,
        'word_model': word_model_path,
        'dimension': dimension,
        'template_vector_file': output_oov_path,
        'oov_vector': oov_vector_path,
    }

    getLogVector(para_original_log)
    getLogVector(para_withOut_log)
    getLogVector(para_oov_log)
    similarity_result = evaluate(output_original_path, output_withOut_path, output_oov_path)
    statistics(similarity_result, os.path.join(output_path, 'score.txt'))