forked from bindog/ToyMalwareClassification
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathopcode_n-gram.py
More file actions
71 lines (66 loc) · 1.99 KB
/
opcode_n-gram.py
File metadata and controls
71 lines (66 loc) · 1.99 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import re
from collections import *
import os
import pandas as pd
def getOpcodeSequence(filename):
    """Return the opcode mnemonics found on the ``.text`` lines of a file.

    Only lines that start with ``.text`` are examined. On such a line the
    first place where whitespace is followed by one or more two-hex-digit
    tokens (each followed by whitespace) and then a run of lowercase
    letters is located; that run of letters is taken as the opcode.
    The ``align`` pseudo-instruction is skipped.

    Args:
        filename: Path of the disassembly listing to read.

    Returns:
        A list of opcode strings in the order they appear in the file.
    """
    pattern = re.compile(r'\s([a-fA-F0-9]{2}\s)+\s*([a-z]+)')
    mnemonics = []
    with open(filename) as listing:
        for row in listing:
            if not row.startswith(".text"):
                continue
            hit = pattern.search(row)
            if hit is None:
                continue
            # Group 2 is the lowercase word that follows the hex byte dump.
            mnemonic = hit.group(2)
            if mnemonic != "align":
                mnemonics.append(mnemonic)
    return mnemonics
def train_opcode_lm(ops, order=4):
    """Count which opcode follows each ``order``-long opcode history.

    The sequence is left-padded with ``order`` copies of the marker ``"~"``
    so that the first real opcodes also get a full-length history.

    Args:
        ops: Iterable of opcode strings, in program order.
        order: Number of preceding tokens that make up one history.

    Returns:
        A plain ``dict`` mapping each history (a tuple of ``order`` tokens)
        to a ``Counter`` of the tokens seen immediately after it. The values
        are raw occurrence counts; no normalisation is applied.
    """
    data = ["~"] * order
    data.extend(ops)

    lm = defaultdict(Counter)
    # ``range`` keeps this loop valid on Python 3 as well as Python 2.
    for i in range(len(data) - order):
        history = tuple(data[i:i + order])
        lm[history][data[i + order]] += 1

    # Return an ordinary dict so that looking up an unseen history raises
    # KeyError instead of silently inserting an empty Counter.
    return dict(lm)
def getOpcodeNgram(ops, n=3):
    """Count every run of ``n`` consecutive opcodes in a sequence.

    Args:
        ops: Sequence (supporting ``len`` and slicing) of opcode strings.
        n: Length of each n-gram.

    Returns:
        A ``Counter`` mapping each n-gram (a tuple of ``n`` opcodes) to the
        number of times it occurs. Empty when ``ops`` has fewer than ``n``
        items.
    """
    # A sequence of length L holds L - n + 1 windows of size n; the "+ 1"
    # makes sure the final window is counted too.
    window_count = len(ops) - n + 1
    return Counter(tuple(ops[i:i + n]) for i in range(window_count))
# --- Script: build the opcode 3-gram feature table --------------------------
# For every sample Id listed in subtrainLabels.csv, count the opcode 3-grams
# of its .asm listing, keep the 3-grams whose total count over all samples is
# at least 500, and write one row per sample to 3gramfeature.csv.
# NOTE(review): basepath is a machine-specific absolute path.
basepath = "/home/moon/subtrain/"
map3gram = defaultdict(Counter)
subtrain = pd.read_csv('subtrainLabels.csv')

for count, sid in enumerate(subtrain.Id, 1):
    # A single parenthesised argument prints identically whether ``print``
    # is a statement (Python 2) or a function (Python 3).
    print("counting the 3-gram of the {0} file...".format(str(count)))
    filename = os.path.join(basepath, sid + ".asm")
    ops = getOpcodeSequence(filename)
    op3gram = getOpcodeNgram(ops)
    map3gram[sid] = op3gram

# Total occurrences of each 3-gram over all samples. ``update`` adds counts
# in place instead of building a brand-new Counter for every sample.
cc = Counter()
for d in map3gram.values():
    cc.update(d)

# Keep only the 3-grams that occur at least 500 times overall.
selectedfeatures = {}
for k, v in cc.items():
    if v >= 500:
        selectedfeatures[k] = v
        # Formats as "<3-gram tuple> <count>" on both Python 2 and 3.
        print("{0} {1}".format(k, v))
tc = len(selectedfeatures)

# One row per sample: its Id plus the count of every selected 3-gram
# (0 when the sample never contains that 3-gram).
dataframelist = []
for fid, op3gram in map3gram.items():
    standard = {"Id": fid}
    for feature in selectedfeatures:
        standard[feature] = op3gram.get(feature, 0)
    dataframelist.append(standard)

df = pd.DataFrame(dataframelist)
df.to_csv("3gramfeature.csv", index=False)