import binascii

import numpy


def getMatrixfrom_bin(filename, width):
    """Read a binary file and return its bytes as a 2-D uint8 matrix.

    The file content is treated as a flat sequence of byte values (0-255)
    and laid out row by row, ``width`` bytes per row.  Trailing bytes that
    do not fill a complete row are discarded.

    Args:
        filename: Path of the binary file to read.
        width: Number of bytes (columns) per matrix row.

    Returns:
        A writable ``numpy.uint8`` array of shape
        ``(len(content) // width, width)``; it has zero rows when the file
        holds fewer than ``width`` bytes.

    Raises:
        ZeroDivisionError: If ``width`` is 0.
    """
    with open(filename, 'rb') as f:
        content = f.read()
    # Floor division keeps the row count an integer on Python 3 as well;
    # plain "/" yields a float there, which cannot be used as a slice bound.
    rows = len(content) // width
    if rows == 0:
        return numpy.empty((0, width), dtype=numpy.uint8)
    # Interpret the raw bytes directly instead of going through a hex string.
    pixels = numpy.frombuffer(content, dtype=numpy.uint8, count=rows * width)
    # frombuffer gives a read-only view of ``content``; copy so the caller
    # receives an independent, writable array.
    return pixels.reshape((rows, width)).copy()


def main():
    """Convert the configured binary file into a grayscale PNG image."""
    # Imported here so that getMatrixfrom_bin can be used without Pillow.
    from PIL import Image

    filename = "your_bin_filename"
    im = Image.fromarray(getMatrixfrom_bin(filename, 512))  # convert to an image
    im.save("your_img_filename.png")


if __name__ == "__main__":
    main()
import re
from collections import Counter

# A whitespace character, one or more two-digit hex byte columns (each
# followed by whitespace), then a lowercase mnemonic captured as group 2.
_OPCODE_PATTERN = re.compile(r'\s([a-fA-F0-9]{2}\s)+\s*([a-z]+)')


def getOpcodeSequence(filename):
    """Extract the opcode sequence from a disassembly (.asm) file.

    Only lines starting with ``.text`` are considered.  On each such line
    the first run of hex byte columns followed by a lowercase word is
    located and that word is taken as the opcode.  ``align`` entries are
    skipped.

    Args:
        filename: Path of the .asm file to read.

    Returns:
        A list of opcode strings in file order.
    """
    opcode_seq = []
    with open(filename) as f:
        for line in f:
            if not line.startswith(".text"):
                continue
            match = _OPCODE_PATTERN.search(line)
            if match is None:
                continue
            # Group 1 is the last hex byte column, group 2 the mnemonic.
            opc = match.group(2)
            if opc != "align":
                opcode_seq.append(opc)
    return opcode_seq


def getOpcodeNgram(ops, n=3):
    """Count the n-grams of an opcode sequence.

    Args:
        ops: Sequence of opcode strings.
        n: Length of each n-gram (default 3).

    Returns:
        A ``Counter`` mapping each n-gram (a tuple of ``n`` opcodes) to the
        number of times it occurs.  It is empty when ``ops`` has fewer than
        ``n`` items.
    """
    # "+ 1" so that the window ending on the last opcode is included.
    return Counter(tuple(ops[i:i + n]) for i in range(len(ops) - n + 1))


def main():
    """Print the 3-gram counts of one sample training file."""
    asm_path = "train/0A32eTdBKayjCWhZqDOQ.asm"
    ops = getOpcodeSequence(asm_path)
    opngram = getOpcodeNgram(ops)
    print(opngram)
    # Example output:
    # Counter({('mov', 'mov', 'mov'): 164, ('xor', 'test', 'setnle'): 155...


if __name__ == "__main__":
    main()
# Build a small training subset: sample rows per class from trainLabels.csv,
# write their labels to subtrainLabels.csv and copy the matching .asm/.bytes
# files into a separate directory.
import os
import shutil
from random import Random

import pandas as pd

rs = Random()

# Load the training-set labels provided by Microsoft.
trainlabels = pd.read_csv('trainLabels.csv')

fids = []
sampled_frames = []
for clabel in range(1, 10):
    # Select the rows belonging to one class.
    mids = trainlabels[trainlabels.Class == clabel]
    mids = mids.reset_index(drop=True)
    # Draw 100 row positions at random within this class.
    # NOTE: positions are drawn independently (with replacement), so the
    # same sample can be selected more than once.
    rchoice = [rs.randint(0, len(mids) - 1) for _ in range(100)]
    fids.extend(mids.loc[rchoice, 'Id'].tolist())
    sampled_frames.append(mids.loc[rchoice])

# Concatenate once at the end; DataFrame.append is gone in pandas 2.0.
opd = pd.concat(sampled_frames).reset_index(drop=True)

# Write the labels of the training subset.
opd.to_csv('subtrainLabels.csv', encoding='utf-8', index=False)

# Copy the files of the training subset (adjust these paths as needed).
sbase = 'yourpath/train/'
tbase = 'yourpath/subtrain/'
if not os.path.isdir(tbase):
    os.makedirs(tbase)
# Each id may occur several times in fids; copying it once is enough.
for fid in set(fids):
    for fname in ('{0}.asm'.format(fid), '{0}.bytes'.format(fid)):
        cspath = os.path.join(sbase, fname)
        ctpath = os.path.join(tbase, fname)
        shutil.copy(cspath, ctpath)
# Train a random forest on the 3-gram opcode features of the training subset
# and print its accuracy on a held-out part of that subset.
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.metrics import confusion_matrix
# sklearn.cross_validation was removed from scikit-learn; train_test_split
# now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
import pandas as pd

subtrainLabel = pd.read_csv('subtrainLabels.csv')
subtrainfeature = pd.read_csv("3gramfeature.csv")
# Attach the class label to each feature row via the shared 'Id' column.
subtrain = pd.merge(subtrainLabel, subtrainfeature, on='Id')
labels = subtrain.Class
subtrain.drop(["Class", "Id"], axis=1, inplace=True)
# .values replaces DataFrame.as_matrix(), which newer pandas no longer has.
subtrain = subtrain.values

# Split the training subset into a training set and a test set (40% test).
X_train, X_test, y_train, y_test = train_test_split(
    subtrain, labels, test_size=0.4)

# Build a random forest of 500 decision trees, using all CPU cores.
srf = RF(n_estimators=500, n_jobs=-1)
srf.fit(X_train, y_train)  # train
print(srf.score(X_test, y_test))  # evaluate: mean accuracy on the test set
dengxiangxi 发表于 2017-11-30 15:39
真不错,说的太好了
欢迎光临 机器人与人工智能爱好者论坛 (http://robot-ai.org/) | Powered by Discuz! X3.2 |