阿里云安全恶意程序检测-排名295
发布日期:2021-06-29 19:49:15
浏览次数:3
分类:技术文章
本文共 9084 字,大约阅读时间需要 30 分钟。
赛题说明
本题目提供的数据来自文件(Windows 可执行程序)经过沙箱程序模拟运行后的 API 指令序列,全为 Windows 二进制可执行程序,经过脱敏处理。本题目提供的样本数据均来自互联网。其中恶意文件的类型有感染型病毒、木马程序、挖矿程序、DDOS 木马、勒索病毒等,数据总计 6 亿条。具体请移步:
数据说明
简单思路:
数据量过大,通过改变数据类型减少内存使用;采用交叉验证;lgb 效果还不错。具体代码:
"""Alibaba Cloud malware-detection competition baseline (rank 295).

Reads sandboxed API-call-sequence data (``security_train.csv`` /
``security_test.csv``), builds simple per-file aggregate features,
trains a 5-fold LightGBM multiclass model and writes ``baseline.csv``.
"""
import os
import warnings

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import KFold
from tqdm import tqdm_notebook

warnings.filterwarnings('ignore')


class _Data_Preprocess:
    """Downcast numeric DataFrame columns to the narrowest safe dtype.

    The raw competition data is huge (~600M rows), so shrinking column
    dtypes is the first memory-saving step.
    """

    def __init__(self):
        # Cache the representable range of every candidate dtype once.
        self.int8_max = np.iinfo(np.int8).max
        self.int8_min = np.iinfo(np.int8).min
        self.int16_max = np.iinfo(np.int16).max
        self.int16_min = np.iinfo(np.int16).min
        self.int32_max = np.iinfo(np.int32).max
        self.int32_min = np.iinfo(np.int32).min
        self.int64_max = np.iinfo(np.int64).max
        self.int64_min = np.iinfo(np.int64).min
        self.float16_max = np.finfo(np.float16).max
        self.float16_min = np.finfo(np.float16).min
        self.float32_max = np.finfo(np.float32).max
        self.float32_min = np.finfo(np.float32).min
        self.float64_max = np.finfo(np.float64).max
        self.float64_min = np.finfo(np.float64).min

    def _get_type(self, min_val, max_val, types):
        """Return the narrowest numpy dtype that covers [min_val, max_val].

        ``types`` is ``'int'`` or ``'float'``; returns ``None`` when no
        (further) downcast is possible.
        """
        if types == 'int':
            if max_val <= self.int8_max and min_val >= self.int8_min:
                return np.int8
            # BUG FIX: the original wrote ``max_val <= self.int16_max <= max_val``
            # which, as a Python chained comparison, only matched when
            # max_val equalled the int16 maximum exactly, so the int16
            # downcast almost never happened.
            elif max_val <= self.int16_max and min_val >= self.int16_min:
                return np.int16
            elif max_val <= self.int32_max and min_val >= self.int32_min:
                return np.int32
            return None
        elif types == 'float':
            if max_val <= self.float16_max and min_val >= self.float16_min:
                return np.float16
            if max_val <= self.float32_max and min_val >= self.float32_min:
                return np.float32
            if max_val <= self.float64_max and min_val >= self.float64_min:
                return np.float64
            return None

    def _memory_process(self, df):
        """Downcast every numeric column of ``df`` in place; return ``df``."""
        init_memory = df.memory_usage().sum() / 1024 ** 2 / 1024
        print('Original data occupies {} GB memory.'.format(init_memory))
        for col in tqdm_notebook(df.columns):
            try:
                dtype_name = str(df[col].dtypes)
                if 'float' in dtype_name:
                    trans_types = self._get_type(df[col].min(), df[col].max(), 'float')
                    if trans_types is not None:
                        df[col] = df[col].astype(trans_types)
                elif 'int' in dtype_name:
                    trans_types = self._get_type(df[col].min(), df[col].max(), 'int')
                    if trans_types is not None:
                        df[col] = df[col].astype(trans_types)
            except Exception:
                # Narrowed from a bare ``except:``; keep the original
                # best-effort behaviour of skipping problem columns.
                print(' Can not do any process for column, {}.'.format(col))
        afterprocess_memory = df.memory_usage().sum() / 1024 ** 2 / 1024
        print('After processing, the data occupies {} GB memory.'.format(afterprocess_memory))
        return df


os.chdir(r'E:\项目文件\阿里云安全恶意程序检测')
train = pd.read_csv('security_train.csv')
test = pd.read_csv('security_test.csv')


def simple_sts_features(df):
    """Per-file count / nunique statistics over the api/tid/index columns."""
    simple_fea = pd.DataFrame()
    simple_fea['file_id'] = df['file_id'].unique()
    simple_fea = simple_fea.sort_values('file_id')
    df_grp = df.groupby('file_id')
    # Column order matches the original hand-written version:
    # api_count, api_nunique, tid_count, tid_nunique, index_count, index_nunique.
    for col in ('api', 'tid', 'index'):
        simple_fea['file_id_{}_count'.format(col)] = df_grp[col].count().values
        simple_fea['file_id_{}_nunique'.format(col)] = df_grp[col].nunique().values
    return simple_fea


simple_train_fea1 = simple_sts_features(train)
simple_test_fea1 = simple_sts_features(test)


def simple_numerical_sts_features(df):
    """Per-file mean/min/std/max statistics for the tid and index columns."""
    fea = pd.DataFrame()
    fea['file_id'] = df['file_id'].unique()
    fea = fea.sort_values('file_id')
    df_grp = df.groupby('file_id')
    for col in ('tid', 'index'):
        for stat in ('mean', 'min', 'std', 'max'):
            fea['file_id_{}_{}'.format(col, stat)] = getattr(df_grp[col], stat)().values
    return fea


simple_train_fea2 = simple_numerical_sts_features(train)
simple_test_fea2 = simple_numerical_sts_features(test)

# One row per file: the label table for training, the id table for submission.
train_label = train[['file_id', 'label']].drop_duplicates(subset=['file_id', 'label'], keep='first')
test_submit = test[['file_id']].drop_duplicates(subset=['file_id'], keep='first')

# Build the train / test design matrices.
train_data = train_label.merge(simple_train_fea1, on='file_id', how='left')
train_data = train_data.merge(simple_train_fea2, on='file_id', how='left')
test_submit = test_submit.merge(simple_test_fea1, on='file_id', how='left')
test_submit = test_submit.merge(simple_test_fea2, on='file_id', how='left')


def lgb_logloss(preds, data):
    """Custom multiclass log-loss for ``lgb.train``'s ``feval``.

    ``preds`` arrives flattened class-major: ``preds[c*n:(c+1)*n]`` holds
    the class-``c`` probabilities for the n samples.  Returns the
    (name, value, is_higher_better) triple LightGBM expects.
    """
    labels_ = data.get_label()
    classes_ = np.unique(labels_)
    preds_prob = [preds[i * len(labels_):(i + 1) * len(labels_)]
                  for i in range(len(classes_))]
    preds_prob_ = np.vstack(preds_prob)  # shape: (n_classes, n_samples)
    loss = []
    for i in range(preds_prob_.shape[1]):      # over samples
        sum_ = 0
        for j in range(preds_prob_.shape[0]):  # over classes
            pred = preds_prob_[j, i]           # P(sample i is class j)
            if j == labels_[i]:
                sum_ += np.log(pred)
            else:
                sum_ += np.log(1 - pred)
        loss.append(sum_)
    return 'loss is: ', -1 * (np.sum(loss) / preds_prob_.shape[1]), False


# --- 5-fold cross-validated LightGBM training ---------------------------
# NOTE: the original script ran this identical loop twice back-to-back and
# threw the first run's models away; the duplicate has been removed.
train_features = [col for col in train_data.columns if col not in ['label', 'file_id']]
# Renamed from ``train_label`` in the original, which shadowed the label
# DataFrame defined above with a plain string.
label_col = 'label'

params = {
    'task': 'train',
    'num_leaves': 255,
    'objective': 'multiclass',
    'num_class': 8,
    'min_data_in_leaf': 50,
    'learning_rate': 0.05,
    'feature_fraction': 0.85,
    'bagging_fraction': 0.85,
    'bagging_freq': 5,
    'max_bin': 128,
}
folds = KFold(n_splits=5, shuffle=True, random_state=15)
models = []
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_data)):
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(train_data.iloc[trn_idx][train_features],
                           label=train_data.iloc[trn_idx][label_col].values)
    val_data = lgb.Dataset(train_data.iloc[val_idx][train_features],
                           label=train_data.iloc[val_idx][label_col].values)
    clf = lgb.train(params, trn_data, num_boost_round=2000,
                    valid_sets=[trn_data, val_data], verbose_eval=50,
                    early_stopping_rounds=100, feval=lgb_logloss)
    models.append(clf)

# Feature importance of the last fold's model.
feature_importance = pd.DataFrame()
feature_importance['fea_name'] = train_features
feature_importance['fea_imp'] = models[-1].feature_importance()
feature_importance = feature_importance.sort_values('fea_imp', ascending=False)
plt.figure(figsize=(20, 10))
sns.barplot(x=feature_importance['fea_name'], y=feature_importance['fea_imp'])

# Average the per-fold predicted probabilities (was a hard-coded ``fold = 5``).
pred_res = 0
for model in models:
    pred_res += model.predict(test_submit[train_features]) * 1.0 / len(models)

prob_cols = ['prob{}'.format(i) for i in range(8)]
# Create the columns first so the block assignment below always succeeds.
for col in prob_cols:
    test_submit[col] = 0
test_submit[prob_cols] = pred_res
test_submit[['file_id'] + prob_cols].to_csv('baseline.csv', index=False)
喜欢记得一键三连
转载地址:https://data-mining.blog.csdn.net/article/details/109557312 如侵犯您的版权,请留言回复原文章的地址,我们会给您删除此文章,给您带来不便请您谅解!
发表评论
最新留言
不错!
[***.144.177.141]2024年04月17日 01时25分21秒
关于作者
喝酒易醉,品茶养心,人生如梦,品茶悟道,何以解忧?唯有杜康!
-- 愿君每日到此一游!
推荐文章
Eclipse下搭建Spring框架(一)
2019-04-30
Eclipse下搭建Spring框架(二)Eclipse下Spring 实例
2019-04-30
C3P0出现的问题(一)
2019-04-30
C3P0出现的问题(二)
2019-04-30
Java--C3P0的使用
2019-04-30
漫画算法:什么是红黑树?
2019-04-30
ThreadLocal底层实现原理
2019-04-30
如何判断链表有环?解释+实现
2019-04-30
解决idea没有tomcat server选项的问题
2019-04-30
SSM-遇见的一些小坑(一)-mapper的问题
2019-04-30
SSM-遇见的一些小坑(二)解析请求的问题
2019-04-30
SSM-遇见的一些小坑(三)Controller层出现的错误
2019-04-30
SSM-遇见的一些小坑(四)测试接口时出现返回500的问题
2019-04-30
SSM-遇见的一些小坑(五)解决跨域问题
2019-04-30
Jsp调用Action的几种方法--做个记录
2019-04-30
SSM-遇见的一些小坑(五)jstl标签库
2019-04-30
Mybatis-mapper.xml相关记录
2019-04-30
insert ignore into--跳坑
2019-04-30
SecureCRT的安装和破解--亲试可用
2019-04-30