416. Riiid Answer Correctness Prediction | riiid-test-answer-prediction
恭喜所有获奖队伍以及新晋的 Grandmaster、Master 和 Expert。感谢组织者和 Kaggle 提供了如此精彩的比赛,这表明 Kaggle 比赛不仅仅是一场游戏,也可以是一个有用的机器学习项目。
对于 GBM 特征，我们没有使用大量字典来保存特征数据，而是开发了一个基于 numba 的框架来加速特征工程过程和在线计算。首先，数据按 ['user_id', 'timestamp', 'content_id'] 排序，并按 user_id 拆分为多个数组。然后，我们通过自定义的滚动函数或自定义的累积函数在各个数组上创建特征。实际上，这为我们提供了一种非常灵活的方式来创建并测试特征。在 1000 万条数据上，特征工程过程仅需 5 分钟即可完成。
下面列出了一些示例。
from tqdm import tqdm
from numba import jit,njit
from joblib import Parallel, delayed
from tqdm import tqdm
import gc
from multiprocessing import Process, Manager,Pool
from functools import partial
from numba import prange
import numpy as np
import pandas as pd
from numba import types
from numba.typed import Dict
import functools, time
from numba.typed import List
def timeit(f):
    """Decorator that prints the wall-clock duration of every call to ``f``.

    The wrapped function's return value is passed through unchanged. The
    elapsed time is rounded to two decimals before being printed.

    Args:
        f: The callable to time.

    Returns:
        A wrapper with the same call signature as ``f``.
    """
    # functools.wraps keeps f's __name__/__doc__ on the wrapper, so decorated
    # functions stay identifiable in tracebacks, logs and introspection.
    @functools.wraps(f)
    def wrap(*args, **kwargs):
        time1 = time.time()
        ret = f(*args, **kwargs)
        time2 = time.time()
        print('{:s} function took {:.3f} s'.format(f.__name__, np.round(time2-time1, 2)))
        return ret
    return wrap
def rolling_feat_group(train, col_used):
    """Sort the selected columns and split the rows into per-key groups.

    Rows are ordered by column 0, then column 1, then column 2 of
    ``train[col_used]``, and cut wherever the value in column 0 changes.

    Args:
        train: DataFrame holding at least the columns named in ``col_used``.
        col_used: Column labels to extract; the first three are the sort keys.

    Returns:
        A tuple ``(groups, order, col_used)`` where ``groups`` is a list of
        2-D arrays (one per distinct value of column 0), ``order`` is the
        permutation that sorts the original rows, and ``col_used`` is the
        argument passed in, returned as-is.
    """
    values = train[col_used].values
    # np.lexsort uses the LAST key as the primary one, so column 0 leads.
    order = np.lexsort((values[:, 2], values[:, 1], values[:, 0]))
    ordered = values[order]
    # Index of the first row of every distinct column-0 value; dropping the
    # leading 0 leaves exactly the cut points np.split needs.
    _, first_rows = np.unique(ordered[:, 0], return_index=True)
    groups = np.split(ordered, first_rows[1:])
    return groups, order, col_used
@jit(nopython = True, fastmath = True)
def rolling_cal(arr, step, window = 5, shift_ = 1):
    """Rolling NaN-aware mean and sum over the values preceding each run.

    ``step`` partitions ``arr`` into consecutive runs (``step[k]`` rows each).
    Every row of a run receives the same two statistics, computed from the
    ``window`` values that come immediately before the run starts; rows inside
    the run never contribute to their own statistics.

    Args:
        arr: 1-D array of values.
        step: Iterable of run lengths.
        window: Number of preceding values to aggregate.
        shift_: Unused; kept in the signature.

    Returns:
        Array of shape ``(arr.shape[0], 2)``: column 0 is the ``np.nanmean``
        and column 1 the ``np.nansum`` of the look-back window.
    """
    # NOTE(review): fastmath=True lets LLVM assume NaN-free arithmetic while
    # this function relies on NaN padding + nanmean/nansum -- confirm results.
    n_stats = 2
    # Left-pad with NaN so early runs still get a full-width window.
    padded = np.concatenate((np.full((window, ), np.nan), arr))
    out = np.zeros((arr.shape[0], n_stats))
    start = 0
    for count in step:
        # padded[start:start + window] are the `window` values before arr[start].
        history = padded[start:start + window]
        stop = start + count
        out[start:stop, 0] = np.nanmean(history)
        out[start:stop, 1] = np.nansum(history)
        start = stop
    return out
@jit(nopython = True, fastmath = True)
def rolling_time_cal(arr, window = 5, shift_ = 1):
    """Rolling NaN-aware mean over each value and its ``window`` predecessors.

    Unlike a strictly-lagged window, the slice for row ``i`` ends at (and
    includes) ``arr[i]`` itself, so ``window + 1`` slots are averaged.

    Args:
        arr: 1-D array of values.
        window: Number of preceding values included alongside the current one.
        shift_: Unused; kept in the signature.

    Returns:
        Array of shape ``(arr.shape[0], 1)`` holding the ``np.nanmean`` of
        each window.
    """
    # NOTE(review): fastmath=True lets LLVM assume NaN-free arithmetic while
    # this function relies on NaN padding + nanmean -- confirm results.
    n_stats = 1
    # Left-pad with NaN so the first rows still get a full-width window.
    padded = np.concatenate((np.full((window, ), np.nan), arr))
    n_rows = arr.shape[0]
    out = np.zeros((n_rows, n_stats))
    span = window + 1
    for row in range(n_rows):
        # padded[row + window] is arr[row], so the slice ends on the current value.
        out[row, 0] = np.nanmean(padded[row:row + span])
    return out
def rolling_cal_wrap(tmp_g, shift_period):
    """Run ``rolling_cal`` for several window sizes and stack the results.

    Run lengths are derived from column 1 of ``tmp_g`` (number of rows per
    distinct value); column 2 supplies the values being aggregated.
    Presumably column 1 is the timestamp, so rows sharing a timestamp form
    one run -- verify against the caller.

    Args:
        tmp_g: 2-D array for one group, with at least three columns.
        shift_period: Iterable of window sizes.

    Returns:
        2-D array with the per-window outputs of ``rolling_cal`` concatenated
        side by side, in the order of ``shift_period``.
    """
    _, rows_per_key = np.unique(tmp_g[:, 1], return_counts=True)
    per_window = [
        rolling_cal(tmp_g[:, 2], rows_per_key, width)
        for width in shift_period
    ]
    return np.concatenate(per_window, axis = 1)
def rolling_time_cal_wrap(tmp_g, shift_period):
    """Run ``rolling_time_cal`` for several window sizes and stack the results.

    Column 2 of ``tmp_g`` supplies the values being averaged.

    Args:
        tmp_g: 2-D array for one group, with at least three columns.
        shift_period: Iterable of window sizes.

    Returns:
        2-D array with the per-window outputs of ``rolling_time_cal``
        concatenated side by side, in the order of ``shift_period``.
    """
    per_window = [
        rolling_time_cal(tmp_g[:, 2], width)
        for width in shift_period
    ]
    return np.concatenate(per_window, axis = 1)
def rolling_feat_cal(tmp_g, name_dict, global_period):
answer_idx = name_dict.index('answered_correctly')
prior_idx = name_dict.index('prior_question_elapsed_time')
item_mean_idx = name_dict.index('item_mean')
task_set_idx = name_dict.index('task_set_distance')
tmp_res1 = rolling_cal_wrap