331. Jigsaw Unintended Bias in Toxicity Classification | jigsaw-unintended-bias-in-toxicity-classification
Thanks to my teammates @zongbingwang @wxzcyy.
Final ensemble:

0.1 * mean(6 LSTM) + 0.1 * mean(3 GPT-2) + 0.8 * mean(2 bert-base-uncased + 1 bert-large-uncased) = 0.94599
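As a rough illustration, the blend is just a weighted average of per-family mean predictions. A minimal sketch, assuming hypothetical arrays of shape `(n_models, n_samples)` holding each family's test-set probabilities (the file names below are placeholders, not from the original kernels):

```python
import numpy as np

# Placeholder files: each array has shape (n_models, n_samples).
lstm_preds = np.load('lstm_preds.npy')   # 6 LSTM runs
gpt2_preds = np.load('gpt2_preds.npy')   # 3 GPT-2 runs
bert_preds = np.load('bert_preds.npy')   # 2 bert-base-uncased + 1 bert-large-uncased

# Average within each model family, then blend with the stated weights.
final = (0.1 * lstm_preds.mean(axis=0)
         + 0.1 * gpt2_preds.mean(axis=0)
         + 0.8 * bert_preds.mean(axis=0))
```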
bert-base code link:
https://www.kaggle.com/hanyaopeng/single-bert-base-with-0-94376
Sample weighting (`coll` singles out four subgroups for extra weight on top of the standard identity columns):

```python
import numpy as np

coll = ['black', 'white', 'homosexual_gay_or_lesbian', 'muslim']
identity_columns = [
    'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
    'muslim', 'black', 'white', 'psychiatric_or_mental_illness']

# Base weight for every sample.
weights = np.ones((len(train_df),)) / 4
# Samples that mention any identity subgroup.
weights += (train_df[identity_columns].fillna(0).values >= 0.5).sum(axis=1).astype(bool).astype(int) / 4
# Background positive, subgroup negative.
weights += (((train_df['target'].values >= 0.5).astype(bool).astype(int) +
             (train_df[identity_columns].fillna(0).values < 0.5).sum(axis=1).astype(bool).astype(int)) > 1
            ).astype(bool).astype(int) / 4
# Background negative, subgroup positive.
weights += (((train_df['target'].values < 0.5).astype(bool).astype(int) +
             (train_df[identity_columns].fillna(0).values >= 0.5).sum(axis=1).astype(bool).astype(int)) > 1
            ).astype(bool).astype(int) / 4
# The same two cases, restricted to the four subgroups in `coll`, get extra weight.
weights += (((train_df['target'].values >= 0.5).astype(bool).astype(int) +
             (train_df[coll].fillna(0).values < 0.5).sum(axis=1).astype(bool).astype(int)) > 1
            ).astype(bool).astype(int) / 8
weights += (((train_df['target'].values < 0.5).astype(bool).astype(int) +
             (train_df[coll].fillna(0).values >= 0.5).sum(axis=1).astype(bool).astype(int)) > 1
            ).astype(bool).astype(int) / 8

# Rescale the weighted BCE term so its magnitude matches an unweighted loss.
loss_weight = 1.0 / weights.mean()
weights = weights.reshape(-1, 1)
```
```python
from torch import nn

def custom_loss(data, targets):
    ''' Weighted BCE on the 'target' column plus plain BCE on the auxiliary targets. '''
    # targets layout: [:, 0] per-sample weight, [:, 1] main target, [:, 2:] aux targets.
    bce_loss_1 = nn.BCEWithLogitsLoss(weight=targets[:, 0:1])(data[:, 0:1], targets[:, 1:2])
    bce_loss_2 = nn.BCEWithLogitsLoss()(data[:, 1:], targets[:, 2:])
    return (bce_loss_1 * loss_weight) + bce_loss_2
```
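For this loss to line up, the target matrix must put the weight in column 0, the main target in column 1, and the auxiliary toxicity-subtype targets after that. A minimal assembly sketch; the exact `aux_columns` set is an assumption, not fixed by the code above:

```python
import numpy as np
import torch

# Assumed auxiliary toxicity-subtype columns from the competition data.
aux_columns = ['severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']

# Column 0: weight, column 1: main target, columns 2+: aux targets.
y_train = np.hstack([
    weights,                                    # (N, 1) from the weighting code above
    train_df['target'].values.reshape(-1, 1),   # soft label works directly with BCE
    train_df[aux_columns].fillna(0).values,     # (N, len(aux_columns))
]).astype(np.float32)

y_train_torch = torch.tensor(y_train)
```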
```python
import numpy as np
from tqdm import tqdm_notebook

def convert_lines(example, max_seq_length, tokenizer):
    ''' Tokenize texts into fixed-length BERT input ids: [CLS] + tokens + [SEP] + zero padding. '''
    max_seq_length -= 2  # reserve room for the [CLS] and [SEP] tokens
    all_tokens = []
    longer = 0  # counts how many texts had to be truncated
    for text in tqdm_notebook(example):
        tokens_a = tokenizer.tokenize(text)
        if len(tokens_a) > max_seq_length:
            tokens_a = tokens_a[:max_seq_length]
            longer += 1
        one_token = (tokenizer.convert_tokens_to_ids(["[CLS]"] + tokens_a + ["[SEP]"])
                     + [0] * (max_seq_length - len(tokens_a)))
        all_tokens.append(one_token)
    print(longer)
    return np.array(all_tokens)
```
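A typical call, assuming the `pytorch-pretrained-bert` tokenizer and a maximum sequence length of 220 (both assumptions; the snippet above fixes neither):

```python
from pytorch_pretrained_bert import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
X = convert_lines(train_df['comment_text'].fillna('').values, 220, tokenizer)
```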
Code details:
See the other solutions for this competition.