362. Google QUEST Q&A Labeling | google-quest-challenge
感谢 Kaggle 举办了一场精彩的比赛!
我是 NLP 新手,所以我从优秀的 Kernel 中学到了很多知识,例如如何做预处理、如何构建 BERT 模型等。我的代码基于 @idv2005 的这个 Kernel 和 @adityaecdrid 的这个 Kernel,非常感谢!
以下是我工作的总结。(抱歉英语不好。)
# Number of bins used to discretise the predictions of each target column.
# Columns that do not appear here are left unbinned by the post-processing
# loop that consumes this mapping.
best_bin_dict = {
    "question_type_spelling": 4,
    "question_type_instructions": 8,
    "question_type_entity": 9,
    "question_type_definition": 8,
    "question_type_consequence": 4,
    "question_type_compare": 8,
    "question_type_choice": 8,
    "question_opinion_seeking": 50,
    "question_not_really_a_question": 4,
    "question_multi_intent": 9,
    "question_interestingness_self": 9,
    "question_interestingness_others": 100,
    "question_has_commonly_accepted_answer": 5,
    "question_fact_seeking": 10,
    "question_expect_short_answer": 50,
    "question_conversational": 8,
    "question_body_critical": 100,
    "question_asker_intent_understanding": 100,
    "answer_well_written": 9,
    "answer_type_procedure": 50,
    "answer_type_instructions": 9,
    "answer_relevance": 100,
    "answer_level_of_information": 50,
}
# Use the sample submission as the template for the output frame, so the
# column names and their order come from that file.
df_sub = pd.read_csv("../input/google-quest-challenge/sample_submission.csv")

# Combine the collected prediction arrays by summing along the first axis.
# NOTE(review): presumably `preds` holds one already-weighted
# (n_rows, n_targets) array per model/fold -- confirm against the code that
# builds it.
pred_final = np.sum(np.array(preds), axis=0)

# The first column is skipped (presumably the row identifier); every other
# column receives the column of `pred_final` at the same relative position.
target_columns = df_sub.columns[1:]
for position, target in enumerate(target_columns):
    df_sub[target] = pred_final[:, position]
# Index labels of the test rows whose host is NOT one of the two English
# language sites; "question_type_spelling" is forced to 0 for these rows.
not_type_spelling_idx = df_test.query("host not in ['ell.stackexchange.com', 'english.stackexchange.com']").index

# Positional boolean mask over the rows of df_test. The binned values below
# are a plain array that is indexed by position, so using the index labels
# directly would only hit the right rows when df_test carries a default
# 0..n-1 index. Assumes df_test and df_sub have the same row order.
not_type_spelling_mask = df_test.index.isin(not_type_spelling_idx)

for col in df_sub.columns[1:]:
    n_bins = best_bin_dict.get(col)
    if n_bins is None:
        # No bin count configured for this column: keep the raw predictions.
        continue

    # labels=False yields the integer bin code (0 .. n_bins - 1) of each
    # value; dividing by (n_bins - 1) maps the codes onto evenly spaced
    # levels between 0 and 1, stored as ordinary floats.
    bin_codes = pd.cut(df_sub[col].to_numpy(), n_bins, labels=False)
    binned = bin_codes / (n_bins - 1)

    if col == "question_type_spelling":
        binned[not_type_spelling_mask] = 0

    df_sub[col] = binned

# Write the post-processed predictions without the DataFrame index.
df_sub.to_csv("submission.csv", index=False)