573. Kaggle - LLM Science Exam | kaggle-llm-science-exam
首先,衷心感谢主办方和 Kaggle 团队组织了这次精彩的比赛。感谢我的队友 @yuanji1239 的辛勤付出。同时感谢 Chris Deotte、MGoksu、MB 和 Radek 分享 OpenBook 技术!
def calu_lenq85(df, col):
    """Return the rounded 85th-percentile word count of a text column.

    Each value of ``df[col]`` is split on single spaces and the number of
    resulting pieces is taken as its length.

    Reference results recorded by the original author:
        calu_lenq85(df_train, "context")  -> 930
        calu_lenq85(df_valid, "context")  -> 877

    Args:
        df: DataFrame holding the text column.
        col: Name of the string column to measure.

    Returns:
        The 0.85 quantile of the per-row word counts, rounded to the
        nearest integer.
    """
    # Original author's note: mask == noise
    pieces_per_row = df[col].str.split(" ")
    word_counts = pieces_per_row.apply(len)
    q85 = np.quantile(word_counts, q=0.85)
    return round(q85)
def mask_string(s, len_q85=930, mask_token=None):
    """Randomly replace a small share of the words of a long text with a mask token.

    The text is split on single spaces. Only texts with more than
    ``len_q85`` words are altered; shorter texts are returned unchanged.
    For a long text, a ratio is drawn uniformly from [0, 0.08) and that
    fraction of word positions (rounded, chosen without replacement) is
    overwritten with the mask token.

    Args:
        s: Input text.
        len_q85: Word-count threshold above which masking is applied.
        mask_token: Token written into the masked positions. When ``None``
            (the default) the module-level ``tokenizer.mask_token`` is used.

    Returns:
        The text re-joined with single spaces, with some words replaced by
        the mask token when the text is longer than ``len_q85`` words.
    """
    words = s.split(" ")

    # Splitting and re-joining on " " is an identity, so short texts can be
    # handed back as-is without touching the tokenizer.
    if len(words) <= len_q85:
        return s

    if mask_token is None:
        mask_token = tokenizer.mask_token

    # Fraction of words to mask for this particular text.
    mask_ratio = np.random.uniform(low=0.0, high=0.08)
    num_replacements = round(len(words) * mask_ratio)

    # Distinct positions to overwrite.
    masked_positions = np.random.choice(
        len(words), size=num_replacements, replace=False
    )

    # Work on a plain Python list: a NumPy string array has a fixed item
    # width (the longest word), so assigning a longer mask token into it
    # would silently truncate the token.
    for position in masked_positions:
        words[position] = mask_token

    return " ".join(words)
def generate_prompt_option(df):
    """Prefix each answer option with the question stem for "What is ..." prompts.

    For every row whose ``prompt`` starts with ``"What is"``, each of the
    option columns ``A``-``E`` is rewritten in place as::

        prompt[7:-1] + " is that " + option

    i.e. the leading "What is" and the final character of the prompt are
    dropped. The stem keeps whatever follows "What is", including a leading
    space. Rows whose prompt does not start with "What is" (or is not a
    string) are left untouched.

    Args:
        df: DataFrame with a ``prompt`` column and option columns
            ``A``, ``B``, ``C``, ``D``, ``E``. Modified in place.

    Returns:
        The same DataFrame object, for call chaining.
    """
    prefix = "What is"

    # The slice below removes the first len(prefix) characters, so the row
    # filter must be a prefix match; a plain substring match would chop
    # unrelated text off prompts that merely contain the phrase.
    # Non-string prompts (e.g. NaN) are skipped instead of raising.
    is_what_is = (
        df["prompt"]
        .map(lambda p: isinstance(p, str) and p.startswith(prefix))
        .astype(bool)
    )

    # Nothing to rewrite (also covers an empty frame).
    if not is_what_is.any():
        return df

    # Drop the "What is" prefix and the final character of the prompt.
    stems = df.loc[is_what_is, "prompt"].map(lambda p: p[len(prefix):-1])

    for col in "ABCDE":
        df.loc[is_what_is, col] = stems + " is that " + df.loc[is_what_is, col]
    return df
使用 4 个 DeBERTa 模型和 Longformer 模型(由 @yuanji1239 提供)的集成,对线上测试集进行推理。