576. Bengali.AI Speech Recognition | bengaliai-speech
非常感谢 organizers 举办这次有趣的比赛。语音识别是一个非常有趣的方向,我从许多参赛选手的讨论和公开代码中学到了很多。过去的三个月充满压力但也收获颇丰。我将尝试用我蹩脚的英语清晰地阐述我的解决方案。
我的解决方案相对简单:使用 wav2vec2 1b 模型作为预训练模型,训练一个 Wav2Vec2ForCTC 模型。在后处理阶段,使用 KenLM 训练了一个 6-gram 语言模型,并对输出结果进行归一化,以及针对 dari(孟加拉语句号「।」)的进一步后处理。
具体来说,我使用 facebook/wav2vec2-xls-r-1b 作为预训练模型。该模型的训练分为三个阶段,每个阶段使用不同的随机种子,并采用一致的数据增强和参数:
class LinearWarmupCosine3LongTailLRScheduler:
    """Three-stage learning-rate schedule with a linear warmup head.

    After warmup, the total iteration budget is split as follows:
      * stage 1 (first quarter): cosine decay from init_lr to min_lr;
      * stage 2 (second quarter): held at min_lr (the cosine is evaluated
        at its endpoint on every step);
      * stage 3 (second half, the "long tail"): cosine decay from min_lr
        down to 0.
    """

    def __init__(
        self,
        optimizer,
        max_epoch,
        min_lr,
        init_lr,
        iters_per_epoch,
        warmup_steps=0,
        warmup_start_lr=-1,
        **kwargs
    ):
        self.optimizer = optimizer
        self.max_epoch = max_epoch
        self.min_lr = min_lr
        self.init_lr = init_lr
        self.warmup_steps = warmup_steps
        self.iters_per_epoch = iters_per_epoch
        # A negative warmup_start_lr means "start the warmup from init_lr".
        self.warmup_start_lr = init_lr if warmup_start_lr < 0 else warmup_start_lr
        # Total number of optimizer steps over the whole run.
        self.max_iters = max_epoch * iters_per_epoch

    def step(self, cur_epoch, cur_step):
        """Set the optimizer's LR for global step (cur_epoch, cur_step)."""
        global_step = cur_epoch * self.iters_per_epoch + cur_step
        quarter = self.max_iters // 4
        half = self.max_iters // 2
        if global_step < self.warmup_steps:
            # NOTE(review): warmup is driven by the per-epoch counter
            # (cur_step), so it assumes warmup_steps fits inside the first
            # epoch — confirm if warmup could ever span epoch boundaries.
            warmup_lr_schedule(
                step=cur_step,
                optimizer=self.optimizer,
                max_step=self.warmup_steps,
                init_lr=self.warmup_start_lr,
                max_lr=self.init_lr,
            )
        elif global_step <= quarter:
            # Stage 1: cosine decay init_lr -> min_lr over the first quarter.
            cosine_lr_schedule(
                epoch=global_step,
                optimizer=self.optimizer,
                max_epoch=quarter,
                init_lr=self.init_lr,
                min_lr=self.min_lr,
            )
        elif global_step <= half:
            # Stage 2: the cosine is evaluated at its final point every
            # step, so the LR is effectively held constant at min_lr.
            cosine_lr_schedule(
                epoch=quarter,
                optimizer=self.optimizer,
                max_epoch=quarter,
                init_lr=self.init_lr,
                min_lr=self.min_lr,
            )
        else:
            # Stage 3 ("long tail"): cosine decay min_lr -> 0 over the
            # remaining half of the run.
            cosine_lr_schedule(
                epoch=global_step - half,
                optimizer=self.optimizer,
                max_epoch=half,
                init_lr=self.min_lr,
                min_lr=0,
            )
def cosine_lr_schedule(optimizer, epoch, max_epoch, init_lr, min_lr):
    """Cosine-anneal the LR from init_lr (epoch=0) to min_lr (epoch=max_epoch).

    Writes the computed rate into every param group of `optimizer`.
    """
    progress = epoch / max_epoch
    # Cosine weight goes 1 -> 0 as progress goes 0 -> 1.
    cosine_weight = 0.5 * (1.0 + math.cos(math.pi * progress))
    new_lr = min_lr + (init_lr - min_lr) * cosine_weight
    for group in optimizer.param_groups:
        group["lr"] = new_lr
def get_transform(musan_dir):
    """Build the waveform augmentation pipeline used during training.

    Applies (each with its own probability) time-stretch, gain jitter and
    pitch-shift, then with p=0.5 adds noise: either MUSAN background noise
    or Gaussian noise when `musan_dir` is given, Gaussian noise only when
    `musan_dir` is None.
    """
    gaussian_noise = AddGaussianNoise(min_amplitude=0.005, max_amplitude=0.015, p=1.0)
    if musan_dir is not None:
        noise_choices = [
            AddBackgroundNoise(
                sounds_path=musan_dir,
                min_snr_in_db=3.0,
                max_snr_in_db=30.0,
                noise_transform=PolarityInversion(),
                p=1.0,
            ),
            gaussian_noise,
        ]
    else:
        noise_choices = [gaussian_noise]
    return Compose(
        [
            TimeStretch(min_rate=0.9, max_rate=1.1, p=0.2, leave_length_unchanged=False),
            Gain(min_gain_in_db=-6, max_gain_in_db=6, p=0.1),
            PitchShift(min_semitones=-4, max_semitones=4, p=0.2),
            OneOf(noise_choices, p=0.5),
        ]
    )
# Matches any character that is NOT in the Bengali Unicode block
# (U+0980–U+09FF) and not whitespace; such characters are replaced by
# spaces in fix_text().
chars_to_ignore = re.compile(r'[^\u0980-\u09FF\s]')
# Matches runs of whitespace so they can be collapsed to a single space.
long_space_to_ignore = re.compile(r'\s+')
# Bengali word normalizer — presumably bnunicodenormalizer's Normalizer;
# bnorm(word) is expected to return a mapping with a "normalized" key
# (see its use in norm_sentence). TODO confirm the import.
bnorm = Normalizer()
def fix_text(text: str):
    """Keep only Bengali script: replace every non-Bengali, non-whitespace
    character with a space, then collapse whitespace runs and strip the ends.
    """
    bengali_only = re.sub(r'[^\u0980-\u09FF\s]', ' ', text)
    return re.sub(r'\s+', ' ', bengali_only).strip()
def norm_sentence(sentence):
    """Normalize a Bengali sentence for CTC training / scoring.

    Unicode-normalizes, strips non-Bengali characters, then runs every word
    through `bnorm`. Returns the rebuilt sentence, "" when fewer than two
    normalized words survive, or None when `bnorm` raises TypeError on a word.
    """
    sentence = fix_text(normalize(sentence))
    words = sentence.split()
    try:
        # Keep only truthy normalized forms; the join stays inside the try
        # because bnorm output may still trigger a TypeError here.
        kept = [w for w in (bnorm(word)["normalized"] for word in words) if w]
        if len(kept) < 2:
            return ""
        return " ".join(kept).strip()
    except TypeError:
        return None