570. Google - American Sign Language Fingerspelling Recognition | asl-fingerspelling
这场比赛与语音识别非常相似。因此,我尝试借鉴了语音识别中使用的新技术。其中,最有效的是 Conformer架构、interCTC技术、强数据增强 和 无掩码处理。
在提供的543个关键点中,我使用了42个手部数据、33个姿态数据和40个唇部数据。输入特征通过连接xy坐标(丢弃z轴)和运动数据(xy[1:] - xy[:-1])构建。
def pre_process0(x):
    """Crop, clean and reorganise raw landmark frames into (T, 115, 2) xy coords.

    Input rows are flat per-frame feature vectors laid out as four segments
    (left hand, right hand, pose, lips), each stored as [x..., y..., z...].
    The z axis is dropped at the end.
    """
    # Truncate to the maximum frame count and zero out any NaN entries.
    x = x[:args.max_frame]
    x = tf.where(tf.math.is_nan(x), tf.zeros_like(x), x)
    t = tf.shape(x)[0]
    # (column start, column end, landmark count) for each body segment.
    segments = (
        (0, 63, args.n_hand_landmarks),     # left hand
        (63, 126, args.n_hand_landmarks),   # right hand
        (126, 225, args.n_pose_landmarks),  # pose
        (225, 345, args.n_face_landmarks),  # lips
    )
    # Reshape each flat [x..., y..., z...] segment to (T, n_landmarks, 3).
    parts = [
        tf.transpose(tf.reshape(x[:, lo:hi], (t, 3, count)), (0, 2, 1))
        for lo, hi, count in segments
    ]
    # Stack all landmark groups along the landmark axis, keep only (x, y).
    return tf.concat(parts, axis=1)[:, :, :2]
def decode_fn(record_bytes, augment=False):
    """Parse one serialized TFRecord example into (coordinates, phrase).

    Appends motion features (frame-to-frame deltas, zero-padded on the last
    frame) along the feature axis, matching the xy[1:] - xy[:-1] construction
    described in the write-up. When `augment` is true, `augment_fn` is applied
    before the deltas are computed.
    """
    parsed = tf.io.parse_single_example(record_bytes, {
        'coordinates': tf.io.VarLenFeature(tf.float32),
        'phrase': tf.io.VarLenFeature(tf.int64),
    })
    coordinates = tf.sparse.to_dense(parsed["coordinates"])
    coordinates = tf.reshape(coordinates, (-1, args.input_dim))
    phrase = tf.sparse.to_dense(parsed["phrase"])
    if augment:
        coordinates, phrase = augment_fn(coordinates, phrase)
    # Deltas need at least two frames; a single-frame clip gets all zeros.
    motion = tf.cond(
        tf.shape(coordinates)[0] > 1,
        lambda: tf.pad(coordinates[1:] - coordinates[:-1], [[0, 1], [0, 0]]),
        lambda: tf.zeros_like(coordinates),
    )
    return tf.concat([coordinates, motion], axis=-1), phrase
应用了三种数据增强方法,每种方法都显著影响了交叉验证结果:
def flip_hand(video):
    """Swap left/right hands and mirror each hand's x coords about its own mean.

    The non-hand landmarks (pose, lips) are passed through unchanged.
    """
    frames = tf.reshape(video, shape=(-1, args.n_landmarks, 2))
    n_hand = int(args.n_hand_landmarks)
    hands = frames[:, :2 * n_hand]
    rest = frames[:, 2 * n_hand:]
    left = hands[:, :n_hand]
    right = hands[:, n_hand:]

    def mirror_x(hand):
        # Reflect x about the per-frame mean: x -> 2*mean(x) - x; y untouched.
        xcol = hand[:, :, 0]
        xcol = tf.negative(xcol) + 2 * tf.reduce_mean(xcol, axis=1, keepdims=True)
        return tf.concat([tf.expand_dims(xcol, axis=-1), hand[:, :, 1:]], axis=-1)

    # After flipping, the right hand takes the left slot and vice versa.
    flipped = tf.concat([mirror_x(right), mirror_x(left), rest], axis=1)
    return tf.reshape(flipped, shape=(-1, args.input_dim))
def reverse_frames(x, y):
    """Augmentation: play the clip backwards and reverse the label sequence."""
    return x[::-1], y[::-1]
def cat_augment(inputs, inputs2):
    """With probability 0.5, concatenate two samples along the time axis.

    The concatenation only happens when the combined clip still fits under
    args.max_frame; otherwise the first sample is returned unchanged.
    """
    x, y = inputs
    x2, y2 = inputs2
    # Coin flip AND length budget must both pass for the merge to occur.
    coin = tf.random.uniform(()) < 0.5
    do_concat = coin & (tf.shape(x)[0] + tf.shape(x2)[0] < args.max_frame)
    x = tf.cond(do_concat, lambda: tf.concat([x, x2], axis=0), lambda: x)
    y = tf.cond(do_concat, lambda: tf.concat([y, y2], axis=0), lambda: y)
    return x, y
模型构建采用了Conformer提出的架构。最初我使用了TensorSpeech/TensorFlowASR的Conformer代码,但通过对@hoyso48使用的代码进行特定修改,成功实现了Conformer架构。这一实现使LB分数提升了0.008。该改进可能是由于移除了Conformer块之间的残差连接,从而能够集成更多层并提高抗过拟合能力。此外,结合interCTC技术(论文)也贡献了约0.005的LB分数提升。
此外,由于模型输出采用无掩码计算,且CTC损失的"输入长度"变量使用最大长度384,采用更大的卷积核尺寸(=31)有效地将帧信息传递到384长度范围内,从而提升了性能。
def get_model(max_len=384, dim=160, ksize=31, drop_rate=0.1, num_layers=16):
    """Build the Conformer-style encoder with an intermediate-CTC head.

    Args:
        max_len: kept for interface compatibility; the graph itself is
            length-agnostic (the input time dimension is None). Per the
            write-up, the 384 limit is enforced by the CTC input-length
            setup outside this function — TODO confirm against the caller.
        dim: model width (channels after the stem projection).
        ksize: depthwise conv kernel size; a large kernel (31) spreads frame
            information across the sequence.
        drop_rate: dropout rate inside each Conv1DBlock.
        num_layers: number of TransformerBlock+Conv1DBlock pairs; must be
            >= 8 because the intermediate CTC head taps xs[-8].

    Returns:
        (x1, x2): intermediate-CTC logits and final logits, both of shape
        (batch, time, NUM_CLASSES).
    """
    NUM_CLASSES = 63  # output vocabulary size
    # Input features: 115 landmarks * 2 coords (xy) * 2 (position + motion).
    inp = tf.keras.Input((None, 2 * 230))
    x = tf.keras.layers.Dense(dim, use_bias=False, name='stem_conv')(inp)
    x = tf.keras.layers.BatchNormalization(momentum=0.95, name='stem_bn')(x)
    xs = []
    for _ in range(num_layers):
        x = TransformerBlock(dim, expand=2)(x)
        x = Conv1DBlock(dim, ksize, drop_rate=drop_rate)(x)
        xs.append(x)
    # One classifier shared by both heads (interCTC): the intermediate head
    # reads the 8th-from-last layer, the final head reads the last layer.
    classifier = tf.keras.layers.Dense(NUM_CLASSES, name='classifier')
    x1 = classifier(tf.keras.layers.Dropout(0.2)(xs[-8]))
    x2 = classifier(tf.keras.layers.Dropout(0.2)(xs[-1]))
    return x1, x2