A Deep Dive into the Transformer Architecture
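The function below implements the standard scaled dot-product attention from "Attention Is All You Need":

\[
\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\!\left(\frac{QK^{\top}}{\sqrt{d_k}}\right)V
\]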
# Self-attention mechanism implementation
import tensorflow as tf

def scaled_dot_product_attention(Q, K, V, mask=None):
    # Raw attention scores: (..., seq_len_q, seq_len_k)
    matmul_qk = tf.matmul(Q, K, transpose_b=True)
    # Scale by sqrt(d_k) so the softmax stays in a well-conditioned range
    dk = tf.cast(tf.shape(K)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
    # Mask out padded or future positions with a large negative value
    if mask is not None:
        scaled_attention_logits += (mask * -1e9)
    # Normalize over the key dimension to obtain attention weights
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
    return tf.matmul(attention_weights, V)
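A minimal usage sketch follows; the batch size of 1, sequence length of 3, model width of 4, and the causal-mask construction are illustrative assumptions, not part of the original text.

# Illustrative inputs: (batch, seq_len, d_model) shapes are assumed for this sketch
Q = tf.random.normal((1, 3, 4))
K = tf.random.normal((1, 3, 4))
V = tf.random.normal((1, 3, 4))

# Causal mask: 1 marks positions to block (future tokens), 0 keeps them visible
causal_mask = 1.0 - tf.linalg.band_part(tf.ones((3, 3)), -1, 0)

output = scaled_dot_product_attention(Q, K, V, mask=causal_mask)
print(output.shape)  # (1, 3, 4)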