# 基于pytorch代码了解transformer的自注意力机制

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class MultiHeadSelfAttention(nn.Module):
    """Multi-head self-attention as used in the Transformer.

    Projects the inputs with learned linear maps to get Q, K, V, splits
    them into `num_heads` independent heads, applies scaled dot-product
    attention per head, merges the heads, and applies a final output
    projection.
    """

    def __init__(self, d_model, num_heads=8):
        super().__init__()
        # d_model must split evenly across the heads.
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_head = d_model // num_heads  # per-head feature size

        # Linear projections for queries, keys, values, and the output.
        self.W_Q = nn.Linear(d_model, d_model)  # (d_model, d_model)
        self.W_K = nn.Linear(d_model, d_model)  # (d_model, d_model)
        self.W_V = nn.Linear(d_model, d_model)  # (d_model, d_model)
        self.W_O = nn.Linear(d_model, d_model)  # (d_model, d_model)

    def forward(self, Q, K, V, mask=None):
        """Compute multi-head attention.

        Args:
            Q, K, V: tensors of shape (batch_size, seq_len, d_model).
            mask: optional tensor broadcastable to
                (batch_size, num_heads, seq_len, seq_len); positions
                where mask == 0 are blocked from attention.

        Returns:
            Tensor of shape (batch_size, seq_len, d_model).
        """
        batch_size = Q.size(0)  # (batch_size, seq_len, d_model)

        # Linear projections; shapes stay (batch_size, seq_len, d_model).
        Q = self.W_Q(Q)
        K = self.W_K(K)
        V = self.W_V(V)

        # Split into heads: (batch_size, num_heads, seq_len, d_head).
        Q = Q.view(batch_size, -1, self.num_heads, self.d_head).transpose(1, 2)
        K = K.view(batch_size, -1, self.num_heads, self.d_head).transpose(1, 2)
        V = V.view(batch_size, -1, self.num_heads, self.d_head).transpose(1, 2)

        # Scaled dot-product scores: (batch_size, num_heads, seq_len, seq_len).
        scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.d_head ** 0.5)
        if mask is not None:
            # Block masked positions before the softmax.
            scores = scores.masked_fill(mask == 0, float("-inf"))
        attention_weights = F.softmax(scores, dim=-1)

        # Weighted sum of values: (batch_size, num_heads, seq_len, d_head).
        attention_output = torch.matmul(attention_weights, V)

        # Merge all heads back: (batch_size, seq_len, d_model).
        attention_output = (
            attention_output.transpose(1, 2)
            .contiguous()
            .view(batch_size, -1, self.d_model)
        )

        # Final output projection: (batch_size, seq_len, d_model).
        return self.W_O(attention_output)


# Demo
d_model = 512
batch_size = 64
seq_len = 10

mha = MultiHeadSelfAttention(d_model, num_heads=8)

Q = torch.randn(batch_size, seq_len, d_model)  # (64, 10, 512)
K = torch.randn(batch_size, seq_len, d_model)  # (64, 10, 512)
V = torch.randn(batch_size, seq_len, d_model)  # (64, 10, 512)
output = mha(Q, K, V)  # (64, 10, 512)
print(output.shape)  # torch.Size([64, 10, 512])
```

nn.Linear 初始化：

self.W_Q, self.W_K, self.W_V, self.W_O 是线性层，将输入的特征维度映射到相同的特征维度（d_model），权重矩阵大小为 (d_model, d_model)。

Q, K, V 的初始维度是 (batch_size, seq_len, d_model)，通过线性变换后仍然是相同的维度。