PyTorch Example: A Text Sentiment Analysis Project
The following is a complete example of a text sentiment analysis project implemented in PyTorch, performing binary classification (positive/negative sentiment) on the IMDb dataset. The project uses an LSTM network (a recurrent neural network well suited to sequence data) for text classification and covers data preprocessing, model definition, training, validation, testing, and inference. The code is clear and thoroughly commented, suitable for beginners and for developers who need to get up to speed quickly.
Project overview
- Task: classify IMDb movie reviews by sentiment (positive/negative).
- Dataset: the IMDb dataset (loaded via `torchtext`; 25,000 training samples and 25,000 test samples).
- Model: an LSTM network combining a word embedding layer (Embedding) with a fully connected layer.
- Features: text preprocessing, model training, validation, testing, model saving, and inference.
- Environment: PyTorch and torchtext, running on CPU or GPU.
```python
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import IMDB
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split
import numpy as np

# Set random seeds for reproducibility
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed(42)

# Hyperparameters
vocab_size = 20000     # vocabulary size
embed_dim = 128        # word embedding dimension
hidden_dim = 256       # LSTM hidden-state dimension
num_classes = 2        # positive / negative
batch_size = 32
epochs = 5
learning_rate = 0.001
max_seq_len = 500      # maximum sequence length

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')
# 1. Data preprocessing
tokenizer = get_tokenizer('basic_english')

# Build the vocabulary
def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)

train_iter = IMDB(split='train')
vocab = build_vocab_from_iterator(yield_tokens(train_iter),
                                  specials=['<unk>', '<pad>'],
                                  max_tokens=vocab_size)
vocab.set_default_index(vocab['<unk>'])

# Text and label processing pipelines
def text_pipeline(text):
    tokens = tokenizer(text)[:max_seq_len]  # truncate to the maximum length
    return vocab(tokens)
def label_pipeline(label):
    # Recent torchtext releases yield integer labels (1 = neg, 2 = pos),
    # while older releases yield 'neg'/'pos' strings. Map both to 0/1.
    if isinstance(label, str):
        return 1 if label == 'pos' else 0  # positive: 1, negative: 0
    return 1 if label == 2 else 0
# Custom collate function: pad variable-length sequences in each batch
def collate_batch(batch):
    labels = torch.tensor([label_pipeline(label) for label, _ in batch], dtype=torch.long)
    texts = [text_pipeline(text) for _, text in batch]
    lengths = torch.tensor([len(text) for text in texts], dtype=torch.long)
    # Pad sequences to the same length
    texts = nn.utils.rnn.pad_sequence(
        [torch.tensor(text, dtype=torch.long) for text in texts],
        batch_first=True,
        padding_value=vocab['<pad>']
    )
    return texts, labels, lengths

# Load the IMDb dataset
train_iter, test_iter = IMDB(split=('train', 'test'))
train_dataset = list(train_iter)
test_dataset = list(test_iter)

# Split into training and validation sets
num_train = len(train_dataset)
train_size = int(0.8 * num_train)
val_size = num_train - train_size
train_data, val_data = random_split(train_dataset, [train_size, val_size])

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)
# 2. Define the LSTM model
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=vocab['<pad>'])
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)
        self.dropout = nn.Dropout(0.3)

    def forward(self, text, lengths):
        embedded = self.dropout(self.embedding(text))  # (batch_size, seq_len, embed_dim)
        # Pack the padded sequences so the LSTM handles variable lengths efficiently
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        packed_output, (hidden, _) = self.lstm(packed_embedded)
        output = self.dropout(hidden[-1])  # hidden state of the last layer
        return self.fc(output)

# Initialize the model
model = LSTMClassifier(vocab_size, embed_dim, hidden_dim, num_classes).to(device)

# 3. Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)
# 4. Training and validation functions
def train(model, loader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    correct = 0
    total = 0
    for texts, labels, lengths in loader:
        texts, labels, lengths = texts.to(device), labels.to(device), lengths.to(device)
        optimizer.zero_grad()
        outputs = model(texts, lengths)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    return total_loss / len(loader), 100 * correct / total

def validate(model, loader, criterion, device):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for texts, labels, lengths in loader:
            texts, labels, lengths = texts.to(device), labels.to(device), lengths.to(device)
            outputs = model(texts, lengths)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return total_loss / len(loader), 100 * correct / total
# 5. Training loop
best_val_acc = 0
for epoch in range(epochs):
    train_loss, train_acc = train(model, train_loader, criterion, optimizer, device)
    val_loss, val_acc = validate(model, val_loader, criterion, device)
    scheduler.step()
    print(f'Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%, '
          f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%')
    # Save the best model so far
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), 'best_lstm_model.pth')
        print(f'Saved best model with Val Acc: {val_acc:.2f}%')
# 6. Evaluate on the test set
model.load_state_dict(torch.load('best_lstm_model.pth', map_location=device))
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for texts, labels, lengths in test_loader:
        texts, labels, lengths = texts.to(device), labels.to(device), lengths.to(device)
        outputs = model(texts, lengths)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
test_acc = 100 * correct / total
print(f'Test Accuracy: {test_acc:.2f}%')
# 7. Inference example
def predict_sentiment(model, text, vocab, tokenizer, device, max_seq_len=500):
    model.eval()
    tokens = tokenizer(text)[:max_seq_len]
    indexed = vocab(tokens)
    length = torch.tensor([len(indexed)], dtype=torch.long)
    tensor = torch.tensor(indexed, dtype=torch.long).unsqueeze(0).to(device)
    with torch.no_grad():
        output = model(tensor, length.to(device))
        _, predicted = torch.max(output, 1)
    return 'Positive' if predicted.item() == 1 else 'Negative'

# Sample text
sample_text = "This movie was fantastic and really touching!"
prediction = predict_sentiment(model, sample_text, vocab, tokenizer, device)
print(f'Sample text: {sample_text}')
print(f'Predicted sentiment: {prediction}')
```
Code explanation
- Data preprocessing:
  - The IMDb dataset is loaded with `torchtext`, and a vocabulary is built (capped at 20,000 tokens).
  - Text is tokenized with the `basic_english` tokenizer and truncated to the maximum length (500).
  - Variable-length sequences are padded with `pad_sequence` using the `<pad>` token.
  - The training set is split into 80% training and 20% validation.
- Model:
  - `LSTMClassifier` combines a word embedding (`nn.Embedding`), an LSTM layer, and a fully connected layer.
  - Variable-length sequences are handled efficiently via `pack_padded_sequence`.
  - Dropout (0.3) is added to reduce overfitting.
- Training and validation:
  - Adam optimizer with a StepLR scheduler (the learning rate is reduced every 3 epochs).
  - Training and validation loss/accuracy are tracked, and the model with the best validation accuracy is saved.
- Testing and inference:
  - The best model is loaded for testing, and the overall accuracy is reported.
  - The `predict_sentiment` function classifies new text.
- Runtime environment:
  - Supports CPU and GPU (detected automatically).
  - Setting `num_workers=2` on the DataLoader speeds up data loading (Windows users may need to set it to 0); see the sketch below.
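A minimal sketch of that DataLoader tweak, reusing `train_data` and `collate_batch` from the project code above (the worker count of 2 is just an example; tune it for your machine):

```python
import platform

# Use worker subprocesses for loading on Linux/macOS; fall back to 0 on Windows,
# where multiprocessing data loading is often more trouble than it is worth.
num_workers = 0 if platform.system() == 'Windows' else 2
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True,
                          collate_fn=collate_batch, num_workers=num_workers)
```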
Requirements
- Dependencies: `pip install torch torchtext`
- Hardware: CPU or GPU; GPU acceleration requires a CUDA-capable NVIDIA card.
- Dataset: the IMDb dataset is downloaded automatically by `torchtext`.
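A quick sanity check of the environment before running the project (not part of the project code itself):

```python
import torch
import torchtext

# Print the installed versions and whether a CUDA device is visible.
print('torch:', torch.__version__, '| torchtext:', torchtext.__version__)
print('CUDA available:', torch.cuda.is_available())
```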
Example output
Running the code may produce output similar to the following (exact numbers vary with randomness and hardware):

```
Using device: cuda
Epoch 1/5, Train Loss: 0.6234, Train Acc: 65.32%, Val Loss: 0.5432, Val Acc: 72.45%
Saved best model with Val Acc: 72.45%
Epoch 2/5, Train Loss: 0.4567, Train Acc: 78.91%, Val Loss: 0.4321, Val Acc: 80.12%
Saved best model with Val Acc: 80.12%
...
Test Accuracy: 79.56%
Sample text: This movie was fantastic and really touching!
Predicted sentiment: Positive
```
Advanced suggestions
- Use pretrained word embeddings:
  - Replace the randomly initialized `nn.Embedding` weights with pretrained GloVe or fastText vectors:
    ```python
    from torchtext.vocab import GloVe

    # Note: the GloVe dimension must match embed_dim (set embed_dim = 100 if you use dim=100).
    glove = GloVe(name='6B', dim=100)
    model.embedding.weight.data = glove.get_vecs_by_tokens(vocab.get_itos())
    ```
- Use a Transformer:
  - Replace the LSTM with a Transformer encoder (e.g., `nn.TransformerEncoder`):
    ```python
    encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=8)
    transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=2)
    ```
- Data augmentation:
  - Augment the text with synonym replacement or random word deletion.
  - Use the `nlpaug` library for more sophisticated augmentation:
    ```python
    import nlpaug.augmenter.word as naw
    aug = naw.SynonymAug()
    ```
- Performance optimization:
  - Use mixed-precision training:
    ```python
    from torch.cuda.amp import autocast, GradScaler

    scaler = GradScaler()
    with autocast():
        outputs = model(texts, lengths)
        loss = criterion(outputs, labels)
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()
    ```
- Model deployment:
  - Export to TorchScript:
    ```python
    model.eval()
    traced_model = torch.jit.trace(
        model,
        (torch.randint(0, vocab_size, (1, max_seq_len)).to(device),
         torch.tensor([max_seq_len]).to(device))
    )
    traced_model.save('lstm_sentiment.pt')
    ```
  - Export to ONNX (requires adapting the model to a fixed input shape); see the sketch after this list.
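For the ONNX route, here is a rough sketch, assuming the model is first simplified to accept a single fixed-length token tensor (for example, a variant of `LSTMClassifier` whose `forward` drops `pack_padded_sequence`, since packed sequences do not export cleanly); `fixed_model` and the output file name are placeholders:

```python
import torch

# Assumption: `fixed_model` is a fixed-shape variant of LSTMClassifier whose forward()
# takes only a (batch, max_seq_len) token tensor.
dummy_input = torch.randint(0, vocab_size, (1, max_seq_len)).to(device)
torch.onnx.export(
    fixed_model,               # hypothetical fixed-shape model
    dummy_input,
    'lstm_sentiment.onnx',     # placeholder output path
    input_names=['tokens'],
    output_names=['logits'],
    opset_version=13,
)
```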
常见问题与注意事项
- 内存不足:降低
batch_size
或max_seq_len
,或使用混合精度训练。 - 数据加载慢:在 Linux/Mac 上增加
num_workers
,Windows 用户可能需设为 0。 - 过拟合:增加 Dropout、添加正则化(如
weight_decay=1e-4
),或使用更多数据增强。 - 词汇表大小:调整
vocab_size
平衡模型性能和内存占用。 - 版本兼容性:确保
torchtext
版本与 PyTorch 兼容(如torchtext>=0.12
)。
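A small example of the weight-decay option mentioned above (1e-4 is the value from the note; tune it for your data):

```python
# Adam with L2 regularization via weight_decay to help curb overfitting.
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-4)
```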
References
- Official documentation:
  - `torchtext`: https://pytorch.org/text/stable/index.html
  - `torch.nn.LSTM`: https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html
- Tutorial: https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html
- Community forum: https://discuss.pytorch.org/
Further help
If you want to extend this project (for example, switch to a Transformer model, work with a different dataset, or deploy to production), optimize performance (quantization, faster inference), or debug a problem, share more details and I can provide tailored code or solutions!