nn.GRU takes the same constructor parameters and the same call (input) parameters as nn.RNN.
import torch
from torch import nn
from torch.nn import functional as F

seq_len = 85
batch_size = 32
embedding_dim = 256
output_size = 512
bidirectional = True
num_layers = 2
single_rnn_nums = num_layers

model = nn.GRU(input_size=256, hidden_size=output_size,
               num_layers=num_layers, bidirectional=bidirectional)
x = torch.randn([seq_len, batch_size, embedding_dim])

# Every time step produces a hidden state.
# output collects the hidden states of all time steps:
# [seq_len, batch_size, output_size] (the last dimension doubles when bidirectional=True).
if bidirectional:
    single_rnn_nums = num_layers * 2
h0 = torch.zeros([single_rnn_nums, batch_size, output_size])

# ht is the output at each time step t; hn is the output of the last time step.
output, hn = model(x, h0)
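As a quick sanity check of the shapes (a minimal sketch; the printed sizes follow from the configuration above and PyTorch's documented GRU output layout):

# With bidirectional=True the output's last dimension is 2 * hidden_size,
# and hn stacks num_layers * 2 direction-specific final states.
print(output.shape)  # torch.Size([85, 32, 1024])
print(hn.shape)      # torch.Size([4, 32, 512])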
Replacing the RNN with a GRU
import os
import sys

class ParamConfig():
    padding_idx = 0
    embedding_dim = 256
    hidden_size = 512
    output_size = 2
    bidirectional = True
    num_layers = 2
    batch_size = 128
    debug = False
    BASE_DIR = sys.path[0]
    param_path = os.path.join(BASE_DIR, "model/model_gru2.pkl")
    log_file = os.path.join(BASE_DIR, "main.log")

    def __init__(self, isTest=False, seq_len=seq_len) -> None:
        if isTest:  # Tests do not need real data; use made-up sizes to verify the model quickly
            self.dict_len = 10000
            self.seq_len = seq_len
            self.debug = True
        else:
            # load_hotel is the project's data-loading helper (defined elsewhere)
            words_set, word2idx = load_hotel(return_dict=True)
            dict_len = len(words_set)
            print(f"dict_len:{dict_len}")  # dict_len:21437
            self.dict_len = dict_len
            self.word2idx = word2idx
            self.seq_len = seq_len

pm = ParamConfig()

class RNNClassify1(nn.Module):
    def __init__(self, dict_len, input_size, hidden_size, output_size,
                 num_layers=2, bidirectional=True, debug=pm.debug):
        super(RNNClassify1, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=dict_len,
                                      embedding_dim=input_size,
                                      padding_idx=0)
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.debug = debug
        self.model = nn.GRU(input_size=input_size, hidden_size=hidden_size,
                            num_layers=num_layers, bidirectional=bidirectional)
        if bidirectional:
            self.single_rnn_nums = num_layers * 2
        else:
            self.single_rnn_nums = num_layers
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, X):
        """
        - X: 2-D matrix of token indices
        """
        if X.ndim != 2:
            raise ValueError("X must be 2-dimensional")
        # The batch size is dynamic: whatever value you configure, the last
        # (remainder) batch will most likely not match it.
        batch_size = X.shape[0]
        x = self.embedding(X)  # [B, seq_len] --> [B, seq_len, embedding]
        if self.debug:
            print(f"embedding:{x.shape}")  # embedding:torch.Size([128, 85, 256])
        x = torch.permute(input=x, dims=(1, 0, 2))  # [B, seq_len, embedding] --> [seq_len, B, embedding]
        if self.debug:
            print(f"x.shape after permute:{x.shape}")  # torch.Size([85, 128, 256])
        h0 = torch.zeros(self.single_rnn_nums, batch_size, self.hidden_size)
        # h0 and hn are the two ends of one directional RNN chain: the initial
        # vector fed to the first token and the output vector of the last token.
        out, hn = self.model(x, h0)
        # By the RNN intuition, the output of the last token contains the
        # information of the whole sequence, i.e. the last token's output is
        # also the output of the sequence, so each hidden_size-long vector in
        # hn is the context vector of one sequence.
        # With batch_size sequences there should be batch_size context vectors
        # of length hidden_size, so hn needs to become [batch_size, hidden_size].
        # Summing hn over dim=0 removes that dimension, which is exactly what
        # is needed: dim=0 of hn indexes the directional RNN chains, so the sum
        # fuses the last-token outputs of all chains into one vector.
        # Addition is used for the fusion here; concatenating the outputs of
        # the directional chains is another option.
        # out = torch.sum(input=hn, dim=0)
        out = out[:, :, self.hidden_size:] + out[:, :, :self.hidden_size]
        out = torch.sum(input=out, dim=0)
        # Fully connected classifier
        out = self.fc(out)
        return out
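A minimal smoke test of this class, assuming the test-mode config above (dict_len=10000, random indices); the shapes in the comments are illustrative, not results from the original article:

# Hypothetical quick check: build the model from the test-mode config and run one batch.
pm_test = ParamConfig(isTest=True, seq_len=85)
clf = RNNClassify1(dict_len=pm_test.dict_len,
                   input_size=pm_test.embedding_dim,
                   hidden_size=pm_test.hidden_size,
                   output_size=pm_test.output_size,
                   num_layers=pm_test.num_layers,
                   bidirectional=pm_test.bidirectional,
                   debug=True)
X = torch.randint(low=0, high=pm_test.dict_len, size=(pm_test.batch_size, pm_test.seq_len))
logits = clf(X)
print(logits.shape)  # expected: torch.Size([128, 2])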
Underfitting
Training-set accuracy is not high enough. In that case, you can try some of the components below.
Overfitting
When training-set accuracy is already close to 100% and the loss is already very small, but test-set accuracy no longer improves, the model has shown essentially all of its capacity; at that point small optimizations may no longer make a difference.
Replacing the single-layer classifier

out = self.fc(out)

with a multi-layer head

# Fully connected classifier
out = self.fc1(out)
out = self.fc2(out)
out = self.fc3(out)

where

self.fc1 = nn.Linear(hidden_size, 4 * hidden_size)
self.fc2 = nn.Linear(4 * hidden_size, hidden_size)
self.fc3 = nn.Linear(hidden_size, output_size)

makes the model converge faster, but has almost no effect on the final accuracy.

The second effect: with a single fully connected layer, the training-set score is noticeably higher than the test-set score at the start of training, with an obvious gap between the two; with the multi-layer head, the training-set and test-set scores stay close at the start of training, i.e. there is no obvious early sign of overfitting.
Multi-layer fully connected head: code
class RNNClassify1(nn.Module):
    def __init__(self, dict_len, input_size, hidden_size, output_size,
                 num_layers=2, bidirectional=True, debug=pm.debug):
        super(RNNClassify1, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=dict_len,
                                      embedding_dim=input_size,
                                      padding_idx=0)
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.debug = debug
        self.model = nn.GRU(input_size=input_size, hidden_size=hidden_size,
                            num_layers=num_layers, bidirectional=bidirectional)
        if bidirectional:
            self.single_rnn_nums = num_layers * 2
        else:
            self.single_rnn_nums = num_layers
        self.fc1 = nn.Linear(hidden_size, 4 * hidden_size)
        self.fc2 = nn.Linear(4 * hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)

    def forward(self, X):
        """
        - X: 2-D matrix of token indices
        """
        if X.ndim != 2:
            raise ValueError("X must be 2-dimensional")
        # The batch size is dynamic: the last (remainder) batch will most likely
        # not match the configured value.
        batch_size = X.shape[0]
        x = self.embedding(X)  # [B, seq_len] --> [B, seq_len, embedding]
        if self.debug:
            print(f"embedding:{x.shape}")  # embedding:torch.Size([128, 85, 256])
        x = torch.permute(input=x, dims=(1, 0, 2))  # [B, seq_len, embedding] --> [seq_len, B, embedding]
        if self.debug:
            print(f"x.shape after permute:{x.shape}")  # torch.Size([85, 128, 256])
        h0 = torch.zeros(self.single_rnn_nums, batch_size, self.hidden_size)
        # h0 and hn are the two ends of one directional RNN chain: the initial
        # vector fed to the first token and the output vector of the last token.
        out, hn = self.model(x, h0)
        # By the RNN intuition, the output of the last token contains the
        # information of the whole sequence, so each hidden_size-long vector in
        # hn is the context vector of one sequence; with batch_size sequences
        # we want hn reduced to [batch_size, hidden_size].
        # Summing hn over dim=0 removes that dimension: dim=0 indexes the
        # directional RNN chains, so the sum fuses the last-token outputs of
        # all chains by addition (concatenation would be the other option).
        # out = torch.sum(input=hn, dim=0)
        out = out[:, :, self.hidden_size:] + out[:, :, :self.hidden_size]
        out = torch.sum(input=out, dim=0)
        # Fully connected classifier
        out = self.fc1(out)
        out = self.fc2(out)
        out = self.fc3(out)
        return out
The effect of num_layers on the RNN
Raising num_layers from 2 to 5 did not improve accuracy: the best is still 89%, with accuracy oscillating between 86% and 88%. Two RNN layers already extract all the useful information; more layers do not extract anything more useful.
Code
import os
import sys

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

class ParamConfig():
    padding_idx = 0
    embedding_dim = 256
    hidden_size = 512
    output_size = 2
    bidirectional = True
    num_layers = 5
    batch_size = 128
    debug = False
    BASE_DIR = sys.path[0]
    param_path = os.path.join(BASE_DIR, "model/model_gru7_2.pkl")
    log_file = os.path.join(BASE_DIR, "main.log")

    def __init__(self, isTest=False, seq_len=seq_len) -> None:
        if isTest:  # Tests do not need real data; use made-up sizes to verify the model quickly
            self.dict_len = 10000
            self.seq_len = seq_len
            self.debug = True
        else:
            words_set, word2idx = load_hotel(return_dict=True)
            dict_len = len(words_set)
            print(f"dict_len:{dict_len}")  # dict_len:21437
            self.dict_len = dict_len
            self.word2idx = word2idx
            self.seq_len = seq_len

pm = ParamConfig()
# pm = ParamConfig(isTest=True)

class MyDataSet(Dataset):
    """Build the dataset."""
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        x = self.X[idx]
        y = self.y[idx]
        return torch.tensor(data=x).long(), torch.tensor(data=y).long()

# X_train/y_train/X_test/y_test come from the project's preprocessing step (not shown here)
train_dataset = MyDataSet(X=X_train, y=y_train)
test_dataset = MyDataSet(X=X_test, y=y_test)

# Draw batches from the dataset
train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=pm.batch_size)
for X, y in train_dataloader:
    print(X.shape, X.ndim, y.shape, y.ndim)  # torch.Size([128, 85]) 2 torch.Size([128]) 1
    break

class RNNClassify1(nn.Module):
    def __init__(self, dict_len, input_size, hidden_size, output_size,
                 num_layers=2, bidirectional=True, debug=pm.debug):
        super(RNNClassify1, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=dict_len,
                                      embedding_dim=input_size,
                                      padding_idx=0)
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.debug = debug
        self.model = nn.GRU(input_size=input_size, hidden_size=hidden_size,
                            num_layers=num_layers, bidirectional=bidirectional)
        if bidirectional:
            self.single_rnn_nums = num_layers * 2
        else:
            self.single_rnn_nums = num_layers
        self.fc1 = nn.Linear(hidden_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, X):
        """
        - X: 2-D matrix of token indices
        """
        if X.ndim != 2:
            raise ValueError("X must be 2-dimensional")
        # The batch size is dynamic: the last (remainder) batch will most likely
        # not match the configured value.
        batch_size = X.shape[0]
        x = self.embedding(X)  # [B, seq_len] --> [B, seq_len, embedding]
        if self.debug:
            print(f"embedding:{x.shape}")  # embedding:torch.Size([128, 85, 256])
        x = torch.permute(input=x, dims=(1, 0, 2))  # [B, seq_len, embedding] --> [seq_len, B, embedding]
        if self.debug:
            print(f"x.shape after permute:{x.shape}")  # torch.Size([85, 128, 256])
        h0 = torch.zeros(self.single_rnn_nums, batch_size, self.hidden_size)
        # h0 and hn are the two ends of one directional RNN chain: the initial
        # vector fed to the first token and the output vector of the last token.
        out, hn = self.model(x, h0)
        # Fuse the forward and backward direction outputs by addition (instead of
        # summing hn over its chain dimension), then sum over the time dimension
        # to get one context vector of length hidden_size per sequence.
        # out = torch.sum(input=hn, dim=0)
        out = out[:, :, self.hidden_size:] + out[:, :, :self.hidden_size]
        out = torch.sum(input=out, dim=0)
        # Fully connected classifier
        out = self.fc1(out)
        out = self.fc2(out)
        return out
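The article reports accuracies but does not show the training loop. A minimal sketch of one, assuming CrossEntropyLoss, Adam, and 10 epochs (all three are assumptions, not taken from the original):

# Hypothetical training loop: loss function, optimizer and epoch count are assumptions.
model = RNNClassify1(dict_len=pm.dict_len,
                     input_size=pm.embedding_dim,
                     hidden_size=pm.hidden_size,
                     output_size=pm.output_size,
                     num_layers=pm.num_layers,
                     bidirectional=pm.bidirectional)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(10):
    model.train()
    for X, y in train_dataloader:
        optimizer.zero_grad()
        logits = model(X)            # [batch_size, output_size]
        loss = criterion(logits, y)
        loss.backward()
        optimizer.step()
    print(f"epoch {epoch} loss {loss.item():.4f}")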
nn.ReLU()
ReLU(x) = (x)⁺ = max(0, x)
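For instance, applying nn.ReLU to a small tensor zeroes out the negative entries (a minimal sketch):

import torch
from torch import nn

relu = nn.ReLU()
x = torch.tensor([-2.0, -0.5, 0.0, 1.5])
print(relu(x))  # tensor([0.0000, 0.0000, 0.0000, 1.5000])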
nn.PReLU()
PReLU(x) = max(0, x) + a * min(0, x)

ReLU simply discards negative values; PReLU instead gives negative values a learnable weight a.
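For comparison (a minimal sketch; in PyTorch the weight a of nn.PReLU is a learnable parameter initialized to 0.25 by default):

prelu = nn.PReLU()  # one learnable parameter a, initialized to 0.25
x = torch.tensor([-2.0, -0.5, 0.0, 1.5])
print(prelu(x))  # tensor([-0.5000, -0.1250, 0.0000, 1.5000], grad_fn=...)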
Effect of ReLU in this task
ReLU: no gain.
PReLU: not tried.
class RNNClassify1(nn.Module):
    def __init__(self, dict_len, input_size, hidden_size, output_size,
                 num_layers=2, bidirectional=True, debug=pm.debug):
        super(RNNClassify1, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=dict_len,
                                      embedding_dim=input_size,
                                      padding_idx=0)
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.debug = debug
        self.model = nn.GRU(input_size=input_size, hidden_size=hidden_size,
                            num_layers=num_layers, bidirectional=bidirectional)
        if bidirectional:
            self.single_rnn_nums = num_layers * 2
        else:
            self.single_rnn_nums = num_layers
        self.dropout = nn.Dropout(p=0.5)
        self.fc1 = nn.Linear(hidden_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, X):
        """
        - X: 2-D matrix of token indices
        """
        if X.ndim != 2:
            raise ValueError("X must be 2-dimensional")
        # The batch size is dynamic: the last (remainder) batch will most likely
        # not match the configured value.
        batch_size = X.shape[0]
        x = self.embedding(X)  # [B, seq_len] --> [B, seq_len, embedding]
        if self.debug:
            print(f"embedding:{x.shape}")  # embedding:torch.Size([128, 85, 256])
        x = torch.permute(input=x, dims=(1, 0, 2))  # [B, seq_len, embedding] --> [seq_len, B, embedding]
        if self.debug:
            print(f"x.shape after permute:{x.shape}")  # torch.Size([85, 128, 256])
        h0 = torch.zeros(self.single_rnn_nums, batch_size, self.hidden_size)
        # h0 and hn are the two ends of one directional RNN chain: the initial
        # vector fed to the first token and the output vector of the last token.
        out, hn = self.model(x, h0)
        out = out[:, :, self.hidden_size:] + out[:, :, :self.hidden_size]
        out = torch.sum(input=out, dim=0)
        # Fully connected classifier with dropout in between
        out = self.fc1(out)
        out = self.dropout(out)
        out = self.fc2(out)
        return out

In this task, whose goal is essentially picking out negative words, p=0.5 is too large a dropout rate: it causes accuracy to oscillate between 86% and 88%.
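Since the note above suggests p=0.5 drops too much, a quick look at what nn.Dropout actually does (a minimal sketch: in training mode roughly p of the activations are zeroed and the survivors are scaled by 1/(1-p); in eval mode it is the identity):

drop = nn.Dropout(p=0.5)
x = torch.ones(8)

drop.train()
print(drop(x))  # roughly half the entries become 0, the survivors become 2.0

drop.eval()
print(drop(x))  # tensor([1., 1., 1., 1., 1., 1., 1., 1.]) -- no-op at inference time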
Two layers, bidirectional
The RNN backbone stays two-layer and bidirectional throughout; this is kept unchanged, and it may also be part of why the results are comparatively good.
Length of sequence dependency
When a sequence is too long, information loss becomes severe. As a rule of thumb, information starts to get lost once the sequence length exceeds 20.
Using output vs. hn
This step can be changed; the effect may differ from task to task:

# out = torch.sum(input=hn, dim=0)
out = out[:, :, self.hidden_size:] + out[:, :, :self.hidden_size]
out = torch.sum(input=out, dim=0)

Here, using output works a bit better than using hn. Without any extra processing of the words, the GRU reaches 89%, occasionally climbing to 90% but quickly falling back to 89%, second only to TextCNN's 91%; the plain RNN can reach 88%. The concatenation variant has not been tried here yet (a sketch of it follows below). For now, TextCNN is the model in this text classification project that is both fast and good, where "fast" refers to training speed.
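A minimal, self-contained sketch of the untried concatenation variant mentioned above, assuming a bidirectional GRU and a classifier whose input dimension is widened to 2 * hidden_size (both assumptions, not from the original):

import torch
from torch import nn

seq_len, batch_size, hidden_size = 85, 32, 512
gru = nn.GRU(input_size=256, hidden_size=hidden_size, num_layers=2, bidirectional=True)
fc = nn.Linear(2 * hidden_size, 2)   # classifier input widened to 2 * hidden_size

x = torch.randn(seq_len, batch_size, 256)
out, hn = gru(x)                     # out: [85, 32, 1024], forward and backward halves side by side

# Concatenation variant: keep the two directions side by side instead of adding them,
# then sum over the time dimension to get one context vector per sequence.
ctx = torch.sum(out, dim=0)          # [32, 1024]
logits = fc(ctx)                     # [32, 2]
print(logits.shape)                  # torch.Size([32, 2])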
Activation functions: ReLU, Leaky ReLU, PReLU and RReLU