实验目的
- 了解机器学习算法理论基础
- 平台实现算法
- 编程实现机器学习
实验原理
机器学习(Machine Learning, ML)是一门多领域交叉学科,涉及概率论、统计学、逼近论、凸分析、算法复杂度理论等多门学科。专门研究计算机怎样模拟或实现人类的学习行为,以获取新的知识或技能,重新组织已有的知识结构使之不断改善自身的性能。
实验环境
操作系统:Windows 10 专业版 20H2
平台:jupyter notebook
环境:anaconda python 3.7
python相关库:
- pytorch
- numpy
- pandas
- collections
- matplotlib
- sklearn
实验步骤
学习框架
本实验使用的是机器学习(ML)的分支——深度学习(DL),并利用pytorch
来进行模型的构建。
搭建一个简单的CNN网络,使用了BN批标准化和1*1卷积,学习率为0.1,weight_decay为0.2,激活函数均选用ReLU
。
class Network(nn.Module):
    """1-D CNN regressor.

    Stem conv -> two strided "dense" stages -> 1x1 bottleneck conv ->
    global average pooling -> linear head producing one value per sample.
    """

    def __init__(self, input_nc=1, output_nc=1):
        super(Network, self).__init__()
        # Stem: halve the sequence length and lift to 16 channels.
        self.Net = nn.Sequential(OrderedDict([
            ('conv0', nn.Conv1d(input_nc, 16, kernel_size=3, stride=2,
                                padding=1, bias=False)),
            ('norm0', nn.BatchNorm1d(16)),
            ('relu0', nn.ReLU(True)),
        ]))
        # Stage i doubles the channel count (16 -> 32 -> 64) and shrinks
        # the length by 4x via two stride-2 convolutions.
        for stage in (1, 2):
            width = 2 ** stage
            block = nn.Sequential(OrderedDict([
                ('conv1', nn.Conv1d(16 * (width // 2), 16 * width,
                                    kernel_size=3, stride=2,
                                    padding=1, bias=False)),
                ('relu1', nn.ReLU(True)),
                ('conv2', nn.Conv1d(16 * width, 16 * width,
                                    kernel_size=3, stride=2,
                                    padding=1, bias=False)),
                ('relu2', nn.ReLU(True)),
            ]))
            self.Net.add_module('dense' + str(stage), block)
        # 1x1 bottleneck: 64 -> 32 channels, then BN + ReLU.
        self.Net.add_module('conv1*1', nn.Conv1d(64, 32, kernel_size=1,
                                                 stride=1, padding=0,
                                                 bias=False))
        self.Net.add_module('norm1', nn.BatchNorm1d(32))
        self.Net.add_module('relu1', nn.ReLU(True))
        # Regression head on the pooled 32-dim feature vector.
        self.predict = nn.Linear(32, 1)

    def forward(self, x):
        """x: (batch, input_nc, length) -> (batch, 1) prediction."""
        features = self.Net(x)
        pooled = F.adaptive_avg_pool1d(features, 1)
        flat = torch.flatten(pooled, 1)
        return self.predict(flat)
搭建一个线性网络
class Network(nn.Module):
    """Linear baseline: one fully-connected layer mapping the 117 input
    features to the regression target(s).

    Fix: ``output_nc`` was accepted but ignored; it now sets the number
    of outputs. The default (1) keeps existing callers unchanged.
    """

    def __init__(self, input_nc=1, output_nc=1):
        super(Network, self).__init__()
        # 117 is presumably the one-hot feature width after get_dummies
        # — TODO confirm against the dataset.
        self.predict = nn.Linear(117, output_nc)

    def forward(self, x):
        """x: (batch, 117) -> (batch, output_nc)."""
        return self.predict(x)
数据加载
继承 torch.utils.data.Dataset类
来构建数据迭代器。
pandas.get_dummies方法
来进行独热编码(one hot编码)。
class GetDateset(Dataset):
    """Minimal map-style Dataset pairing features with aligned labels.

    (The original spelling "Dateset" is kept so existing callers work.)
    """

    def __init__(self, data, label):
        self.data = data      # indexable feature container
        self.label = label    # labels aligned with ``data``

    def __getitem__(self, index):
        return self.data[index], self.label[index]

    def __len__(self):
        return len(self.data)
数据分割
torch.utils.data.random_split方法
来进行数据集分割,比例为8:2。
class GetData:
    """Load a comma-separated dataset, one-hot encode, min-max scale,
    and build train/validation DataLoaders with an 80/20 random split.

    Fix: ``data_path`` was accepted but the file name 'test.txt' was
    hard-coded; the parameter is now honoured.
    """

    def __init__(self, data_path, batch_size):
        # Comma-separated file, no header row.
        df = pd.read_table(data_path, sep=',', header=None)
        # One-hot encode categoricals; NaN gets its own indicator column.
        df1 = pd.get_dummies(df, dummy_na=True)
        num = df1.values
        minmax_scaler = MinMaxScaler()
        # Column 0 is the regression target; columns 1..-2 are features
        # (the last column is dropped — presumably a dummy_na indicator,
        # TODO confirm).
        data = minmax_scaler.fit_transform(num[:, 1:-1])
        # Insert a channel axis: (N, F) -> (N, 1, F) for Conv1d input.
        data = data[:, np.newaxis, :]
        label = num[:, 0][:, np.newaxis]
        dataset = GetDateset(data, label)
        # 80/20 train/validation split.
        train_size = int(len(dataset) * 0.8)
        validate_size = len(dataset) - train_size
        traindata, validdata = torch.utils.data.random_split(
            dataset, [train_size, validate_size])
        self.trainiter = DataLoader(traindata, batch_size=batch_size,
                                    num_workers=0, shuffle=True)
        self.validiter = DataLoader(validdata, batch_size=batch_size,
                                    num_workers=0, shuffle=False)

    def get_iter(self):
        """Return ``(train_loader, valid_loader)``."""
        return self.trainiter, self.validiter
训练
使用MSE(均方误差)
来评判我们的训练效果。
学习率为0.1,迭代次数为50次,样本大小为300,使用cuda
加速.
class Train:
    """Training driver: wires data, model, loss and optimizer together
    and runs the epoch loop.

    Fixes: mutable default arguments (shared ``[]`` across instances),
    and the validation pass now runs under ``torch.no_grad()`` with the
    net in eval mode, so BatchNorm uses running statistics and no
    gradients are tracked.
    """

    def __init__(self, train_data_path='test.txt', num_epochs=50,
                 batch_size=300, lr=1e-1, weight_decay=0.2,
                 device='cuda:0', train_arr=None, test_arr=None):
        self.train_data_path = train_data_path
        self.num_epochs = num_epochs
        self.batch_size = batch_size
        self.lr = lr
        self.weight_decay = weight_decay
        self.device = device
        # Per-epoch loss histories; fresh lists unless caller shares its own.
        self.train_arr = [] if train_arr is None else train_arr
        self.test_arr = [] if test_arr is None else test_arr

    def dataloader(self):
        """Build the train/validation DataLoaders."""
        getdata = GetData(self.train_data_path, self.batch_size)
        self.train_iter, self.valid_iter = getdata.get_iter()

    def build_model(self):
        """Instantiate the network, MSE loss, and Adam optimizer."""
        self.net = Network().to(self.device)
        self.MSE_loss = nn.MSELoss().to(self.device)
        self.optim = torch.optim.Adam(self.net.parameters(), lr=self.lr,
                                      weight_decay=self.weight_decay)

    def train(self):
        """Run the epoch loop, log losses, and save the model at the end."""
        print("train start!")
        for epoch in range(self.num_epochs):
            train_loss = 0.0
            valid_loss = 0.0
            self.net.train()
            for data, label in self.train_iter:
                data = data.to(self.device, dtype=torch.float)
                label = label.to(self.device, dtype=torch.float)
                out = self.net(data)
                loss = self.MSE_loss(out, label)
                self.optim.zero_grad()
                loss.backward()
                self.optim.step()
                train_loss += loss.item()
            # Validation: eval-mode BatchNorm, no gradient tracking.
            self.net.eval()
            with torch.no_grad():
                for data, label in self.valid_iter:
                    data = data.to(self.device, dtype=torch.float)
                    label = label.to(self.device, dtype=torch.float)
                    out = self.net(data)
                    valid_loss += self.MSE_loss(out, label).item()
            print("Epoch %d. Train Loss: %f, Valid Loss: %f, "
                  % (epoch, train_loss / len(self.train_iter),
                     valid_loss / len(self.valid_iter)))
            self.train_arr.append(train_loss / len(self.train_iter))
            self.test_arr.append(valid_loss / len(self.valid_iter))
        # Persist the whole module (pickled); reload with torch.load.
        torch.save(self.net, 'Net.pth')
验证
由于训练集数据数量较多(30000+条),此次实验并未使用k折交叉验证。
同时做一个对比:
CNN网络与线性网络对比
图1:CNN网络的训练与验证误差
图2:线性网络的训练与验证误差
由线性网络作为基准,可以看出CNN网络提升很大,但仍然效果一般。
具体分析,可能是因为预测值与其他特征值之间的关系较弱,线性相关性较低。
模型的保存
模型在Train类
中保存(训练结束后)。
使用torch.save(self.net, 'Net.pth')方法
来保存模型。
if __name__ == '__main__':
    # Shared loss histories, filled in by Train and plotted afterwards.
    train_arr = []
    test_arr = []
    # Training: 200 epochs, batch 300, Adam lr=0.1, on the first CUDA GPU.
    Predict = Train(train_data_path='test.txt', num_epochs=200, batch_size=300,lr=1e-1, weight_decay=0.2, device='cuda:0',
                    train_arr=train_arr, test_arr=test_arr)
    Predict.dataloader()
    Predict.build_model()
    Predict.train()
    # Plot train/validation loss curves and save them to loss.png.
    plt.plot(train_arr)
    plt.plot(test_arr)
    plt.legend(['train_loss','test_loss'])
    plt.savefig('loss.png',dpi=600)
    plt.show()
备注(源码)
#!/usr/bin/env python
# coding: utf-8
# In[1]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import torch.nn.functional as F
from collections import OrderedDict
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
# In[2]:
class GetDateset(Dataset):
    """Map-style Dataset wrapping parallel feature/label containers.

    (The original spelling "Dateset" is kept so existing callers work.)
    """

    def __init__(self, data, label):
        # Both containers must support len() and integer indexing.
        self.data = data
        self.label = label

    def __getitem__(self, index):
        return self.data[index], self.label[index]

    def __len__(self):
        return len(self.data)
class GetData:
    """Load a comma-separated dataset, one-hot encode, min-max scale,
    and build train/validation DataLoaders with an 80/20 random split.

    Fix: ``data_path`` was accepted but the file name 'test.txt' was
    hard-coded; the parameter is now honoured.
    """

    def __init__(self, data_path, batch_size):
        # Comma-separated file, no header row.
        df = pd.read_table(data_path, sep=',', header=None)
        # One-hot encode categoricals; NaN gets its own indicator column.
        df1 = pd.get_dummies(df, dummy_na=True)
        num = df1.values
        minmax_scaler = MinMaxScaler()
        # Column 0 is the regression target; columns 1..-2 are features
        # (the last column is dropped — presumably a dummy_na indicator,
        # TODO confirm).
        data = minmax_scaler.fit_transform(num[:, 1:-1])
        # Insert a channel axis: (N, F) -> (N, 1, F) for Conv1d input.
        data = data[:, np.newaxis, :]
        label = num[:, 0][:, np.newaxis]
        dataset = GetDateset(data, label)
        # 80/20 train/validation split.
        train_size = int(len(dataset) * 0.8)
        validate_size = len(dataset) - train_size
        traindata, validdata = torch.utils.data.random_split(
            dataset, [train_size, validate_size])
        self.trainiter = DataLoader(traindata, batch_size=batch_size,
                                    num_workers=0, shuffle=True)
        self.validiter = DataLoader(validdata, batch_size=batch_size,
                                    num_workers=0, shuffle=False)

    def get_iter(self):
        """Return ``(train_loader, valid_loader)``."""
        return self.trainiter, self.validiter
# In[3]:
class Network(nn.Module):
    """1-D CNN regressor.

    Stem conv -> two strided "dense" stages -> 1x1 bottleneck conv ->
    global average pooling -> linear head producing one value per sample.
    """

    def __init__(self, input_nc=1, output_nc=1):
        super(Network, self).__init__()
        # Stem: halve the sequence length and lift to 16 channels.
        self.Net = nn.Sequential(OrderedDict([
            ('conv0', nn.Conv1d(input_nc, 16, kernel_size=3, stride=2,
                                padding=1, bias=False)),
            ('norm0', nn.BatchNorm1d(16)),
            ('relu0', nn.ReLU(True)),
        ]))
        # Stage i doubles the channel count (16 -> 32 -> 64) and shrinks
        # the length by 4x via two stride-2 convolutions.
        for stage in (1, 2):
            width = 2 ** stage
            block = nn.Sequential(OrderedDict([
                ('conv1', nn.Conv1d(16 * (width // 2), 16 * width,
                                    kernel_size=3, stride=2,
                                    padding=1, bias=False)),
                ('relu1', nn.ReLU(True)),
                ('conv2', nn.Conv1d(16 * width, 16 * width,
                                    kernel_size=3, stride=2,
                                    padding=1, bias=False)),
                ('relu2', nn.ReLU(True)),
            ]))
            self.Net.add_module('dense' + str(stage), block)
        # 1x1 bottleneck: 64 -> 32 channels, then BN + ReLU.
        self.Net.add_module('conv1*1', nn.Conv1d(64, 32, kernel_size=1,
                                                 stride=1, padding=0,
                                                 bias=False))
        self.Net.add_module('norm1', nn.BatchNorm1d(32))
        self.Net.add_module('relu1', nn.ReLU(True))
        # Regression head on the pooled 32-dim feature vector.
        self.predict = nn.Linear(32, 1)

    def forward(self, x):
        """x: (batch, input_nc, length) -> (batch, 1) prediction."""
        features = self.Net(x)
        pooled = F.adaptive_avg_pool1d(features, 1)
        flat = torch.flatten(pooled, 1)
        return self.predict(flat)
# In[4]:
class Train:
    """Training driver: wires data, model, loss and optimizer together
    and runs the epoch loop.

    Fixes: mutable default arguments (shared ``[]`` across instances),
    and the validation pass now runs under ``torch.no_grad()`` with the
    net in eval mode, so BatchNorm uses running statistics and no
    gradients are tracked.
    """

    def __init__(self, train_data_path='test.txt', num_epochs=50,
                 batch_size=300, lr=1e-1, weight_decay=0.2,
                 device='cuda:0', train_arr=None, test_arr=None):
        self.train_data_path = train_data_path
        self.num_epochs = num_epochs
        self.batch_size = batch_size
        self.lr = lr
        self.weight_decay = weight_decay
        self.device = device
        # Per-epoch loss histories; fresh lists unless caller shares its own.
        self.train_arr = [] if train_arr is None else train_arr
        self.test_arr = [] if test_arr is None else test_arr

    def dataloader(self):
        """Build the train/validation DataLoaders."""
        getdata = GetData(self.train_data_path, self.batch_size)
        self.train_iter, self.valid_iter = getdata.get_iter()

    def build_model(self):
        """Instantiate the network, MSE loss, and Adam optimizer."""
        self.net = Network().to(self.device)
        self.MSE_loss = nn.MSELoss().to(self.device)
        self.optim = torch.optim.Adam(self.net.parameters(), lr=self.lr,
                                      weight_decay=self.weight_decay)

    def train(self):
        """Run the epoch loop, log losses, and save the model at the end."""
        print("train start!")
        for epoch in range(self.num_epochs):
            train_loss = 0.0
            valid_loss = 0.0
            self.net.train()
            for data, label in self.train_iter:
                data = data.to(self.device, dtype=torch.float)
                label = label.to(self.device, dtype=torch.float)
                out = self.net(data)
                loss = self.MSE_loss(out, label)
                self.optim.zero_grad()
                loss.backward()
                self.optim.step()
                train_loss += loss.item()
            # Validation: eval-mode BatchNorm, no gradient tracking.
            self.net.eval()
            with torch.no_grad():
                for data, label in self.valid_iter:
                    data = data.to(self.device, dtype=torch.float)
                    label = label.to(self.device, dtype=torch.float)
                    out = self.net(data)
                    valid_loss += self.MSE_loss(out, label).item()
            print("Epoch %d. Train Loss: %f, Valid Loss: %f, "
                  % (epoch, train_loss / len(self.train_iter),
                     valid_loss / len(self.valid_iter)))
            self.train_arr.append(train_loss / len(self.train_iter))
            self.test_arr.append(valid_loss / len(self.valid_iter))
        # Persist the whole module (pickled); reload with torch.load.
        torch.save(self.net, 'Net.pth')
# In[5]:
if __name__ == '__main__':
    # Shared loss histories, filled in by Train and plotted afterwards.
    train_arr = []
    test_arr = []
    # Training: 200 epochs, batch 300, Adam lr=0.1, on the first CUDA GPU.
    Predict = Train(train_data_path='test.txt', num_epochs=200, batch_size=300,lr=1e-1, weight_decay=0.2, device='cuda:0',
                    train_arr=train_arr, test_arr=test_arr)
    Predict.dataloader()
    Predict.build_model()
    Predict.train()
    # Plot train/validation loss curves and save them to loss.png.
    plt.plot(train_arr)
    plt.plot(test_arr)
    plt.legend(['train_loss','test_loss'])
    plt.savefig('loss.png',dpi=600)
    plt.show()
2 条评论
您好~我是腾讯云+社区的运营,关注了您分享的技术文章,觉得内容很棒,我们诚挚邀请您加入腾讯云自媒体分享计划。完整福利和申请地址请见:https://cloud.tencent.com/developer/support-plan
作者申请此计划后将作者的文章进行搬迁同步到社区的专栏下,你只需要简单填写一下表单申请即可,我们会给作者提供包括流量、云服务器等,另外还有些周边礼物。
感谢分享 赞一个