# 线性回归算法实例

1.调用Sklearn库函数
2.自己实现相关函数

```python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from pylab import mpl
mpl.rcParams['font.sans-serif'] = ['SimHei']  # 中文字体
mpl.rcParams['axes.unicode_minus'] = False # 负号

import warnings
warnings.filterwarnings('ignore')

def get_dataset2(path="dataset.txt", sep="|"):
    """Load two feature columns and one integer label column from a text file.

    Every non-blank line is split on ``sep``. Fields 2 and 3 (0-based) are
    parsed as float features and field 4 as the integer label.

    NOTE(review): the original listing used ``line`` without ever opening a
    file, i.e. the reading loop had been lost. The default ``path`` is only a
    placeholder -- confirm the real data file name before relying on it.

    Args:
        path: Location of the delimited data file.
        sep: Field delimiter used inside each line.

    Returns:
        A ``(data, label)`` pair of ``np.matrix`` objects. For a file with
        ``n >= 1`` records ``data`` has shape (n, 2) and ``label`` has
        shape (n, 1).

    Raises:
        OSError: If the file cannot be opened.
        ValueError, IndexError: If a non-blank line is malformed.
    """
    data = []
    label = []
    with open(path, encoding="utf-8") as handle:
        for line in handle:
            stripped = line.strip()
            if not stripped:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            tmpLine = stripped.split(sep)
            # Fields 2 and 3 are the features, field 4 is the label.
            data.append([float(tmpLine[2]), float(tmpLine[3])])
            label.append(int(tmpLine[4]))
    # Both containers are plain lists up to here; turn them into matrices so
    # that the label becomes a column vector after the transpose.
    data = np.matrix(data)
    label = np.matrix(label).transpose()
    return data, label

# Min-max normalise the feature set X.
def data_process(data):
    """Rescale every feature column of ``data`` into the range [0, 1].

    The input is converted with ``np.asarray`` first: the loader in this file
    produces ``np.matrix`` objects, which older scikit-learn releases accepted
    (as the original author observed while debugging) but which current
    releases reject with a ``TypeError``.

    Args:
        data: 2-D array-like of shape (n_samples, n_features).

    Returns:
        The scaled data as returned by ``MinMaxScaler.fit_transform``.
    """
    data = np.asarray(data)
    scaler = MinMaxScaler(feature_range=(0, 1))
    # fit_transform learns the per-column min/max and applies them in one call.
    return scaler.fit_transform(data)

# Split the data set into a training part and a test part.
def divided1(xdata, ydata, percent):
    """Split features and labels with ``train_test_split``.

    Args:
        xdata: Feature rows.
        ydata: Labels aligned with ``xdata``.
        percent: Forwarded as ``test_size`` to ``train_test_split``.

    Returns:
        The tuple ``(data_train, data_test, label_train, label_test)``.
    """
    pieces = train_test_split(xdata, ydata, test_size=percent)
    # train_test_split yields the four parts in exactly the order we return.
    return tuple(pieces)

# Shapes recorded by the original author for a 577-row data set:
#   len(data1) = 577, len(label1) = 577
#   data1.shape       --> (577, 2)
#   label1.shape      --> (577, 1)
#   data_train.shape  --> (403, 2)
#   data_test.shape   --> (174, 2)
#   label_train.shape --> (403, 1)
#   label_test.shape  --> (174, 1)
#   type: numpy.ndarray

# Plotting helper.
def figure(title, *datalist):
    """Draw every series in ``datalist`` on one chart and show it.

    Args:
        title: Text placed above the chart.
        *datalist: Indexable items whose element 0 holds the values to plot
            and whose element 1 holds the legend label.
    """
    for entry in datalist:
        values = entry[0]
        legend_text = entry[1]
        # First the connecting line (which carries the legend label) ...
        plt.plot(values, '-', label=legend_text, linewidth=2)
        # ... then the individual points as circle markers.
        plt.plot(values, 'o')
    plt.grid()
    plt.title(title)
    plt.legend()
    plt.show()

# Prediction routine 1: based on scikit-learn.
def predict1(x_train_data, y_train_data, x_test_data, y_test_data):
    """Fit scikit-learn's ``LinearRegression`` and plot error and predictions.

    Two charts are shown through ``figure``:
      1. the training MSE of a mean-only baseline next to the fitted model's
         training MSE;
      2. the test-set predictions against the test-set ground truth, with the
         training-set R^2 of the model in the title.

    Args:
        x_train_data: Training features.
        y_train_data: Training labels.
        x_test_data: Test features.
        y_test_data: Test labels.

    Returns:
        None. The function only produces plots.
    """
    # Current scikit-learn rejects np.matrix input, so work on plain ndarrays.
    x_train = np.asarray(x_train_data)
    y_train = np.asarray(y_train_data)
    x_test = np.asarray(x_test_data)
    y_test = np.asarray(y_test_data)

    reg = linear_model.LinearRegression()
    # Train the model.
    reg.fit(x_train, y_train)
    # Predictions for the test set and for the training set.
    test_pre = reg.predict(x_test)
    train_pre = reg.predict(x_train)

    # Baseline: predict the mean label for every training sample.
    baseline = np.full(y_train.shape, np.mean(y_train))
    train_error = [
        mean_squared_error(y_train, baseline),
        mean_squared_error(y_train, train_pre),
    ]
    # Error chart.
    figure('误差图 最终的MSE = %.4f' % (train_error[-1]), [train_error, 'error'])

    # Prediction-versus-truth chart. r2_score expects (y_true, y_pred); the
    # score is not symmetric, so the ground truth must come first. Plain '$'
    # delimiters are needed for matplotlib to typeset R^2 as math.
    figure('预测值与真实值图 模型的' + r'$R^2=%.4f$' % (r2_score(y_train, train_pre)),
           [test_pre, '预测值'],
           [y_test, '真实值'])

    # To inspect the fitted parameters:
    # print('线性回归的系数为:\n w = %s \n b = %s' % (reg.coef_, reg.intercept_))

# Hand-written linear regression model.
# Shapes noted by the original author for a 0.25 split of 577 rows:
#   training features (432, 2), training labels (432, 1).
class LinearRegression:
    """Linear regression trained by batch gradient descent or the closed form.

    The bias is folded into the weight vector by appending a constant column
    of ones to the features, so ``weights`` has shape (n_features + 1, 1) with
    the bias in the last row. All computations run on plain ndarrays, so both
    ``np.ndarray`` and ``np.matrix`` inputs are accepted.
    """

    def __init__(self, theta=0.2, iter_times=200000, error=1e-9):
        """Store the hyper-parameters.

        Args:
            theta: Step size of each gradient-descent update.
            iter_times: Maximum number of gradient-descent iterations.
            error: Early-stop threshold on the cost decrease between two
                consecutive iterations.
        """
        self.theta = theta
        self.iter_times = iter_times
        self.error = error

    def Trans(self, xdata):
        """Return ``xdata`` with a trailing column of ones.

        With y = w x + b this turns [W, B] . [X, 1] into a single product.

        Args:
            xdata: 2-D array-like of shape (n_samples, n_features).

        Returns:
            ndarray of shape (n_samples, n_features + 1).
        """
        features = np.asarray(xdata, dtype=float)
        ones = np.ones((len(features), 1))
        return np.append(features, ones, axis=1)

    def fit(self, xdata, ydata):
        """Estimate the weights by batch gradient descent on the MSE.

        NOTE(review): the ``def`` line of this method was missing from the
        original listing; the name ``fit`` is a reconstruction.

        Args:
            xdata: Training features, shape (n_samples, n_features).
            ydata: Training labels, shape (n_samples, 1) or (n_samples,).

        Returns:
            ``(weights, cost_function)``: the fitted weights as an ndarray of
            shape (n_features + 1, 1) and the list of MSE values, one per
            iteration performed.
        """
        x = self.Trans(xdata)
        # Force a column vector so that subtraction never broadcasts to (n, n).
        y = np.asarray(ydata, dtype=float).reshape(-1, 1)
        n_samples = len(x)

        self.weights = np.zeros((x.shape[1], 1))
        cost_function = []

        for _ in range(self.iter_times):
            residual = np.dot(x, self.weights) - y
            # Mean squared error of the current weights.
            cost_function.append(float(np.sum(residual ** 2)) / n_samples)

            # Gradient of the MSE with respect to the weights.
            grad = 2 * np.dot(x.T, residual) / n_samples
            self.weights = self.weights - self.theta * grad

            # Stop early once the cost no longer decreases by at least
            # `error`. A zero decrease counts as converged; an increase
            # (negative difference) does not.
            if len(cost_function) > 1:
                decrease = cost_function[-2] - cost_function[-1]
                if 0 <= decrease < self.error:
                    break
        return self.weights, cost_function

    def predict2(self, xdata):
        """Predict labels for ``xdata`` with the fitted weights.

        Args:
            xdata: Features of shape (n_samples, n_features).

        Returns:
            ndarray of shape (n_samples, 1).
        """
        return np.dot(self.Trans(xdata), self.weights)

    def getR(self, ydata_tr, ydata_pre):
        """Return the coefficient of determination R^2.

        Args:
            ydata_tr: Ground-truth labels.
            ydata_pre: Predicted labels, same number of elements.

        Returns:
            ``1 - SS_res / SS_tot`` as a float.
        """
        # Flatten both inputs so the element-wise difference cannot broadcast.
        truth = np.asarray(ydata_tr, dtype=float).ravel()
        predicted = np.asarray(ydata_pre, dtype=float).ravel()
        sum_error = np.sum((truth - truth.mean()) ** 2)
        inexplicable = np.sum((truth - predicted) ** 2)
        return 1 - inexplicable / sum_error

    def Formula(self, xdata, ydata):
        """Estimate the weights with the closed-form least-squares solution.

        This method was disabled (wrapped in a string) in the original
        listing; it is restored here on top of ``np.linalg.lstsq``.

        Args:
            xdata: Training features, shape (n_samples, n_features).
            ydata: Training labels, shape (n_samples, 1) or (n_samples,).

        Returns:
            ``(weights, cost)`` where ``cost`` holds two MSE values: the one
            of a mean-only predictor followed by the one of the fitted model.
            No iteration is needed, hence only two entries.
        """
        x = self.Trans(xdata)
        y = np.asarray(ydata, dtype=float).reshape(-1, 1)
        self.weights = np.linalg.lstsq(x, y, rcond=None)[0]
        y_predict = np.dot(x, self.weights)
        cost = [
            np.sum((y - np.mean(y)) ** 2) / len(x),
            np.sum((y_predict - y) ** 2) / len(x),
        ]
        return self.weights, cost


# Linear regression through scikit-learn (disabled in the original listing,
# restored here under its own name so both entry points can coexist).
def main_sklearn():
    """Load, scale and split the data, then run the scikit-learn pipeline."""
    # Work on ndarrays from the start; the loader returns np.matrix objects.
    data1, label1 = (np.asarray(part) for part in get_dataset2())
    data1 = data_process(data1)
    percent1 = 0.25
    data_train, data_test, label_train, label_test = divided1(data1, label1, percent1)

    predict1(data_train, label_train, data_test, label_test)


# Linear regression with the hand-written model.
def main():
    """Load, scale and split the data, then train and plot the custom model."""
    # Work on ndarrays from the start; the loader returns np.matrix objects.
    data1, label1 = (np.asarray(part) for part in get_dataset2())
    data1 = data_process(data1)
    percent1 = 0.25
    data_train, data_test, label_train, label_test = divided1(data1, label1, percent1)

    regressor = LinearRegression()
    # Train; the second return value is the per-iteration MSE history.
    _, cost_history = regressor.fit(data_train, label_train)
    # Predictions for the test set and for the training set.
    test_pre = regressor.predict2(data_test)
    train_pre = regressor.predict2(data_train)
    # Error chart.
    figure('误差图 最终的MSE = %.4f' % (cost_history[-1]), [cost_history, 'error'])
    # Prediction-versus-truth chart; plain '$' delimiters let matplotlib
    # typeset R^2 as math.
    figure('预测值与真实值图 模型的' + r'$R^2=%.4f$' % (regressor.getR(label_train, train_pre)),
           [test_pre, '预测值'],
           [label_test, '真实值'])


if __name__ == "__main__":
    main()

``````

TA 点赞

• 推荐
• 1
• 收藏
• 共同学习，写下你的评论

100积分直接送

0/150