为了账号安全,请及时绑定邮箱和手机。立即绑定

通用机器学习算法:线性回归+决策树+XGBoost

2019.05.08 19:29 641浏览
import os
import pandas as pd 
import numpy  as np 

def train_data_reads(path):
    """Load one data file found under ``<path>/data`` into a DataFrame.

    The reader is chosen from the file extension (case-insensitive):
    ``.csv`` is parsed as comma-separated text, ``.txt`` as tab-separated
    text, and anything else is handed to ``pandas.read_excel``. Text files
    are decoded as GBK first, falling back to UTF-8 when GBK decoding fails.

    Args:
        path: Directory that contains a ``data`` sub-directory.

    Returns:
        pandas.DataFrame holding the contents of the selected file.

    Raises:
        IndexError: If the ``data`` directory contains no entries.
    """
    data_directory = path + "/data"
    # os.listdir returns entries in arbitrary order; sorting makes the
    # choice of file reproducible from run to run.
    file_name = sorted(os.listdir(data_directory))[0]
    data_path = data_directory + "/" + file_name
    # splitext copes with names that contain several dots (or none), where
    # unpacking file_name.split(".") into two names raised ValueError.
    extension = os.path.splitext(file_name)[1].lower()

    separators = {".csv": ",", ".txt": "\t"}
    if extension not in separators:
        return pd.read_excel(data_path)

    separator = separators[extension]
    try:
        return pd.read_csv(data_path, encoding="gbk", sep=separator)
    except UnicodeDecodeError:
        # Not valid GBK: retry with UTF-8.
        return pd.read_csv(data_path, encoding="utf-8", sep=separator)

def train_data_reprocess(data):
    """Return *data* with duplicate rows removed and the index renumbered.

    Args:
        data: pandas.DataFrame to clean.

    Returns:
        A new DataFrame without duplicate rows, indexed from 0 upward.
    """
    # Drop repeated rows, then discard the old index labels.
    return data.drop_duplicates().reset_index(drop=True)

def feature_label_split(data):
    """Split a DataFrame into a feature frame and a label series.

    The last column is taken as the label, and rows whose label is missing
    are dropped. Of the remaining columns, only those whose name contains
    "class" or "num" (not at position 0) are kept as features. Missing
    values in "class" columns are replaced by the string "missing"; missing
    values in "num" columns are replaced by that column's mean.

    Args:
        data: pandas.DataFrame whose last column is the label. An "ID"
            column, if present, is discarded.

    Returns:
        (new_x, y): the filled feature DataFrame ("class" columns first,
        then "num" columns) and the label Series.
    """
    label_name = data.columns[-1]

    # Drop rows with a missing label. notna() works for every dtype,
    # whereas np.isnan raises on object columns.
    data = data[data[label_name].notna()]

    # Separate features from the label; tolerate a frame without "ID".
    x = data.drop(columns=[label_name]).drop(columns=["ID"], errors="ignore")
    y = data[label_name]

    # Fill missing feature values.
    feature_names = x.columns.tolist()
    class_names = [name for name in feature_names if name.find("class") > 0]
    num_names = [name for name in feature_names if name.find("num") > 0]

    class_filled = x[class_names].fillna("missing")
    # Average only the numeric feature columns: taking the mean of the whole
    # frame fails on pandas >= 2.0 when it also holds string columns.
    num_features = x[num_names]
    num_filled = num_features.fillna(num_features.mean())

    new_x = pd.concat([class_filled, num_filled], axis=1)
    return new_x, y
	
# Convert categorical features into dummy variables.
def dummy_variable_transform(x):
    """Replace every ``<name>_class`` column with dummy indicator columns.

    A column is treated as categorical when the text after its first "_" is
    exactly "class". Each such column is removed and its indicator columns,
    prefixed with ``<name>``, are appended on the right; the first category
    level is dropped (``drop_first=True``). All other columns are left
    untouched.

    Args:
        x: pandas.DataFrame of features.

    Returns:
        A DataFrame with the categorical columns expanded.
    """
    for feature_name in x.columns.tolist():
        # partition never raises: a name without "_" yields an empty type
        # and is skipped instead of failing with IndexError.
        prefix, _, feature_type = feature_name.partition("_")
        if feature_type != "class":
            continue
        dummy_class = pd.get_dummies(x[feature_name], prefix=prefix, drop_first=True)
        x = x.drop(feature_name, axis=1).join(dummy_class)
    return x

# Min-max normalise the feature set X.
# Linear regression is sensitive to the largest and smallest values; consider
# whether standardisation or min-max normalisation suits the data better.
def data_normalization(x):
    """Rescale every feature column of *x* with a (0, 1) min-max scaler.

    The scaler is fitted on *x* itself and then applied to it.

    Args:
        x: Numeric feature matrix accepted by scikit-learn's MinMaxScaler.

    Returns:
        The transformed data as returned by ``MinMaxScaler.transform``.
    """
    from sklearn.preprocessing import MinMaxScaler

    scaler = MinMaxScaler(feature_range=(0, 1))
    return scaler.fit_transform(x)

# Split the data into a training set and a test set.
def train_test_div(x, y, percent):
    """Split features and labels into training and test parts.

    Args:
        x: Feature matrix.
        y: Labels aligned with *x*.
        percent: Passed to ``train_test_split`` as ``test_size``.

    Returns:
        Tuple ``(x_train, x_test, y_train, y_test)``.
    """
    from sklearn.model_selection import train_test_split

    # train_test_split shuffles the rows first and then splits them.
    return tuple(train_test_split(x, y, test_size=percent))


# 1. Linear regression prediction
def lin_predict(x_train, x_test, y_train, y_test):
    """Fit a LinearRegression model and score it on the test split.

    Predictions that are not >= 0 are replaced by 0 before scoring
    (presumably because the target is non-negative; confirm against the data).

    Args:
        x_train: Training features.
        x_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        (MSE, R2): despite its historical name, the first value is the
        root-mean-squared error on the test split; the second is the R^2
        score from ``sklearn.metrics.r2_score``.
    """
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import r2_score

    linreg = LinearRegression()
    linreg.fit(x_train, y_train)

    y_pred = np.asarray(linreg.predict(x_test))
    # Keep values >= 0 and map everything else (including NaN) to 0.
    y_pred = np.where(y_pred >= 0, y_pred, 0)
    # Root-mean-squared error is used as the result.
    rmse = np.sqrt(np.mean((np.asarray(y_test) - y_pred) ** 2))
    r2 = r2_score(y_test, y_pred)
    return rmse, r2
	
# 2. Decision tree prediction
# Author's note: a decision tree does not need the variables converted to
# dummy variables.
# NOTE(review): scikit-learn trees still expect numeric input; confirm how
# categorical columns are encoded before calling this.
def tree_predict(x_train, x_test, y_train, y_test):
    """Fit a depth-5 DecisionTreeRegressor and score it on the test split.

    Predictions that are not >= 0 are replaced by 0 before scoring
    (presumably because the target is non-negative; confirm against the data).

    Args:
        x_train: Training features.
        x_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        (MSE, R2): despite its historical name, the first value is the
        root-mean-squared error on the test split; the second is the R^2
        score from ``sklearn.metrics.r2_score``.
    """
    # r2_score must be imported here: it is not available at module level.
    from sklearn.metrics import r2_score
    from sklearn.tree import DecisionTreeRegressor

    reg = DecisionTreeRegressor(max_depth=5)
    reg.fit(x_train, y_train)

    y_pred = np.asarray(reg.predict(x_test))
    # Keep values >= 0 and map everything else (including NaN) to 0.
    y_pred = np.where(y_pred >= 0, y_pred, 0)
    # Root-mean-squared error is used as the result.
    rmse = np.sqrt(np.mean((np.asarray(y_test) - y_pred) ** 2))
    r2 = r2_score(y_test, y_pred)
    return rmse, r2

# 3. XGBoost regression
# Author's note: xgboost does not need the variables converted to dummy
# variables.
# NOTE(review): confirm how categorical columns are encoded before calling
# this; XGBRegressor is given no categorical-handling options here.
def xgb_predict(x_train, x_test, y_train, y_test):
    """Fit an XGBRegressor and score it on the test split.

    The model uses learning_rate=0.05, max_depth=5 and n_estimators=500.
    Predictions that are not >= 0 are replaced by 0 before scoring
    (presumably because the target is non-negative; confirm against the data).

    Args:
        x_train: Training features.
        x_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        (MSE, R2): despite its historical name, the first value is the
        root-mean-squared error on the test split; the second is the R^2
        score from ``sklearn.metrics.r2_score``.
    """
    # r2_score must be imported here: it is not available at module level.
    from sklearn.metrics import r2_score
    from xgboost import XGBRegressor

    reg = XGBRegressor(learning_rate=0.05, max_depth=5, n_estimators=500)
    reg.fit(x_train, y_train)

    y_pred = np.asarray(reg.predict(x_test))
    # Keep values >= 0 and map everything else (including NaN) to 0.
    y_pred = np.where(y_pred >= 0, y_pred, 0)
    # Root-mean-squared error is used as the result.
    rmse = np.sqrt(np.mean((np.asarray(y_test) - y_pred) ** 2))
    r2 = r2_score(y_test, y_pred)
    return rmse, r2


def main(path="E:/AnaLinReg/Data"):
    """Run the pipeline end to end and print the linear-regression scores.

    Steps: load the data file, drop duplicate rows, split features from the
    label, expand categorical columns into dummy variables, min-max scale
    the features, hold out 30% of the rows as a test set, then fit and score
    a linear regression.

    Args:
        path: Directory handed to ``train_data_reads``, which reads from
            its ``data`` sub-directory. Defaults to the original
            hard-coded location.
    """
    data = train_data_reads(path)
    data = train_data_reprocess(data)
    x, y = feature_label_split(data)
    x = dummy_variable_transform(x)
    x = data_normalization(x)
    # Pass the prepared x / y; 0.3 is the test-set share.
    x_train, x_test, y_train, y_test = train_test_div(x, y, 0.3)
    MSE, R2 = lin_predict(x_train, x_test, y_train, y_test)
    print(MSE)
    print(R2)


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
	main()
点击查看更多内容
0人点赞

若觉得本文不错,就分享一下吧!

评论

相关文章推荐

正在加载中
意见反馈 邀请有奖 帮助中心 APP下载
官方微信

举报

0/150
提交
取消