-
# (10) Quantile values
# Percentile ranks reported for every analysed column.
quantile_points = (1, 5, 25, 50, 75, 95, 99)


def _without_missing(column):
    """Return the entries of `column` that are neither NaN nor in `missSet`.

    NaN is tested explicitly with `isna()`: `np.isin` compares by equality
    and NaN never equals itself, so the `np.nan` entry of `missSet` would
    match nothing and NaNs would leak into `np.percentile`, which then
    returns NaN for the whole column.
    """
    return column[~(column.isna() | column.isin(missSet))]


# Walk-through on the first column (results are not stored):
np.percentile(df.iloc[:, 0], quantile_points)  # raw values, missing markers included
_without_missing(df.iloc[:, 0])  # first column with missing markers removed
np.percentile(_without_missing(df.iloc[:, 0]), quantile_points)

# Percentiles of the first three columns, keyed by column name.
json_quantile = {}
for i, name in enumerate(df.iloc[:, 0:3].columns):
    print('the %d columns: %s' % (i, name))
    json_quantile[name] = np.percentile(_without_missing(df[name]), quantile_points)

# One row per analysed column, one column per requested percentile.
# Equivalent alternative: pd.DataFrame(json_quantile)[df.iloc[:, 0:3].columns].T
df_quantife = pd.DataFrame(json_quantile, columns=df.iloc[:, 0:3].columns).T
-
def _keep_valid(column):
    """Return the entries of `column` that `np.isin` does not find in `missSet`."""
    # NOTE(review): np.isin compares by equality, so NaN entries are not
    # matched by the np.nan in missSet -- confirm NaNs are handled as intended.
    flagged = np.isin(column, missSet)
    return column[~flagged]


# (8) Min values
# Walk-through on the first column (results are not stored):
np.min(df.iloc[:, 0])  # minimum with the missing markers still present
_keep_valid(df.iloc[:, 0])  # first column with the missing markers removed
np.min(_keep_valid(df.iloc[:, 0]))
# Minimum of each of the first three columns after removing missing markers.
df_min = df.iloc[:, 0:3].apply(lambda col: np.min(_keep_valid(col)))

# (9) Max values
np.max(df.iloc[:, 0])  # maximum with the missing markers still present
_keep_valid(df.iloc[:, 0])
np.max(_keep_valid(df.iloc[:, 0]))
# Maximum of each of the first three columns after removing missing markers.
df_max = df.iloc[:, 0:3].apply(lambda col: np.max(_keep_valid(col)))
-
def _mode_result(column):
    """Return `stats.mode` of `column` after dropping NaN and `missSet` values.

    NaN is removed with `isna()` because an equality-based membership test
    (such as `np.isin`) never matches NaN. The data is handed to SciPy as a
    plain NumPy array.
    """
    valid = column[~(column.isna() | column.isin(missSet))]
    return stats.mode(valid.to_numpy())


def _first(value):
    """Return the leading element of `value`, accepting scalars and arrays.

    Older SciPy releases return length-1 arrays from `stats.mode`; newer ones
    return scalars by default (keepdims=False), where `[0]` indexing raises.
    `np.atleast_1d` makes both shapes indexable.
    """
    return np.atleast_1d(value)[0]


# (6) Mode values: most frequent valid value in each of the first three columns.
df_mode = df.iloc[:, 0:3].apply(lambda x: _first(_mode_result(x)[0]))

# (7) Mode percentage: occurrences of the mode divided by the total row count
# (the denominator is all rows of df, including rows holding missing markers).
df_mode_count = df.iloc[:, 0:3].apply(lambda x: _first(_mode_result(x)[1]))
df_mode_perct = df_mode_count / df.shape[0]
-
def _without_missing(column):
    """Return the entries of `column` that are neither NaN nor in `missSet`.

    NaN is tested explicitly with `isna()`: `np.isin` compares by equality
    and NaN never equals itself, so the `np.nan` entry of `missSet` would
    match nothing and `np.median` would return NaN for any column holding NaN.
    """
    return column[~(column.isna() | column.isin(missSet))]


# (4) Mean values
# Walk-through on the first column (results are not stored):
np.mean(df.iloc[:, 0])  # mean before removing missing values is very low
_without_missing(df.iloc[:, 0])  # remove the missing values
np.mean(_without_missing(df.iloc[:, 0]))  # mean after removing missing values
# Mean of each of the first three columns, missing values excluded.
df_mean = df.iloc[:, 0:3].apply(lambda x: np.mean(_without_missing(x)))

# (5) Median values
np.median(df.iloc[:, 0])  # median with the missing markers still present
_without_missing(df.iloc[:, 0])
np.median(_without_missing(df.iloc[:, 0]))
# Median of each of the first three columns, missing values excluded.
df_media = df.iloc[:, 0:3].apply(lambda x: np.median(_without_missing(x)))
-
## 1. Basic analysis ##

# (1) Missing values: the values the later statistics treat as "missing".
missSet = [np.nan, 9999999999, -999999]

# (2) Count distinct
# Walk-through on the first column (result is not stored); NaN counts as a value.
df.iloc[:, 0].nunique(dropna=False)
# Number of distinct values in each of the first three columns.
count_un = df.iloc[:, 0:3].apply(lambda col: col.nunique(dropna=False))

# (3) Zero values
# Walk-through on the first column (result is not stored).
(df.iloc[:, 0] == 0).sum()
# Number of entries equal to 0 in each of the first three columns.
count_zero = (df.iloc[:, 0:3] == 0).sum()
-
## 0. Read data ##

# Load the training table from the working directory.
_raw = pd.read_csv('train.csv')

# Keep the TARGET column on its own, then analyse the remaining columns
# with the ID and TARGET columns removed.
label = _raw['TARGET']
df = _raw.drop(columns=['ID', 'TARGET'])
-
完整代码查看全部
举报
0/150
提交
取消