
A Pandas-based Algorithm for Adding Column Names to a Batch of Files

This code, combined with the batch-file merging from the previous post, lets the pipeline first add column names to each feature data set and then merge the files in bulk, giving you an automated preprocessing step 😎 (a minimal sketch of the merge step follows the complete code below).

Complete code:

import os
import traceback  # used in get_data_set but missing from the original imports
import pandas as pd


# Collect the names of all files under path/data
def get_data_set(path):
    data_directory = path + "/data"
    data_files = []
    files = os.listdir(data_directory)
    files.sort()  # os.listdir returns names in arbitrary order
    for f in files:
        try:
            data_files.append(f)
        except Exception:
            msg = traceback.format_exc()
            print(msg)
            print('Warning: unknown file', f)

    return data_files  # return a list of file names
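
# Example (hypothetical layout): if path/data holds d1.csv ... d5.csv,
# get_data_set(path) returns ['d1.csv', 'd2.csv', 'd3.csv', 'd4.csv', 'd5.csv'].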

# Read a file by name, choosing the parser from its extension
def data_read(path, file_name):
    data_file_path = path + "/data/" + file_name
    name, extension = file_name.rsplit(".", 1)  # split on the last dot only
    if extension == "csv":
        try:
            data = pd.read_csv(data_file_path, encoding="gbk", header=None)
        except UnicodeDecodeError:
            data = pd.read_csv(data_file_path, encoding="utf-8", header=None)
    elif extension == "txt":
        try:
            data = pd.read_csv(data_file_path, encoding="gbk", header=None, sep="\t")
        except UnicodeDecodeError:
            data = pd.read_csv(data_file_path, encoding="utf-8", header=None, sep="\t")
    else:
        # header=None for consistency with the CSV branches
        # (assumption: the Excel files also lack a header row)
        data = pd.read_excel(data_file_path, header=None)

    return data
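
# Note: the gbk-then-utf-8 fallback covers the two encodings these files are
# assumed to use; a file in any other encoding would still raise an error.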


# Drop the unused columns and overwrite the original files
def data_clean(path):
    data_files = get_data_set(path)
    # First N-1 files: drop columns 0, 1, 2, 3, 5, 6, 7, 8
    for i in range(len(data_files) - 1):
        d1 = data_read(path, data_files[i])
        d1 = d1.drop([0, 1, 2, 3, 5, 6, 7, 8], axis=1)
        f1_name = data_files[i]
        print(f1_name)
        d1.to_csv(path + "/data/" + f1_name, header=False, index=False)
    # Nth (last) file: drop columns 0, 1, 2, 4, 5, 6, 7
    d2 = data_read(path, data_files[-1])
    d2 = d2.drop([0, 1, 2, 4, 5, 6, 7], axis=1)
    f2_name = data_files[-1]
    print(f2_name)
    d2.to_csv(path + "/data/" + f2_name, header=False, index=False)
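
# Note: with header=None, pandas labels columns with the integers 0..n-1,
# so the lists passed to drop() name absolute column positions, and
# drop(..., axis=1) removes columns (axis=0 would remove rows).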

# Configuration string (despite the name, this is a string, not a dict):
# "file_name:col1,col2,..." entries separated by "|". Implicit string
# concatenation avoids the stray spaces the old backslash continuations
# embedded before each "|".
File_columns_dict = (
    "d1.csv:ID,x1_class,x2_class,x3_class,x4_class,x5_num,x6_num,x7_num,x8_class"
    "|d2.csv:ID,x9_class,x10_class,x11_class,x12_class,x13_num,x14_num,x15_num,x16_class,x17_num"
    "|d3.csv:ID,x18_class,x19_class,x20_class,x21_class,x22_num,x23_num,x24_num,x25_class,x26_num,x27_num"
    "|d4.csv:ID,x28_class,x29_class,x30_class,x31_class,x32_num,x33_num,x34_num,x35_class,x36_num,x37_num,x38_class,x39_num"
    "|d5.csv:ID,x40_class,x41_class,x42_class,x43_class,x44_num,x45_num,x46_num,x47_class,x48_num,x49_num,x50_class,x51_num,x52_num"
)

# Parse the config string into a dict: {file_name: [column names]}
def get_all_factor_dict(config_str):
    columns_name_dict = {}
    config_items = config_str.split('|')
    for config_item in config_items:
        File_columns_key, File_columns_values = config_item.split(':')
        # strip() guards against stray whitespace around any name
        name_list = [v.strip() for v in File_columns_values.split(',')]
        columns_name_dict[File_columns_key.strip()] = name_list

    return columns_name_dict

"""
预计输出结果:
输出一个字典:
{
  'd1.csv':['ID','x1_class','x2_class','x3_class','x4_class','x5_num','x6_num','x7_num','x8_class'],
  'd2.csv':['ID','x9_class','x10_class','x11_class','x12_class','x13_num','x14_num','x15_num','x16_class','x17_num'],
  'd3.csv':['ID','x18_class','x19_class','x20_class','x21_class','x22_num','x23_num','x24_num','x25_class','x26_num','x27_num'],
  'd4.csv':['ID','x28_class','x29_class','x30_class','x31_class','x32_num','x33_num','x34_num','x35_class','x36_num','x37_num','x38_class','x39_num'],
  'd5.csv':['ID','x40_class','x41_class','x42_class','x43_class','x44_num','x45_num','x46_num','x47_class','x48_num','x49_num','x50_class','x51_num','x52_num']
}
"""

# Attach the column names to each file and overwrite the original file
def data_set_columns(path, file_columns_dict):
    data_files = get_data_set(path)
    for file_name in data_files:
        data_file = data_read(path, file_name)
        file_column = file_columns_dict[file_name]
        data_file.columns = file_column
        data_file.to_csv(path + "/data/" + file_name, index=False)
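
# Note: pandas raises "ValueError: Length mismatch" if a name list does not
# match a file's column count, which flags files data_clean() never processed.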

        
def main():
    path = "E:/AnaLinReg/BFM/code_test/DataSet1"
    # data_clean(path)  # run once first to drop the unused raw columns
    file_columns_dict = get_all_factor_dict(File_columns_dict)
    data_set_columns(path, file_columns_dict)
    print("Done")


if __name__ == "__main__":
    main()
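
The merge step from the previous post is not repeated here; the following is a minimal sketch of how the labeled files could be combined, assuming every file shares only the ID column as a join key (merge_labeled_files and the inner-join strategy are illustrative, not the previous post's exact code):

# Minimal sketch of the follow-up batch merge (assumption: all files share
# an "ID" column; illustrative only, not the previous post's exact code).
import pandas as pd

def merge_labeled_files(path, data_files):
    merged = None
    for file_name in data_files:
        df = pd.read_csv(path + "/data/" + file_name)
        # successive inner joins on ID; column names no longer collide
        # because each file now carries its own x*_class / x*_num labels
        merged = df if merged is None else merged.merge(df, on="ID")
    return merged

Because the column-naming step guarantees unique feature names across files, each merge adds new columns without pandas having to suffix duplicates.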
Sample files and results

Original file: [screenshot]
After the code runs: [screenshot]
The list of files to be preprocessed: [screenshot]
