1.目的:将不同文件夹下具有相同文件名称的文件进行合并,并保证文件中第一行标题不重复(按行整合)。
代码:
import pandas as pd
import os
def merge_data(id, base='path to Excel file/'):
    """Stack every Excel file named ``id`` found under ``base`` into one table.

    Each sub-folder of ``base`` is scanned. A file is selected when the part
    of its name before the first ``.`` equals ``id``. Selected files are read
    with ``pd.read_excel`` (which by default consumes the first row as the
    header, so titles are not repeated as data) and their rows are stacked
    in order.

    Args:
        id: File name without extension to look for in every sub-folder.
            (The name shadows the builtin but is kept for existing callers.)
        base: Root folder containing the sub-folders. Paths are built with
            ``os.path.join``, so a trailing slash is optional.

    Returns:
        A DataFrame holding the rows of all matching files with a fresh
        0..n-1 index, or an empty DataFrame when no file matched.
    """
    frames = []
    # Sorting makes the row order of the result independent of the
    # arbitrary order os.listdir returns.
    for folder in sorted(os.listdir(base)):
        folder_path = os.path.join(base, folder)
        # Stray files next to the sub-folders cannot be listed; skip them.
        if not os.path.isdir(folder_path):
            continue
        for file in sorted(os.listdir(folder_path)):
            if file.split('.')[0] == id:
                frames.append(pd.read_excel(os.path.join(folder_path, file)))

    # pd.concat raises on an empty list, so keep the "nothing found" result
    # an empty DataFrame.
    if not frames:
        return pd.DataFrame()
    # One concat replaces repeated DataFrame.append calls (removed in
    # pandas 2.0) and avoids re-copying the accumulated rows on every file.
    return pd.concat(frames, ignore_index=True)
if __name__ == '__main__':
    # Root folder whose sub-folders hold the Excel files to merge.
    # NOTE: merge_data reads from its own root-folder setting; keep it
    # pointing at the same place as source_root.
    source_root = 'path to excel file/'
    # Folder that receives one merged workbook per id.
    output_root = 'path to save file/'

    # A set keeps each id only once, however many sub-folders contain it.
    ids = set()
    for folder in os.listdir(source_root):
        folder_path = os.path.join(source_root, folder)
        # Stray files next to the sub-folders cannot be listed; skip them.
        if not os.path.isdir(folder_path):
            continue
        for file_name in os.listdir(folder_path):
            # The part before the first '.' is the stock code of the file.
            ids.add(file_name.split('.')[0])

    for stock_id in ids:
        merged = merge_data(stock_id)
        merged.to_excel(output_root + stock_id + '.xlsx')
2.更新:将两个不同文件夹下同名文件合并,并且要求合并后的文件包括两个sheet。
代码:
import pandas as pd
import os
from pandas import ExcelWriter
# Copy every Excel file found in the sub-folders of `base` into its own sheet
# of a single output workbook.

# Excel rejects sheet names longer than 31 characters.
MAX_SHEET_NAME = 31


def _sheet_name(folder, stem, taken):
    """Return a sheet name for folder/stem that is unique within ``taken``."""
    full_name = folder + '_' + stem
    candidate = full_name[:MAX_SHEET_NAME]
    counter = 1
    # Truncation can make two names equal; add a numeric suffix if so.
    while candidate in taken:
        suffix = '_' + str(counter)
        candidate = full_name[:MAX_SHEET_NAME - len(suffix)] + suffix
        counter += 1
    taken.add(candidate)
    return candidate


base = 'path to excel /'  # root folder; mind the trailing /
used_sheet_names = set()

# The context manager saves and closes the workbook on exit; the explicit
# ExcelWriter.save() call it replaces was removed in pandas 2.0.
with ExcelWriter("path to save output excel/output.xlsx") as writer:
    for folder in sorted(os.listdir(base)):
        folder_path = os.path.join(base, folder)
        # Stray files next to the sub-folders cannot be listed; skip them.
        if not os.path.isdir(folder_path):
            continue
        for file in sorted(os.listdir(folder_path)):
            df_excel = pd.read_excel(os.path.join(folder_path, file))
            f_short_name, _ = os.path.splitext(file)
            # Same-named files from different folders would otherwise share
            # one sheet name; prefixing the folder gives each its own sheet.
            sheet = _sheet_name(folder, f_short_name, used_sheet_names)
            df_excel.to_excel(writer, sheet_name=sheet, index=False)
3.更新:将两个不同文件夹同名的文件合并,按列合并成一个sheet
import pandas as pd
import numpy as np
import os
import glob
from os.path import join
#%%
# Merge same-named Excel files from two folders side by side (column-wise)
# and save each merged table as a CSV file.

# Number of leading rows taken from the file in the second folder.
HEIGHT_ROWS = 132


def _excel_files_by_stem(folder):
    """Map file name without extension to full path for each .xlsx in folder."""
    files = {}
    for path in glob.glob(os.path.join(folder, '*.xlsx')):
        stem, _ = os.path.splitext(os.path.basename(path))
        files[stem] = path
    return files


# Output folder.
outDir = os.path.abspath('save path')
# The two input folders.
imageDir1 = os.path.abspath('file path1')
imageDir2 = os.path.abspath('file path2')

files1 = _excel_files_by_stem(imageDir1)
files2 = _excel_files_by_stem(imageDir2)

# Only names present in both folders are merged.
for stem, dir1 in files1.items():
    dir2 = files2.get(stem)
    if dir2 is None:
        continue
    area = pd.read_excel(dir1)
    # Re-number the slice from 0 so its rows line up with `area`.
    height = pd.read_excel(dir2)[0:HEIGHT_ROWS].reset_index(drop=True)
    # ignore_index=True on axis=1 replaces the column labels with 0..k-1.
    data = pd.concat([area, height], axis=1, ignore_index=True)
    # Use the full stem (not the text before the first '.') so that
    # 'a.b.xlsx' and 'a.c.xlsx' do not both write 'a.csv'.
    data.to_csv(os.path.join(outDir, stem + '.csv'))
print('done!')
参考资料:https://blog.csdn.net/weixin_43668299/article/details/97807698
4.快速将一个文件夹下所有csv文件合并为一个文件:copy *.csv all.csv
注意:该命令(Windows 命令行)只是按原样拼接各文件内容,每个文件的表头都会保留在 all.csv 中,需要手动删除重复的表头。