[toc]
python 表格操作
将awk生成的数据导入表格
pandas :pannel data analysis(面板数据分析)。pandas是基于numpy构建的,为时间序列分析提供了很好的支持。pandas中有两个主要的数据结构,一个是Series,另一个是DataFrame。
Series 类似于一维数组与字典(map)数据结构的结合。它由一组数据和一组与数据相对应的数据标签(索引index)组成。这组数据和索引标签的基础都是一个一维ndarray数组。可将index索引理解为行索引。 Series的表现形式为:索引在左,数据在右。
DataFrame是一个类似表格的数据结构,索引包括列索引和行索引,包含有一组有序的列,每列可以是不同的值类型(数值、字符串、布尔值等)。DataFrame的每一行和每一列都是一个Series,这个Series的name属性为当前的行索引名/列索引名。
#!/usr/bin/env python # -*- coding: utf-8 -*- import openpyxl as xl import pandas as pd import numpy as np import os import datetime cycle=38 time_year_month_day = str(datetime.date.today()-datetime.timedelta(days=cycle)) def proj_xls_to_xlsx(file_path,sheet_name,save_name,tableTitle=None): with open(file_path) as f: data = f.readlines() if not os.path.exists(save_name): wb = xl.Workbook() wb.save(save_name) else: wb = xl.load_workbook(save_name) ws1 = wb.create_sheet(0) ws1.title = sheet_name if tableTitle != None: for n in range(len(tableTitle)): c = n + 1 ws1.cell(row=1, column=c).value = tableTitle[n] for d in data: value = d.split('\t') try: # print(value) value[5]=float(value[5]) except: pass ws1.append(value) wb.save(save_name) def crate_shuju_toushi(save_name,sheet_name,sheet_name2,index_name,values_name,department_name,new_department_name,personnel_name): f = pd.read_excel(io=save_name,sheet_name=sheet_name) res = pd.pivot_table(f,index=[index_name],values=[values_name],aggfunc=np.sum, margins=True) wb = xl.load_workbook(save_name) old_title = wb.worksheets[0] old_title.title = sheet_name2 all_xinxi_title=list(zip(list((res.index)),list(res[values_name]))) department_personnel = list(zip(f[department_name],f[index_name])) department_personnel_size_list = [] for user in all_xinxi_title: for department_personnel_list in department_personnel: if user[0] in department_personnel_list: if user[0] == '-': continue if user[0] == ' ': continue department_personnel_size_list.append((department_personnel_list[0],user[0],user[1])) end_department_personnel_size_list = sorted(list(set(department_personnel_size_list)), key=lambda x:x[2] ,reverse=True) all_xinxi_title_end = all_xinxi_title[-1] all_xinxi_title_end = list(all_xinxi_title_end) all_xinxi_title_end.insert(0,'') end_department_personnel_size_list.append(all_xinxi_title_end) end_department_personnel_size_list.insert(0,[new_department_name,personnel_name,values_name]) end_department_personnel_size_list.pop() user_volue = [] for user2 in end_department_personnel_size_list: user2 = list(user2) if user2[1] == user_volue: continue user_volue = user2[1] old_title.append(user2) wb.save(save_name) def create_space_toushi(save_name,sheet_name,sheet_name2,department_num,sum_name,data_total,index_name,values_name): f = pd.read_excel(io=save_name,sheet_name=sheet_name,dtype={'业务XX利润编号':str}) res = pd.pivot_table(f,index=index_name,values=values_name,aggfunc=np.sum,margins=True) all_list = list(zip(list(res.index),list(res[values_name]))) wb = xl.load_workbook(save_name) sheet = wb.create_sheet(sheet_name2) space_list = [] for user in all_list: user2 = list(user)[1] user3 = list(list(user)[0]) if user3[0] == '-': space_list.append((user3[1],user2)) if user3[0] == ' ': space_list.append((user3[1],user2)) sum_list = [] for slist in space_list: sum_list.append(slist[1]) sum_list=sum(sum_list) sort_space_list = sorted(list(set(space_list)), key=lambda x:x[1] ,reverse=True) sort_space_list.append((sum_name, sum_list)) sort_space_list.insert(0,(department_num,data_total)) for all_space_list in sort_space_list: sheet.append(all_space_list) wb.save(save_name) if __name__=='__main__': sheet_name = 'XXX详情' save_name = 'TJ-XJ-DATASHOW-' + time_year_month_day + '.xlsx' save_name = 'TJ-XJ-DATASHOW-' + time_year_month_day + '.xlsx' file_path = './able.' + time_year_month_day +'.lib.xls' proj_xls_to_xlsx(file_path=file_path, sheet_name=sheet_name, save_name=save_name) sheet_name = 'XXX总体情况' save_name = 'TJ-XJ-DATASHOW-' + time_year_month_day + '.xlsx' file_path = './' + time_year_month_day + '.ProjInfo.XJpath.xls' table_title = ['业务XX名称','业务XX利润编号', '运营经理','信息分析','涉及项目数', '涉及数据量大小(G)','项目编号1', '项目名称1','数据量(G)1', '文库编号1','项目编号2', '项目名称2','数据量(G)2', '文库编号2','项目编号3', '项目名称3','数据量(G)3', '文库编号3','项目编号4', '项目名称4 数据量(G)4','文库编号4',] proj_xls_to_xlsx(file_path=file_path, sheet_name=sheet_name, tableTitle=table_title, save_name=save_name) save_name = 'TJ-XJ-DATASHOW-' + time_year_month_day + '.xlsx' sheet_name = 'XXX总体情况' sheet_name2 = 'XXX概况' index_name = '信息分析' values_name = '涉及数据量大小(G)' department_name = '业务XX名称' new_department_name = 'XX' personnel_name = '人员' crate_shuju_toushi(save_name=save_name, sheet_name=sheet_name, sheet_name2=sheet_name2, index_name=index_name, values_name=values_name, department_name=department_name, new_department_name=new_department_name, personnel_name=personnel_name) index_name = ['信息分析','业务XX利润编号'] save_name = 'TJ-XJ-DATASHOW-' + time_year_month_day + '.xlsx' sheet_name = 'XXX总体情况' department_num = 'XX编号' data_total = '数据量(G)' values_name = '涉及数据量大小(G)' sheet_name2 = '未匹配人员数据量' sum_name = 'ALL' create_space_toushi(index_name=index_name, save_name=save_name, sheet_name=sheet_name, department_num=department_num, data_total=data_total, values_name=values_name, sheet_name2=sheet_name2, sum_name=sum_name,)
完成数据透视,
#!/usr/bin/env python # -*- coding: utf-8 -*- import pandas as pd import openpyxl as xl import datetime difference_cycle=33 time_year_month_day = str(datetime.date.today()-datetime.timedelta(days=difference_cycle)) def add_last_excel(old_sheet,new_sheet,gaikuang): pd01 = pd.read_excel(old_sheet,sheet_name=gaikuang,encoding='utf-8') pd02 = pd.read_excel(new_sheet,sheet_name=gaikuang,encoding='utf-8') pd11 = pd.read_excel(old_sheet,sheet_name='未匹配人员数据量',encoding='utf-8') pd12 = pd.read_excel(new_sheet,sheet_name='未匹配人员数据量',encoding='utf-8') result = pd.merge(pd01,pd02[['人员','涉及数据量大小(G)']],on='人员') result_list = list(zip(list((result['XXX'])),list((result['人员'])),list((result['涉及数据量大小(G)_x'])),list((result['涉及数据量大小(G)_y'])))) not_match = pd.merge(pd11,pd12[['XXX编号','数据量(G)']],left_on='XXX编号',right_on='XXX编号') print(not_match) not_match_list = list(zip(list((not_match['XXX编号'])),list((not_match['数据量(G)_x'])),list((not_match['数据量(G)_y'])))) wb2 = xl.load_workbook(new_sheet) remove_sheet1 = wb2[gaikuang] remove_sheet2 = wb2['未匹配人员数据量'] wb2.remove(remove_sheet1) wb2.remove(remove_sheet2) wb2.save(new_sheet) wb2 = xl.load_workbook(new_sheet) sheet21 = wb2.create_sheet(gaikuang,0) sheet22 = wb2.create_sheet('未匹配人员数据量') result_head = ['XXX','人员','涉及数据量(G)','第二次涉及数据量(G)','任务额差'] result_list.insert(0,result_head) result_for_num = 1 for i in result_list: result_i = list(i) if result_for_num != 1: result_chae=i[2]-i[3]-i[2]*0.2 result_i.append(float('%.2f'% result_chae)) result_for_num = result_for_num + 1 sheet21.append(result_i) not_match_head = ['XXX编号','数据量(G)','第二次数据量(G)','任务额差'] not_match_list.insert(0,not_match_head) not_match_for_num = 1 for j in not_match_list: not_match_j = list(j) if not_match_for_num != 1: not_match_chae=j[1]-j[2]-j[1]*0.2 not_match_j.append(float('%.2f'% not_match_chae)) not_match_for_num = not_match_for_num + 1 sheet22.append(not_match_j) wb2.save(new_sheet) if __name__ == '__main__': old_sheet='TJ-XJ-DATASHOW-' + time_year_month_day + '-old.xlsx' new_sheet='TJ-XJ-DATASHOW-' + time_year_month_day + '.xlsx' gaikuang='XXX概况' add_last_excel(old_sheet=old_sheet,new_sheet=new_sheet,gaikuang=gaikuang)