比如我有一份票房数据:
类型 | 票房 |
---|---|
剧情, 灾难 | 2913118 |
战争, 历史 | 3094524 |
剧情, 喜剧 | 3099961 |
剧情 | 3176119 |
根据已知的票房信息,我想知道不同类型的片子能抢占多少比例的总票房(一部影片属于多个类型时,其票房会分别计入每个类型),实现代码如下:
import pandas as pd
def _split_types(raw, sep=','):
    """Split a delimited type string into clean, non-empty type names.

    Surrounding whitespace is stripped from every token and empty tokens
    (e.g. produced by a trailing separator) are dropped. Non-string values
    such as NaN are treated as carrying no types.
    """
    if not isinstance(raw, str):
        return []
    return [name for name in (part.strip() for part in raw.split(sep)) if name]


def split_and_sum(dataframe: pd.DataFrame, column: str, sum_column: str) -> pd.Series:
    """Score each row by the combined share of the types it belongs to.

    Every row's ``sum_column`` value is credited in full to each type listed
    in its comma-separated ``column`` value. Each type's share is its
    accumulated total divided by the sum of all type totals, so the shares
    of all types add up to 1. The value returned for a row is the sum of the
    shares of that row's types.

    Args:
        dataframe: Input table; it is not modified.
        column: Name of the column holding comma-separated type names.
        sum_column: Name of the numeric column to distribute over the types.

    Returns:
        A float Series aligned with ``dataframe.index``. Rows without any
        valid type get ``0.0``.
    """
    # Parse once and reuse for both the accumulation and the scoring pass.
    types_per_row = [_split_types(raw) for raw in dataframe[column]]

    # Accumulate per-type totals in a plain dict; no placeholder row needed.
    totals = {}
    for names, amount in zip(types_per_row, dataframe[sum_column]):
        for name in names:
            totals[name] = totals.get(name, 0) + amount

    grand_total = sum(totals.values())
    if not grand_total:
        # Nothing to distribute (no types, or all amounts are zero):
        # avoid dividing by zero and report a zero share for every type.
        shares = dict.fromkeys(totals, 0.0)
    else:
        shares = {name: value / grand_total for name, value in totals.items()}

    values = [float(sum(shares[name] for name in names)) for names in types_per_row]
    # Build the Series explicitly so an empty frame still yields a Series.
    return pd.Series(values, index=dataframe.index, dtype=float)
if __name__ == '__main__':
    # Demo: build the sample box-office table column by column and attach
    # the combined type share of every film as a new 'type_value' column.
    df: pd.DataFrame = pd.DataFrame({
        'type': ['剧情, 灾难', '战争, 历史', '剧情, 喜剧', '剧情,'],
        'box_office': [2913118, 3094524, 3099961, 3176119],
    })
    df['type_value'] = split_and_sum(df, 'type', 'box_office')