青年千人数据分析

序章

本文是对《对青年千人表格信息的可视化探索》的重复,作者(天地本无心大神)
https://mp.weixin.qq.com/s/GrVyv3RocrzJijVx7Ymr6Q

刚开始学习python的我,会跑知乎社区看看别人比较R与python那个才是最好的分析软件,现在当我看到天地本无心大神拿着两把利器游刃有余的分析时,才顿悟:花时间去比较谁强谁弱的时候,早已有大神点满了这两项技能。

1. 数据下载

https://github.com/wandering513/Data_workshop/tree/master/1000_youth_talent

2. python 实战

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import matplotlib

# 用pandas 读取excel
qian = pd.read_excel("./1000_youth_talent.xlsx",header=0, skiprows=1)
# 自己加表头
qian.columns = ["ID","Name","Gender","year-month-day","year_birth","university","major"]
print(qian.head())

《青年千人数据分析》 image.png

2.1 看一下男女分布
  • 本届千人共630人,男549,女81
sex =qian.groupby("Gender").size()
#调节图形大小,宽,高
plt.figure(figsize=(6,9))
#定义饼状图的标签,标签是列表
labels = ['Men','Female']
#每个标签占多大,会自动去算百分比
sizes = [sex['男'],sex['女']]
colors = ['yellowgreen','red']
#将某部分爆炸出来, 使用括号,将第一块分割出来,数值的大小是分割出来的与其他两块的间隙
explode = (0.05,0)

patches,l_text,p_text = plt.pie(sizes,explode=explode,labels=labels,colors=colors,labeldistance = 1.1,autopct = '%3.1f%%',shadow = False,startangle = 90,pctdistance = 0.6)

#改变文本的大小
#方法是把每一个text遍历。调用set_size方法设置它的属性
for t in l_text:
    t.set_size=(30)
for t in p_text:
    t.set_size=(20)
# 设置x,y轴刻度一致,这样饼图才能是圆的
plt.axis('equal')
plt.legend()
plt.show()

《青年千人数据分析》 image.png

2.2 看一下年龄分布
  • 大神dataframe 用的出神入化了。。。
#Code to get the age distribution of male and female
# 抽取男女的dataframe
qian_male = qian[qian["Gender"] == u"男"]
qian_female = qian[qian["Gender"] == u"女"]
# 算出男女的年龄,设为index
qian_male_age = pd.Series(2017 - np.array(qian_male["year_birth"])).value_counts().sort_index()
qian_female_age = pd.Series(2017 - np.array(qian_female["year_birth"])).value_counts().sort_index()
# 统计每个年龄出现的人数
qian_male_age = pd.DataFrame(qian_male_age).reset_index(); qian_male_age.columns = ["age","male_num"]
qian_female_age = pd.DataFrame(qian_female_age).reset_index(); qian_female_age.columns = ["age","female_num"]
# 合并
age_count = qian_male_age.merge(qian_female_age,how="outer")
age_count = age_count.fillna(0); age_count["female_num"] = age_count["female_num"].astype("int")

# Code to draw plot
font = {'family': 'serif', 'weight': 'normal', 'size': 14}
matplotlib.rc('font', **font)
plt.rcParams['figure.figsize'] = (15, 9)
bar_labels = age_count["age"].map(lambda x: str(x) + "-year-old").values
male = age_count["male_num"].values
female = age_count["female_num"].values
minus_female = [-int(i) for i in female]

X = np.arange(16)
for x, y in zip(X, male):
    plt.text(x + 0.1, y + 0.08, '%s' % y, ha='center', va="bottom")
for x, y in zip(X, minus_female):
    plt.text(x + 0.1, y - 0.08, "%s" % -y, ha='center', va='top')

plt.bar(X, male, facecolor="#9999ff", edgecolor="white", label="Male 549")
plt.bar(X, minus_female, facecolor="#ff9999", edgecolor="white", label="Female 81")
x_pos = [x + 0.1 for x in X]
plt.xticks(x_pos, bar_labels, fontsize=15)
plt.setp(plt.gca().get_xticklabels(), rotation=45, horizontalalignment="right")

plt.setp(plt.gca().spines['right'].set_color('none'))
plt.setp(plt.gca().spines['top'].set_color('none'))
plt.setp(plt.gca().xaxis.set_ticks_position('bottom'))
plt.setp(plt.gca().yaxis.set_ticks_position('left'))
plt.yticks(plt.yticks()[0], [int(i) for i in abs((plt.yticks()[0]))])
plt.legend()
plt.show()

《青年千人数据分析》 image.png

2.3 看一下出生月数分布
# Code to get the month distribution matrix in each year
# 提取月份
qian["month"] = pd.Series([array[1] for array in qian["year-month-day"].str.split("/")])
# 提取年龄
qian["age"] = pd.Series(2017 - np.array(qian["year_birth"]))
# 制作一个dataframe, 年龄+不同月份人数的list
temp1 = pd.DataFrame(qian.groupby("age")["month"].apply(list))
#print(temp1)

list_month = []
for i in temp1["month"]:
    list = [0] * 12
    for j in i:
        list[int(j)-1] +=1
    list_month.append(list)
columns = [ x for x in range(1,13)]
index =[str(x)+"-years-old" for x in range(25,41)]
month_matrix =pd.DataFrame(
    list_month, columns = columns, index =index)
plt.rcParams['figure.figsize'] = (6,4)
month_matrix.sum().plot(kind="bar")
plt.show()

《青年千人数据分析》 image.png

import seaborn as sns
plt.figure(figsize=(8,4))
sns.heatmap(month_matrix,  annot=True, cmap="YlGnBu")
plt.show()

《青年千人数据分析》 image.png

2.4 看一下哪个城市居多

在mac 中运行中文乱码,这里稍作修改
1 https://github.com/dolbydu/font/tree/master/unicode 下载“STKaiti”文件, 放到matplotlib的ttf文件夹下。找不到路径用matplotlib.matplotlib_fname()这个找。
2 终端输入rm -rf ~/.matplotlib/*.cache.
3 在python脚本中添加一行代码mpl.rcParams['font.sans-serif'] = u'SimHei'

qian = pd.read_excel("1000_youth_talent.xlsx",header=0, skiprows=1)
qian.columns = ["ID","Name","Gender","year-month-day","year_birth","university","major"]
font = {'weight' : 'normal','size': 14}
matplotlib.rc('font', **font)
plt.rcParams['figure.figsize'] = (20,9)
plt.rcParams['font.sans-serif'] = ['STKaiti']
school_count = pd.DataFrame(qian["university"].value_counts()).reset_index()
school_count.columns = ["university","num"]
Pcity = pd.read_excel("university_province_city.xls",header=0)
temp2 = school_count.merge(Pcity)
temp2 = temp2.groupby("City").sum().reset_index()
temp2.columns = ["city","num"]
temp2 = temp2.sort_values("num", ascending=False)
temp2 = temp2.set_index("city")

# 这里保存一个csv, 后面R作图用
temp2.to_csv('city_num.csv',sep=',',header=True,index=True,encoding='utf-8')


temp2.plot(kind="bar", color="skyblue")
plt.legend().set_visible(False)
plt.setp(plt.gca().spines['right'].set_color('none'))
plt.setp(plt.gca().spines['top'].set_color('none'))
plt.setp(plt.gca().xaxis.set_ticks_position('bottom'))
plt.setp(plt.gca().yaxis.set_ticks_position('left'))
plt.show()

《青年千人数据分析》 image.png

2.5 各行业分布
temp3 = qian["major"].value_counts().reset_index(); temp3.columns = ["major","num"]
temp3.loc[3], temp3.loc[6] = temp3.loc[6], temp3.loc[3]
temp3 = temp3.set_index("major")
import brewer2mpl
set2 = brewer2mpl.get_map('Set2','qualitative','8').mpl_colors + brewer2mpl.get_map('Accent','qualitative','6').mpl_colors
font = {'family' : 'STKaiti', 'weight' : 'normal','size': 14}
matplotlib.rc('font', **font)
plt.rcParams['figure.figsize'] = (10,10)
explode = (0, 0,0.05,0.05, 0, 0, 0, 0, 0, 0)
X = temp3.index
Y = temp3["num"]
figlabel = []
for x,y in zip(X,Y):
    figlabel.append(x +"-"+ str(y))
temp3.plot(kind="pie",y="num",colors=set2,explode=explode)
plt.setp(plt.gca().legend(labels = figlabel, loc="best",bbox_to_anchor=(0.01,0.75), fontsize=20))
plt.show()

《青年千人数据分析》 image.png

3 R实战

# install REmap
install.packages("devtools")
library(devtools)
install_github('lchiffon/REmap')
setwd('/Users/chengkai/Desktop/file/learn/project/1000talient')
library(REmap)
library(dplyr)
rm(list=ls())
Sys.setlocale("LC_ALL","Chinese")
city_num <- read.csv("city_num.csv",sep=",",fileEncoding = "utf-8")

cities = city_num$city
cities = as.character(cities)
cities_Geo <- get_geo_position(cities)
write.csv(cities_Geo, file="R_Geo.csv")
#city_num
data_final <- cities_Geo %>% 
  left_join(city_num)

data_final <- data_final[,-3]
#temp_file <- data.frame(cites_Geo,city_num$num)[,-3]
#names(temp_file)[names(temp_file)=="city_num.num"]="num"

remapH(data_final,
       maptype = "china",
       theme = get_theme("Bright"),
       blurSize = 35,
       color="red",
       minAlpha = 10,
       opacity = 2)

《青年千人数据分析》 image.png

参考文献

  1. http://www.jianshu.com/p/883c8d9268a8(数据分析神器之Pandas)
    原文作者:thinkando
    原文地址: https://www.jianshu.com/p/5e58cf9808f0
    本文转自网络文章,转载此文章仅为分享知识,如有侵权,请联系博主进行删除。
点赞