Python数据分析

2023-08-10 07:09| 来源: 网络整理| 查看: 265

介绍

我获得了一份名单（本校 4 年的本科生录取信息：姓名、学号、专业、录取中学等），打算利用这份名单做一些基础而有趣的分析。

基础整理

引入库，并读取表格

import os
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Use a font that contains CJK glyphs so Chinese labels render, and keep the
# minus sign displayable with that font.
plt.rcParams.update({
    "font.sans-serif": ["SimHei"],
    "axes.unicode_minus": False,
})

# Switch to the data directory so relative file names resolve against it.
path = "E:/code_interesting/name_of_school/"
os.chdir(path)

# Load the admission records; the bare head() call only shows a preview when
# run in an interactive session or notebook.
s1 = pd.read_excel("2014-2019 中学录取学生数据.xlsx")
s1.head()

在这里插入图片描述

遍历表格，将每个学生的姓氏、名字、学校分别放入列表：
- itertuples()：将 DataFrame 的每一行迭代为具名元组
- getattr(行, 列名)：返回该行中对应列的值

def _split_name(name):
    """Split a full name into a ``(surname, given_name)`` pair.

    Heuristics, in order:
    * a name containing "·" is split at the FIRST "·";
    * a four-character name is treated as a two-character compound surname
      followed by a two-character given name;
    * anything else uses the first character as the surname.
    """
    if "·" in name:
        # maxsplit=1 keeps this a two-way split; without it a name with
        # several "·" separators broke the unpacking with ValueError.
        surname, given = name.split("·", 1)
        return surname, given
    if len(name) == 4:
        return name[:2], name[2:]
    return name[:1], name[1:]


last_name = []    # surname of every student
first_name = []   # given name of every student
school_name = []  # school of every student ("无" when the cell is not text)

for row in s1.itertuples():  # one named tuple per DataFrame row
    name = getattr(row, "姓名").strip()

    # "zxmc" is presumably the admitting high school's name; non-string
    # cells (e.g. empty cells read as NaN) are replaced by "无".
    school = getattr(row, "zxmc")
    school = school.strip() if isinstance(school, str) else "无"
    school_name.append(school)

    surname, given = _split_name(name)
    last_name.append(surname)
    first_name.append(given)

# Break every given name into single characters. A "·" can remain inside a
# given name that had several separators; it is not a name character, so it
# is left out of the per-character list.
first_c_name = [ch for given in first_name for ch in given if ch != "·"]
print(len(first_c_name))

对每个列表进行个数统计（用 defaultdict 建立带默认值类型的字典）。对字典排序的写法为：sorted(dict.items(), key=lambda x: x[1])，其中 x[1] 表示按字典的值排序，x[0] 表示按字典的键排序。

# Frequency tables: how many students share each surname, each full given
# name, each single given-name character, and each school.
last_num = defaultdict(int)
first_num = defaultdict(int)
first_c_num = defaultdict(int)
school_num = defaultdict(int)

for surname, given, school in zip(last_name, first_name, school_name):
    last_num[surname] += 1
    first_num[given] += 1
    school_num[school] += 1
for ch in first_c_name:
    first_c_num[ch] += 1

# (key, count) pairs ordered by count, most frequent first.
last_sort = sorted(last_num.items(), key=lambda x: x[1], reverse=True)
first_sort = sorted(first_num.items(), key=lambda x: x[1], reverse=True)
first_c_sort = sorted(first_c_num.items(), key=lambda x: x[1], reverse=True)
school_sort = sorted(school_num.items(), key=lambda x: x[1], reverse=True)
print(len(school_sort))

# Given names that are exactly two characters long, still ordered by count.
first_2c_sort = [t for t in first_sort if len(t[0]) == 2]

# Section "单字图表": bar chart of the most frequent given-name characters.
total_c = len(first_c_sort)  # number of distinct characters
head_num = 100               # show at most this many bars
top_chars = first_c_sort[:head_num]
y_c = [count for _, count in top_chars]
xlabel_c = [char for char, _ in top_chars]
# Size the x positions from the data actually plotted: with fewer than
# head_num distinct characters a fixed-length x would not match y_c and
# plt.bar would fail.
x = np.arange(len(y_c))
print(y_c)
print(x)

plt.figure(figsize=(40, 5), dpi=150)  # wide canvas: many bars side by side
plt.bar(x, y_c)
plt.xticks(x, xlabel_c)
for a, b in zip(x, y_c):
    plt.text(a - 0.5, b + 1, b)  # count printed just above each bar
plt.savefig("c_100.jpg")
plt.show()

plt.figure 中，figsize 指定图表尺寸（单位为英寸；因为 x 轴项目太多，故设为长条状），dpi 指定分辨率（每英寸点数）。plt.bar 绘制条形图（柱状图）。plt.xticks 的第一个参数为刻度位置（此处为从 0 开始的 100 个整数），第二个参数为与之一一对应的刻度标签。plt.text 在指定坐标处添加文字。zip 把多个可迭代对象中的元素一一对应打包成元组。

在这里插入图片描述

# Section "复名分析": bar chart of the most frequent two-character given names.
head_num = 100  # show at most this many bars
print(first_2c_sort[0:20])

top_names = first_2c_sort[:head_num]
y_c = [count for _, count in top_names]
xlabel_c = [given for given, _ in top_names]
# Size the x positions from the data actually plotted: with fewer than
# head_num entries a fixed-length x would not match y_c and plt.bar would fail.
x = np.arange(len(y_c))

plt.figure(figsize=(40, 5), dpi=150)
plt.bar(x, y_c)
plt.xticks(x, xlabel_c, rotation=45)  # tilt the two-character labels
for a, b in zip(x, y_c):
    plt.text(a - 0.5, b + 1, b)  # count printed just above each bar
plt.savefig("2c_100.jpg")
plt.show()

在这里插入图片描述

# Section "姓氏分析": bar chart of the most frequent surnames.
total = len(last_name)  # one surname per student, so this is the head count
head_num = 100          # show at most this many bars
print(last_sort[0:20])

top_surnames = last_sort[:head_num]
y_c = [count for _, count in top_surnames]
xlabel_c = [surname for surname, _ in top_surnames]
# Size the x positions from the data actually plotted: with fewer than
# head_num distinct surnames a fixed-length x would not match y_c and
# plt.bar would fail.
x = np.arange(len(y_c))

plt.figure(figsize=(40, 5), dpi=150)
plt.bar(x, y_c)
plt.xticks(x, xlabel_c)
for a, b in zip(x, y_c):
    plt.text(a - 0.5, b + 1, b)  # absolute count just above the bar
    # Share of all students, drawn higher up so it clears the count label.
    plt.text(a - 0.5, b + 100, "{:.2f}%".format((b / total) * 100))
plt.savefig("last_100.jpg")
plt.show()

在这里插入图片描述

# Section "学校分析": bar chart of the schools with the most admitted students.
head_num = 50  # show at most this many bars

top_schools = school_sort[:head_num]
y_c = [count for _, count in top_schools]
xlabel_c = [school for school, _ in top_schools]
# Size the x positions from the data actually plotted: with fewer than
# head_num schools a fixed-length x would not match y_c and plt.bar would fail.
x = np.arange(len(y_c))

plt.figure(figsize=(40, 5), dpi=150)
plt.bar(x, y_c)
plt.xticks(x, xlabel_c, rotation=20)  # tilt the long school names
for a, b in zip(x, y_c):
    plt.text(a - 0.3, b + 1, b)  # absolute count just above the bar
    # Share of all students; `total` is the student count computed in the
    # surname section (len(last_name)).
    plt.text(a - 0.3, b + 15, "{:.2f}%".format((b / total) * 100))
plt.savefig("school_50.jpg")
plt.show()

在这里插入图片描述

【本文地址】

Python数据分析

Python数据分析

今日新闻

推荐新闻