python数据分析 |
您所在的位置:网站首页 › 历届世界杯比赛数据 › python数据分析 |
import pandas as pd
import numpy as np
import seaborn as sns
import itertools
import io
import base64
import os
import folium
import folium.plugins
import matplotlib.pyplot as plt
from matplotlib import rc,animation
from mpl_toolkits.mplot3d import Axes3D
from wordcloud import WordCloud,STOPWORDS
读取数据
# Load the three World Cup tables (matches, players, tournaments) from CSV
# files located in the current working directory.
matches, players, cups = (
    pd.read_csv(path)
    for path in (
        './WorldCupMatches.csv',
        './WorldCupPlayers.csv',
        './WorldCups.csv',
    )
)
看一下数据前三行
# Preview the first three rows of `matches`.
matches.head(3)
Year
Datetime
Stage
Stadium
City
Home Team Name
Home Team Goals
Away Team Goals
Away Team Name
Win conditions
Attendance
Half-time Home Goals
Half-time Away Goals
Referee
Assistant 1
Assistant 2
RoundID
MatchID
Home Team Initials
Away Team Initials
0
1930.0
13 Jul 1930 - 15:00
Group 1
Pocitos
Montevideo
France
4.0
1.0
Mexico
4444.0
3.0
0.0
LOMBARDI Domingo (URU)
CRISTOPHE Henry (BEL)
REGO Gilberto (BRA)
201.0
1096.0
FRA
MEX
1
1930.0
13 Jul 1930 - 15:00
Group 4
Parque Central
Montevideo
USA
3.0
0.0
Belgium
18346.0
2.0
0.0
MACIAS Jose (ARG)
MATEUCCI Francisco (URU)
WARNKEN Alberto (CHI)
201.0
1090.0
USA
BEL
2
1930.0
14 Jul 1930 - 12:45
Group 2
Parque Central
Montevideo
Yugoslavia
2.0
1.0
Brazil
24059.0
2.0
0.0
TEJADA Anibal (URU)
VALLARINO Ricardo (URU)
BALWAY Thomas (FRA)
201.0
1093.0
YUG
BRA
# Preview the first three rows of `players`.
players.head(3)
RoundID
MatchID
Team Initials
Coach Name
Line-up
Shirt Number
Player Name
Position
Event
0
201
1096
FRA
CAUDRON Raoul (FRA)
S
0
Alex THEPOT
GK
NaN
1
201
1096
MEX
LUQUE Juan (MEX)
S
0
Oscar BONFIGLIO
GK
NaN
2
201
1096
FRA
CAUDRON Raoul (FRA)
S
0
Marcel LANGILLER
NaN
G40'
# Preview the first three rows of `cups`.
cups.head(3)
Year
Country
Winner
Runners-Up
Third
Fourth
GoalsScored
QualifiedTeams
MatchesPlayed
Attendance
0
1930
Uruguay
Uruguay
Argentina
USA
Yugoslavia
70
13
18
590.549
1
1934
Italy
Italy
Czechoslovakia
Germany
Austria
70
16
17
363.000
2
1938
France
Italy
Hungary
Brazil
Sweden
84
15
18
375.700
统计历届世界杯的观众总数
# --- Total attendance per World Cup -----------------------------------------
# Drop duplicated match rows (keyed on MatchID) so attendance is not counted
# twice, sum Attendance per Year, then plot the totals with Seaborn/Matplotlib.

# Per-column null counts; the result is only displayed in an interactive
# session and is discarded when run as a plain script.
matches.isnull().sum()
sns.set_style("darkgrid")

matches = matches.drop_duplicates(subset="MatchID", keep="first")
# .copy() makes the filtered frame independent, so the column assignments
# further down do not trigger pandas' SettingWithCopyWarning.
matches = matches[matches["Year"].notnull()].copy()

att = matches.groupby("Year")["Attendance"].sum().reset_index()
att["Year"] = att["Year"].astype(int)

plt.figure(figsize=(12, 7))
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
# A single "k" edge colour applies to every bar; a repeated "kkk..." string is
# no longer accepted as a colour sequence by current Matplotlib.
sns.barplot(x=att["Year"], y=att["Attendance"], linewidth=1, edgecolor="k")
plt.grid(True)
plt.title("Attendence by year", color='b')
plt.show()

# --- Average attendance per World Cup ---------------------------------------
# The number of matches differs between tournaments (format changes, etc.),
# so the per-match mean attendance is computed as well to compare the reach
# of each tournament.
att1 = matches.groupby("Year")["Attendance"].mean().reset_index()
att1["Year"] = att1["Year"].astype(int)

plt.figure(figsize=(12, 7))
ax = sns.pointplot(x=att1["Year"], y=att1["Attendance"], color="w")
ax.set_facecolor("k")
plt.grid(True, color="grey", alpha=.3)
plt.title("Average attendence by year", color='b')
plt.show()

# Author's reading of the plot: an overall upward trend, with 1994 the
# highest and 2006-2014 holding steady at a high level.

# --- Average attendance per host city (top 20) ------------------------------
ct_at = matches.groupby("City")["Attendance"].mean().reset_index()
ct_at = ct_at.sort_values(by="Attendance", ascending=False)

plt.figure(figsize=(10, 10))
ax = sns.barplot(
    x="Attendance",
    y="City",
    data=ct_at[:20],
    linewidth=1,
    edgecolor="k",
    palette="Spectral_r",
)
# Write the rounded mean next to each bar; bar i sits at y == i.
labels = " Average attendance : " + np.around(ct_at["Attendance"][:20], 0).astype(str)
for i, j in enumerate(labels):
    ax.text(.7, i, j, fontsize=12)
plt.grid(True)
plt.title("Average attendance by city", color='b')
plt.show()

# --- Average attendance per stadium -----------------------------------------
# A city can have several stadiums with different attendances, so the mean is
# also computed per (Stadium, City). The author plots the top 14 stadiums and
# notes that the result differs slightly from the per-city ranking.

# Cast Year to int (the author notes this dtype-conversion idiom is also
# useful in machine-learning work).
matches["Year"] = matches["Year"].astype(int)
# Keep only the part of Datetime before the first "-".
matches["Datetime"] = matches["Datetime"].str.split("-").str[0]
# Map alternative spellings of the stadium name onto one label. regex=False
# makes these literal substring replacements regardless of the pandas default.
matches["Stadium"] = matches["Stadium"].str.replace(
    'Estadio do Maracana', "Maracanã Stadium", regex=False
)
# NOTE(review): this pattern contains U+FFFD replacement characters;
# presumably it matches the mis-decoded name as stored in the CSV -- confirm
# against the data file.
matches["Stadium"] = matches["Stadium"].str.replace(
    'Maracan� - Est�dio Jornalista M�rio Filho', "Maracanã Stadium", regex=False
)

std = (
    matches.groupby(["Stadium", "City"])["Attendance"]
    .mean()
    .reset_index()
    .sort_values(by="Attendance", ascending=False)
)

# NOTE: the source text is cut off in the middle of the next statement, which
# begins:  plt.figure(figsize=(8,9
今日新闻 |
推荐新闻 |
CopyRight 2018-2019 办公设备维修网 版权所有 豫ICP备15022753号-3 |