musk12 commited on
Commit
fe14f7a
‒
1 Parent(s): 3656e39

olympic_eda

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ Analysis_image.png filter=lfs diff=lfs merge=lfs -text
37
+ athlete_events.csv filter=lfs diff=lfs merge=lfs -text
Analysis_image.png ADDED

Git LFS Details

  • SHA256: 54a70825253a8bcf63f2feb3701f054e7ffde3a89deb64e179d5b7876047f53e
  • Pointer size: 132 Bytes
  • Size of remote file: 8.43 MB
Olympic_image.jpg ADDED
README.md CHANGED
@@ -1,13 +1,5 @@
1
- ---
2
- title: Olympic Eda
3
- emoji: 👀
4
- colorFrom: indigo
5
- colorTo: red
6
- sdk: streamlit
7
- sdk_version: 1.36.0
8
- app_file: app.py
9
- pinned: false
10
- license: mit
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ # olympics-data-analysis-web-app
2
+
3
+ A Streamlit web application for the analysis of Olympics data.
4
+
5
+ Live Web App : https://olympicanalysiss.streamlit.app/
 
 
 
 
 
 
 
 
athlete_events.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1751e89b509dd8c0fa0979ee258eef4374aa93f17d64cd07383ad3f57c5aabd7
3
+ size 41500688
functions.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
+ df = pd.read_csv("athlete_events.csv")
4
+ region_df = pd.read_csv("noc_regions.csv")
5
+
6
def preprocess1(df, region_df):
    """Prepare the raw athlete table for the analysis helpers.

    Steps, in order:
      1. keep only rows whose ``Season`` is ``"Summer"``;
      2. left-join ``region_df`` on ``NOC`` so every row gains the region
         columns (rows without a match keep NaN there);
      3. drop fully duplicated rows;
      4. append one 0/1 integer column per medal type.

    Args:
        df: athlete/event rows with at least ``Season``, ``NOC`` and
            ``Medal`` columns.
        region_df: lookup table keyed by ``NOC``.

    Returns:
        A new DataFrame; the inputs are not modified.
    """
    summer = df[df["Season"] == "Summer"]
    merged = summer.merge(region_df, on="NOC", how="left").drop_duplicates()

    medals = pd.get_dummies(merged["Medal"], dtype=int)
    # get_dummies only creates a column for medal types that actually occur.
    # Callers index ["Gold", "Silver", "Bronze"] unconditionally, so make
    # sure all three exist even on a subset that lacks one of them.
    for medal in ("Bronze", "Gold", "Silver"):
        if medal not in medals.columns:
            medals[medal] = 0

    return pd.concat([merged, medals], axis=1)
25
+
important.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def medal_tally(df):
    """Return the overall medal table, one row per region.

    Rows that share team, NOC, games, year, city, sport, event and medal are
    collapsed first, so a medal won by a team is counted once rather than
    once per team member.

    Args:
        df: preprocessed frame containing the de-duplication columns plus
            ``region`` and the 0/1 ``Gold``/``Silver``/``Bronze`` columns.

    Returns:
        DataFrame with columns ``region``, ``Gold``, ``Silver``, ``Bronze``
        and ``Total``, sorted by ``Gold`` descending.
    """
    medal_cols = ["Gold", "Silver", "Bronze"]
    events = df.drop_duplicates(
        subset=["Team", "NOC", "Games", "Year", "City", "Sport", "Event", "Medal"]
    )
    # Select the medal columns *before* summing: summing the whole frame
    # would also try to aggregate every text column.
    tally = (
        events.groupby("region")[medal_cols]
        .sum()
        .sort_values("Gold", ascending=False)
        .reset_index()
    )
    tally["Total"] = tally["Gold"] + tally["Silver"] + tally["Bronze"]
    return tally
7
+
8
def country_year_list(df):
    """Build the option lists for the year and country selectors.

    Returns:
        ``(years, country)``: the sorted distinct ``Year`` values and the
        sorted distinct non-null ``region`` values, each prefixed with the
        ``"Overall"`` entry.
    """
    distinct_years = sorted(df["Year"].unique().tolist())
    distinct_regions = sorted(df["region"].dropna().unique().tolist())

    years = ["Overall", *distinct_years]
    country = ["Overall", *distinct_regions]
    return years, country
18
+
19
def fetch_medal_tally(df, year, country):
    """Return the medal table for an optional year and/or country filter.

    Team medals are counted once (rows sharing team, NOC, games, year, city,
    sport, event and medal are collapsed).

    Args:
        df: preprocessed frame with ``Year``, ``region`` and the 0/1
            ``Gold``/``Silver``/``Bronze`` columns.
        year: ``"Overall"`` or a year (int, or a string convertible to int).
        country: ``"Overall"`` or a ``region`` value.

    Returns:
        DataFrame with ``Gold``, ``Silver``, ``Bronze`` and ``Total``.
        When only a country is selected the rows are per ``Year`` (ascending);
        otherwise they are per ``region`` sorted by ``Gold`` descending.
    """
    medal_cols = ["Gold", "Silver", "Bronze"]
    medal_df = df.drop_duplicates(
        subset=["Team", "NOC", "Games", "Year", "City", "Sport", "Event", "Medal"]
    )

    # Apply each filter independently; coerce the year in every case so a
    # string such as "2016" matches the integer Year column.
    if year != "Overall":
        medal_df = medal_df[medal_df["Year"] == int(year)]
    if country != "Overall":
        medal_df = medal_df[medal_df["region"] == country]

    if year == "Overall" and country != "Overall":
        # A single country across all games: show its history year by year.
        x = medal_df.groupby("Year")[medal_cols].sum().sort_index().reset_index()
    else:
        x = (
            medal_df.groupby("region")[medal_cols]
            .sum()
            .sort_values("Gold", ascending=False)
            .reset_index()
        )

    x["Total"] = x["Gold"] + x["Silver"] + x["Bronze"]
    return x
39
+
40
def data_over_time(df, col):
    """Count the distinct values of ``col`` seen in each year.

    Args:
        df: frame containing ``Year`` and ``col``.
        col: column whose per-year distinct values are counted (for example
            ``"region"``, ``"Event"`` or ``"Name"``).

    Returns:
        DataFrame with columns ``Year`` and ``col`` (the count), ordered by
        ``Year`` ascending.
    """
    distinct_pairs = df.drop_duplicates(["Year", col])
    # groupby/size names the result column explicitly instead of relying on
    # the "count" label that value_counts().reset_index() only produces on
    # pandas >= 2.0.
    return distinct_pairs.groupby("Year").size().reset_index(name=col)
45
+
46
+
47
def most_successful(df, sport):
    """Return the 15 athletes with the most medal rows.

    Args:
        df: frame with ``Name``, ``Medal``, ``Sport`` and ``region``.
        sport: ``"Overall"`` or a ``Sport`` value to restrict the ranking to.

    Returns:
        DataFrame with columns ``Name``, ``Medals``, ``Sport`` and ``region``
        ordered by medal count descending. ``Sport`` and ``region`` are taken
        from the athlete's first row in ``df``.
    """
    medalists = df.dropna(subset=["Medal"])
    if sport != "Overall":
        medalists = medalists[medalists["Sport"] == sport]

    # Name the columns explicitly so the result does not depend on the
    # pandas-version-specific labels of value_counts().reset_index().
    top = (
        medalists["Name"]
        .value_counts()
        .head(15)
        .rename_axis("Name")
        .reset_index(name="Medals")
    )

    # One profile row per athlete keeps the merge 1:1, so the result has a
    # clean 0..n-1 index instead of gaps left by a later de-duplication.
    profile = df.drop_duplicates("Name")[["Name", "Sport", "region"]]
    x = top.merge(profile, on="Name", how="left")
    return x[["Name", "Medals", "Sport", "region"]]
56
+
57
def country_wise_medal_tally(df, region):
    """Return the number of medals a region won in each year.

    Rows without a medal are ignored and team medals are counted once
    (rows sharing team, NOC, games, year, city, sport, event and medal are
    collapsed).

    Args:
        df: frame with the de-duplication columns plus ``region``.
        region: the ``region`` value to report on.

    Returns:
        DataFrame with columns ``Year`` and ``Medal`` (the count), ordered
        by ``Year``. ``df`` itself is left untouched.
    """
    # Chain the calls instead of de-duplicating in place: the in-place form
    # mutated a frame derived from ``df`` and triggered pandas' copy warning.
    medals = df.dropna(subset=["Medal"]).drop_duplicates(
        subset=["Team", "NOC", "Games", "Year", "City", "Sport", "Event", "Medal"]
    )
    region_medals = medals[medals["region"] == region]
    return region_medals.groupby("Year")["Medal"].count().reset_index()
65
+
66
def country_event_heatmap(df, country):
    """Return a Sport x Year table of medal counts for one country.

    Team medals are counted once (rows sharing team, NOC, games, year, city,
    sport, event and medal are collapsed), the same rule the medal-tally
    helpers in this module apply, so the figures agree with them.

    Args:
        df: frame with the de-duplication columns plus ``region``.
        country: the ``region`` value to report on.

    Returns:
        Integer DataFrame indexed by ``Sport`` with one column per ``Year``;
        missing combinations are 0. Empty if the country has no medals.
    """
    medals = df.dropna(subset=["Medal"]).drop_duplicates(
        subset=["Team", "NOC", "Games", "Year", "City", "Sport", "Event", "Medal"]
    )
    country_medals = medals[medals["region"] == country]
    pt = (
        country_medals.pivot_table(
            index="Sport", columns="Year", values="Medal", aggfunc="count"
        )
        .fillna(0)
        .astype("int")
    )
    return pt
73
+
74
def most_successful_athletes(df, country):
    """Return the 10 athletes of a country with the most medal rows.

    Args:
        df: frame with ``Name``, ``Medal``, ``Sport`` and ``region``.
        country: the ``region`` value to rank athletes for.

    Returns:
        DataFrame with columns ``Name``, ``Medals`` and ``Sport`` ordered by
        medal count descending. ``Sport`` is taken from the athlete's first
        row in ``df``.
    """
    medalists = df.dropna(subset=["Medal"])
    medalists = medalists[medalists["region"] == country]

    # Name the columns explicitly so the result does not depend on the
    # pandas-version-specific labels of value_counts().reset_index().
    top = (
        medalists["Name"]
        .value_counts()
        .head(10)
        .rename_axis("Name")
        .reset_index(name="Medals")
    )

    # One row per athlete keeps the merge 1:1 and the result index contiguous.
    sports = df.drop_duplicates("Name")[["Name", "Sport"]]
    x = top.merge(sports, on="Name", how="left")
    return x[["Name", "Medals", "Sport"]]
82
+
83
def weight_v_height(df, sport):
    """Return one row per athlete for the height/weight scatter plot.

    Athletes are de-duplicated on (``Name``, ``region``) and a missing
    ``Medal`` is replaced by the label ``"No Medal"``.

    Args:
        df: frame with ``Name``, ``region``, ``Medal`` and ``Sport``.
        sport: ``"Overall"`` or a ``Sport`` value to filter on.

    Returns:
        A new DataFrame; ``df`` is not modified.
    """
    athlete_df = df.drop_duplicates(subset=["Name", "region"]).copy()
    # Assign the filled column back rather than calling fillna(inplace=True)
    # on a column selection: that chained form is a no-op under pandas
    # Copy-on-Write and emits a warning on recent 2.x releases.
    athlete_df["Medal"] = athlete_df["Medal"].fillna("No Medal")

    if sport != "Overall":
        return athlete_df[athlete_df["Sport"] == sport]
    return athlete_df
91
+
92
def men_vs_women(df):
    """Count male and female athletes per year.

    Athletes are de-duplicated on (``Name``, ``region``) first, so each
    athlete is counted in the year of their first row only.

    Args:
        df: frame with ``Name``, ``region``, ``Sex`` (``"M"``/``"F"``) and
            ``Year``.

    Returns:
        DataFrame with columns ``Year``, ``Male`` and ``Female`` (integer
        counts), ordered by ``Year``. A year appears if it has athletes of
        either sex; the missing side is 0.
    """
    athlete_df = df.drop_duplicates(subset=["Name", "region"])

    # Pivot both sexes in one pass. A left merge of the men table onto the
    # women table would drop years that only have women and turn the
    # NaN-filled counts into floats.
    counts = (
        athlete_df.groupby(["Year", "Sex"])["Name"]
        .count()
        .unstack(fill_value=0)
        .reindex(columns=["M", "F"], fill_value=0)
        .rename(columns={"M": "Male", "F": "Female"})
    )
    counts.columns.name = None  # drop the "Sex" label left over from unstack
    return counts.reset_index()
100
+
101
+
102
+
103
+
lat_long.csv ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ City,latitude,longitude
2
+ Athina,38.0359,23.787
3
+ Paris,48.841465,2.252616
4
+ St. Louis,38.6479,-90.3138
5
+ London,51.5387,-0.0172
6
+ Stockholm,59.3453,18.079
7
+ Antwerpen,51.1841,4.3825
8
+ Amsterdam,52.3434,4.8533
9
+ Los Angeles,34.073814,-118.240784
10
+ Berlin,52.5147,13.2395
11
+ Helsinki,60.1872,24.9268
12
+ Melbourne,-37.8209,144.9783
13
+ Rome,41.933964,12.454297
14
+ Tokyo,35.6778,139.7145
15
+ Mexico City,19.3319,-99.1922
16
+ Munich,48.1732,11.5466
17
+ Montreal,45.5577,-73.5515
18
+ Moskva,55.7812,37.6261
19
+ Seoul,37.5158,127.0728
20
+ Barcelona,41.3647,2.1557
21
+ Atlanta,33.76,-84.3932
22
+ Sydney,-33.8354,151.0654
23
+ Beijing,39.9929,116.3965
24
+ Rio de Janeiro,-22.8933,-43.2923
noc_regions.csv ADDED
@@ -0,0 +1 @@
 
 
1
+ NOC,region,notes
olympic_app.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import functions
4
+ import important
5
+ import plotly.express as px
6
+ import matplotlib.pyplot as plt
7
+ import seaborn as sns
8
+ import plotly.figure_factory as ff
9
+
10
+
11
@st.cache_data
def _load_data():
    """Read both CSV files and return ``(preprocessed_df, region_df)``.

    Streamlit re-executes this script on every widget interaction; caching
    keeps the large athlete CSV from being re-read and re-processed each
    time.
    """
    athletes = pd.read_csv("athlete_events.csv")
    regions = pd.read_csv("noc_regions.csv")
    return functions.preprocess1(athletes, regions), regions


df, region_df = _load_data()

# Sidebar: app title, banner image and the page selector.
st.sidebar.title("Olympic Data Analysis.")
st.sidebar.image("Olympic_image.jpg")
user_menu = st.sidebar.radio(
    "Select an option",
    (
        "Overview",
        "Overall Analysis",
        "Medal tally",
        "Country-wise Analysis",
        "Athlete-wise Analysis",
    ),
)

# Landing page: a single static image.
if user_menu == "Overview":
    st.image("Analysis_image.png")
25
+
26
if user_menu == "Overall Analysis":
    # Headline counters.
    # NOTE(review): the "- 1" on editions presumably excludes one year that
    # should not count as an edition -- confirm which one is intended.
    editions = df["Year"].nunique() - 1
    cities = df["City"].nunique()
    sports = df["Sport"].nunique()
    events = df["Event"].nunique()
    athletes = df["Name"].nunique()
    # nunique() ignores NaN, so rows whose NOC has no region are no longer
    # counted as an extra "nation".
    nations = df["region"].nunique()

    st.title("Top Statistics")

    col1, col2, col3 = st.columns(3)
    with col1:
        st.header("Editions")
        st.title(editions)
    with col2:
        st.header("cities")
        st.title(cities)
    with col3:
        st.header("sports")
        st.title(sports)

    col1, col2, col3 = st.columns(3)
    with col1:
        st.header("events")
        st.title(events)
    with col2:
        st.header("athletes")
        st.title(athletes)
    with col3:
        st.header("nations")
        st.title(nations)

    # Host city of each year (first row seen for that year).
    st.subheader("Countries have hosted the olympics.")
    n_df = df.drop_duplicates(subset="Year")[["Year", "City"]]
    fig = px.bar(n_df, x='City', y='Year', text_auto=True)
    fig.update_xaxes(tickangle=45)
    st.plotly_chart(fig)

    # Per-year trend lines.
    countries_over_time = important.data_over_time(df, "region")
    fig = px.line(countries_over_time, x="Year", y="region", title='Participating Countries over the years')
    st.plotly_chart(fig)

    events_over_time = important.data_over_time(df, "Event")
    fig = px.line(events_over_time, x="Year", y="Event", title='Events over the years')
    st.plotly_chart(fig)

    athlete_over_time = important.data_over_time(df, "Name")
    fig = px.line(athlete_over_time, x="Year", y="Name", title='Athletes over the years')
    st.plotly_chart(fig)

    # Sport x Year heatmap of the number of distinct events.
    st.title("No. of Events over time(Every Sport)")
    fig, ax = plt.subplots(figsize=(20, 20))
    x = df.drop_duplicates(["Year", "Sport", "Event"])
    event_counts = (
        x.pivot_table(index="Sport", columns="Year", values="Event", aggfunc="count")
        .fillna(0)
        .astype("int")
    )
    sns.heatmap(event_counts, annot=True, ax=ax)
    st.pyplot(fig)
    # The script is re-run on every interaction; close the figure so
    # matplotlib does not keep accumulating open figures.
    plt.close(fig)

    sport_list = df["Sport"].unique().tolist()
    sport_list.sort()
    sport_list.insert(0, "Overall")

    selected_sport = st.selectbox("Select a Sport", sport_list)
    st.title("Most successful Athletes")
    x = important.most_successful(df, selected_sport)
    st.table(x)

    # Share of rows per sport.
    st.title("Most popular sports of Olympics")
    sport_df = df["Sport"].value_counts().reset_index()
    fig = px.pie(sport_df, values='count', names='Sport')
    fig.update_layout(autosize=False, width=850, height=700)
    st.plotly_chart(fig)

    st.title("Locations of Stadium of countries where olympics held.")
    data1 = pd.read_csv("lat_long.csv")
    st.map(data1)
102
+
103
+
104
if user_menu == "Medal tally":
    # Sidebar filters for this page.
    st.sidebar.header("Medal tally")
    years, country = important.country_year_list(df)
    selected_year = st.sidebar.selectbox("Select year", years)
    selected_country = st.sidebar.selectbox("Select country", country)

    medal_tally = important.fetch_medal_tally(df, selected_year, selected_country)

    # Exactly one heading applies to each combination of the two filters.
    all_years = selected_year == "Overall"
    all_countries = selected_country == "Overall"
    if all_years and all_countries:
        heading = "Overall Medal Tally"
    elif all_countries:
        heading = f"Medal Tally in {selected_year} Olympics"
    elif all_years:
        heading = f"{selected_country} overall performance in Olympics"
    else:
        heading = f"{selected_country}'s performance in {selected_year} Olympics"

    st.title(heading)
    st.table(medal_tally)
123
+
124
+
125
if user_menu == "Country-wise Analysis":

    st.title("Country-wise Analysis")

    country = df["region"].dropna().unique().tolist()
    country.sort()

    selected_country = st.sidebar.selectbox("Select a country", country)

    # Medals per year for the selected country.
    new_region = important.country_wise_medal_tally(df, selected_country)
    fig = px.line(new_region, x="Year", y="Medal")
    st.subheader(selected_country + "'s Medal Tally over the years")
    st.plotly_chart(fig)

    # Sport x Year medal heatmap.
    pt = important.country_event_heatmap(df, selected_country)
    st.subheader(selected_country + " excels in the following sports")
    if pt.empty:
        # The selector lists every region, including ones without medals;
        # seaborn raises on an empty table, so show a message instead.
        st.info(selected_country + " has no medals to display.")
    else:
        fig, ax = plt.subplots(figsize=(20, 20))
        sns.heatmap(pt, annot=True, ax=ax)
        st.pyplot(fig)
        # Close the figure so reruns do not accumulate open figures.
        plt.close(fig)

    athlete = important.most_successful_athletes(df, selected_country)
    st.subheader("Top 10 athletes of " + selected_country)
    st.table(athlete)
148
+
149
if user_menu == "Athlete-wise Analysis":

    # One row per athlete (first row for each Name/region pair).
    athlete_df = df.drop_duplicates(subset=["Name", "region"])
    x1 = athlete_df["Age"].dropna()
    x2 = athlete_df[athlete_df["Medal"] == "Gold"]["Age"].dropna()
    x3 = athlete_df[athlete_df["Medal"] == "Silver"]["Age"].dropna()
    x4 = athlete_df[athlete_df["Medal"] == "Bronze"]["Age"].dropna()

    st.title("Distribution of Age.")
    fig = ff.create_distplot(
        [x1, x2, x3, x4],
        ["Age Distribution", "Gold Medalist", "Silver Medalist", "Bronze Medalist"],
        show_hist=False,
        show_rug=False,
    )
    fig.update_layout(autosize=False, width=850, height=530)
    st.plotly_chart(fig)

    st.title("Distribution of Age wrt sports(Gold Medalist)")
    famous_sports = ['Basketball', 'Judo', 'Football', 'Tug-Of-War', 'Athletics', 'Swimming', 'Badminton', 'Sailing', 'Gymnastics', 'Art Competitions',
                     'Handball', 'Weightlifting', 'Wrestling', 'Water Polo', 'Hockey', 'Rowing', 'Fencing', 'Shooting', 'Boxing', 'Taekwondo',
                     'Cycling', 'Diving', 'Canoeing', 'Tennis', 'Modern Pentathlon', 'Golf', 'Softball', 'Archery', 'Volleyball', 'Synchronized Swimming',
                     'Table Tennis', 'Baseball', 'Rhythmic Gymnastics', 'Rugby Sevens', 'Beach Volleyball', 'Triathlon', 'Rugby', 'Lacrosse', 'Polo', 'Cricket',
                     'Ice Hockey', 'Motorboating']
    gold_df = athlete_df[athlete_df["Medal"] == "Gold"]
    x = []
    name = []
    for sport in famous_sports:
        ages = gold_df.loc[gold_df["Sport"] == sport, "Age"].dropna()
        # The density curve cannot be estimated from fewer than two distinct
        # ages, so leave such sports out instead of letting the plot fail.
        if ages.nunique() < 2:
            continue
        x.append(ages)
        name.append(sport)

    if x:
        fig1 = ff.create_distplot(x, name, show_hist=False, show_rug=False)
        fig1.update_layout(autosize=False, width=850, height=530)
        st.plotly_chart(fig1)

    st.title("Height vs Weight")
    sport_list = df["Sport"].unique().tolist()
    sport_list.sort()
    sport_list.insert(0, "Overall")

    selected_sport = st.selectbox("Select a Sport", sport_list)
    new_df = important.weight_v_height(df, selected_sport)

    fig, ax = plt.subplots(figsize=(10, 10))
    sns.scatterplot(new_df, x="Weight", y="Height", hue=new_df["Medal"], style=new_df["Sex"], s=100, ax=ax)
    st.pyplot(fig)
    # Close the figure so reruns do not accumulate open figures.
    plt.close(fig)

    st.title("Men vs Women participation over the years")
    final_df = important.men_vs_women(df)
    fig = px.line(final_df, x="Year", y=["Male", "Female"])
    st.plotly_chart(fig)
196
+
197
+
198
+
199
+
200
+
201
+
202
+
203
+
204
+
205
+
206
+
207
+
208
+
209
+
210
+
211
+
212
+
213
+
214
+
olympics-data-analysis-web-app ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ A Streamlit web application for the analysis of olympics data.
2
+ Live Web App : https://olympicanalysiss.streamlit.app/
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ numpy==1.26.3
2
+ pandas==2.1.4
3
+ matplotlib==3.8.2
4
+ seaborn==0.13.1
5
+ plotly==5.18.0
6
+ streamlit==1.29.0
7
+ gunicorn==19.7.1
8
+ scipy==1.11.4