Skip to content
Extraits de code Groupes Projets
Valider 861b9f4e rédigé par Corentin Vande Kerckhove's avatar Corentin Vande Kerckhove
Parcourir les fichiers

finish to clean first exercise

parent a97cdcda
Aucune branche associée trouvée
Aucune étiquette associée trouvée
Aucune requête de fusion associée trouvée
%% Cell type:markdown id: tags:
# Exercise 1 - Parsing HTML
The following notebook is greatly inspired by https://github.com/khuyentran1401/Web-Scrapping-Wikipedia
The following notebook is inspired by https://github.com/khuyentran1401/Web-Scrapping-Wikipedia
%% Cell type:markdown id: tags:
### Packages , Paths and Functions
%% Cell type:code id: tags:
``` python
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
import re
from pathlib import Path
import numpy as np
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup
import seaborn as sns
DATA_REPO =
```
DATA_PATH = Path('../data/')
%% Cell type:code id: tags:
# Epidemiology webpage : https://en.wikipedia.org/wiki/Epidemiology_of_depression
DEPRESSION_FILENAME = 'a1_epidemiology_of_depression.html' # Stored locally
``` python
url = 'https://en.wikipedia.org/wiki/Epidemiology_of_depression'
```
# Sunshine duration webpage : https://en.wikipedia.org/wiki/List_of_cities_by_sunshine_duration
SUNSHINE_FILENAME = 'a1_city_sunshine_duration.html' # Stored locally
%% Cell type:code id: tags:
def read_html_file(path: Path, filename: str) -> str:
    """Read a locally stored HTML file and return its content as text.

    Args:
        path: Directory that contains the file.
        filename: Name of the HTML file inside ``path``.

    Returns:
        The whole content of the file as a single string.

    Raises:
        FileNotFoundError: If ``path / filename`` does not exist.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, so non-ASCII characters are read the same way on every OS.
    # NOTE(review): assumes the stored pages were saved as UTF-8 - confirm.
    with open(path / filename, "r", encoding="utf-8") as file:
        return file.read()
``` python
html = urlopen(url)
```
def process_num(string_number : str) -> float:
"""Convert a string number formatted with a comma to separate thousands
%% Cell type:code id: tags:
``` python
soup = BeautifulSoup(html, 'html.parser')
Example : 1,823.0 -> 1823.0"""
return float(re.sub(r'[^\w\s.]','', string_number))
```
%% Cell type:code id: tags:
%% Cell type:markdown id: tags:
``` python
tables = soup.find_all('table')
```
## 1 - Create depression table
%% Cell type:code id: tags:
``` python
#convert a number formatted as a string (e.g. '1,156.30') to a float
#re.sub() returns the string with every match of the regex replaced (here: removed)
import re
def process_num(num):
    """Convert a formatted number string to a float.

    Every character that is not a word character, whitespace or a dot is
    removed before conversion, which drops thousands separators.

    Example: '1,156.30' -> 1156.3

    Args:
        num: The number as text, e.g. the stripped text of a table cell.

    Returns:
        The parsed value as a float; negative when the text starts with
        '-' or the Unicode minus sign (U+2212).

    Raises:
        ValueError: If the cleaned text is not a valid float literal.
    """
    text = num.strip()
    # The cleaning regex below also removes '-' and U+2212, so remember the
    # sign first; otherwise negative values would silently become positive.
    is_negative = text.startswith(('-', '\u2212'))
    value = float(re.sub(r'[^\w\s.]', '', text))
    return -value if is_negative else value
depression_html = read_html_file(DATA_PATH, DEPRESSION_FILENAME)
depression_soup = BeautifulSoup(depression_html, 'html.parser')
```
%% Cell type:code id: tags:
``` python
num1 = re.sub(r'[^\w\s.]','','1,156.30')
num1
```
depression_rates = [] # preparing list to contain the different depression rates
depression_countries = [] # preparing list to contain the different country names
%% Output
COUNTRY_POSITION_IN_DEP_TABLE = 0
RATE_POSITION_IN_DEP_TABLE = 2
'1156.30'
def extract_depression_rates(depression_soup: BeautifulSoup) -> pd.DataFrame:
"""Extract depression rates from soup build from Wikipedia depression table
"""
%% Cell type:code id: tags:
# Extract the table from the soup
tables = depression_soup.find_all('table')
depression_table = tables[0] # ignore the glossary at the end
``` python
ranks = []
rates = []
countries = []
links = []
# Loop over rows
## @COMPLETE : extract all the rows
# table_rows = ...
for table_row in table_rows:
## @COMPLETE : extract all the cells
# table_cells = ...
for table in tables:
rows = table.find_all('tr')
if len(table_cells) > 1:
country = table_cells[COUNTRY_POSITION_IN_DEP_TABLE]
depression_countries.append(country.text.strip())
for row in rows:
cells = row.find_all('td')
rate = table_cells[RATE_POSITION_IN_DEP_TABLE]
depression_rates.append(round(float(rate.text.strip())))
return pd.DataFrame(depression_rates, index= depression_countries, columns = ['DALY rate'])
if len(cells) > 1:
rank = cells[0]
ranks.append(int(rank.text))
country = cells[1]
countries.append(country.text.strip())
df_depression = extract_depression_rates(depression_soup)
print(f'Extracted depression data for {df_depression.shape[0]} countries')
display(df_depression.head())
```
rate = cells[2]
rates.append(process_num(rate.text.strip()))
%% Cell type:markdown id: tags:
link = cells[1].find('a').get('href')
links.append('https://en.wikipedia.org/'+ link)
## 2 - Create sunshine table
df1 = pd.DataFrame(ranks, index= countries, columns = ['Rank'])
df1['DALY rate'] = rates
%% Cell type:code id: tags:
df1.head(10)
``` python
sunshine_html = read_html_file(DATA_PATH, SUNSHINE_FILENAME)
sunshine_soup = BeautifulSoup(sunshine_html, 'html.parser')
```
%% Output
Rank DALY rate
United States 1 1454.74
Nepal 2 1424.48
East Timor 3 1404.10
Bangladesh 4 1401.53
India 5 1400.84
Pakistan 6 1400.42
Brazil 7 1396.10
Maldives 8 1391.61
Bhutan 9 1385.53
Afghanistan 10 1385.14
%% Cell type:code id: tags:
``` python
sun_url = urlopen('https://en.wikipedia.org/wiki/List_of_cities_by_sunshine_duration')
sun = BeautifulSoup(sun_url, 'html.parser')
tables = sun.find_all('table')
#Dictionary to hold the name of the country and its corresponding sunshine hours
country_suns = {}
country_sunshine = {}
#Dictionary to hold the country and its frequency in the table
count = {}
for table in tables:
if len(table) >1:
rows = table.find_all('tr')
#Skip the first row, which is the name of the columns
for row in rows[1:]:
cells = row.find_all('td')
country = cells[0].text.strip()
#If country in the list of country we found previously
#append the country to the dictionary
if country in countries:
sun = cells[-2].text.strip()
sun = process_num(sun)/10
#If country is already in the dictionary
#add to the existing sun hours of that country and the count to keep track of how many times we add
if country in country_suns:
count[country] += 1
country_suns[country] += sun
COUNTRY_POSITION_IN_SUN_TABLE = 0
SUNSHINE_POSITION_IN_SUN_TABLE = -2
def extract_monthly_sunshine_hours(sunshine_soup: BeautifulSoup) -> pd.DataFrame:
"""Extract average monthly sunshine hours from soup build from Wikipedia sunshine table
"""
sunshine_tables = sunshine_soup.find_all('table')
# Loop over tables
for table in sunshine_tables:
if len(table) >1:
# Loop over rows
## @COMPLETE : extract all the rows
# table_rows = ...
for table_row in table_rows[1:]: # skip the first row (header)
## @COMPLETE : extract all the cells
# table_cells = ...
# Extract country and sunshine hours
country = table_cells[COUNTRY_POSITION_IN_SUN_TABLE].text.strip()
yearly_sun_hours = table_cells[SUNSHINE_POSITION_IN_SUN_TABLE].text.strip()
yearly_sun_hours = process_num(yearly_sun_hours)
monthly_sun_hours = yearly_sun_hours/12
# Record hours for every city in the country
if country in country_sunshine:
country_sunshine[country].append(monthly_sun_hours)
else:
count[country] = 1
country_suns[country] = sun
#Find the average sunshine hours of each country
for country in country_suns:
print(country_suns[country],count[country])
country_suns[country] = round(country_suns[country]/count[country],2)
print('Country: {}, Sunshine Hours: {}'.format(country,country_suns[country]))
country_sunshine[country] = [monthly_sun_hours]
```
%% Output
789.14 3
Country: Benin, Sunshine Hours: 263.05
515.99 2
Country: Togo, Sunshine Hours: 258.0
710.25 3
Country: Ghana, Sunshine Hours: 236.75
866.0500000000001 4
Country: Cameroon, Sunshine Hours: 216.51
344.03999999999996 2
Country: Gabon, Sunshine Hours: 172.02
1334.54 5
Country: Nigeria, Sunshine Hours: 266.91
711.91 2
Country: Sudan, Sunshine Hours: 355.95
336.1 1
Country: Eritrea, Sunshine Hours: 336.1
641.8 2
Country: Burkina Faso, Sunshine Hours: 320.9
320.32 1
Country: Niger, Sunshine Hours: 320.32
670.6400000000001 2
Country: Chad, Sunshine Hours: 335.32
307.0 1
Country: Gambia, Sunshine Hours: 307.0
629.2 2
Country: Senegal, Sunshine Hours: 314.6
620.5999999999999 2
Country: Somalia, Sunshine Hours: 310.3
327.9 1
Country: Djibouti, Sunshine Hours: 327.9
964.0099999999999 3
Country: Mali, Sunshine Hours: 321.34
653.3 2
Country: Algeria, Sunshine Hours: 326.65
609.99 2
Country: Tunisia, Sunshine Hours: 305.0
946.64 3
Country: Morocco, Sunshine Hours: 315.55
2253.8500000000004 6
Country: Egypt, Sunshine Hours: 375.64
635.6199999999999 2
Country: Libya, Sunshine Hours: 317.81
1212.01 4
Country: Kenya, Sunshine Hours: 303.0
234.1 1
Country: Angola, Sunshine Hours: 234.1
1213.1399999999999 4
Country: Tanzania, Sunshine Hours: 303.28
556.97 2
Country: Ethiopia, Sunshine Hours: 278.49
666.5 2
Country: Mauritania, Sunshine Hours: 333.25
1884.79 6
Country: South Africa, Sunshine Hours: 314.13
1028.0 3
Country: Botswana, Sunshine Hours: 342.67
889.6400000000001 3
Country: Zambia, Sunshine Hours: 296.55
613.08 2
Country: Zimbabwe, Sunshine Hours: 306.54
838.76 3
Country: Malawi, Sunshine Hours: 279.59
1718.66 6
Country: Madagascar, Sunshine Hours: 286.44
283.8 1
Country: Mozambique, Sunshine Hours: 283.8
681.8 3
Country: Uganda, Sunshine Hours: 227.27
237.34 1
Country: Burundi, Sunshine Hours: 237.34
488.0 2
Country: Guinea, Sunshine Hours: 244.0
270.7 1
Country: Guinea-Bissau, Sunshine Hours: 270.7
309.79 2
Country: Equatorial Guinea, Sunshine Hours: 154.9
747.5 2
Country: Namibia, Sunshine Hours: 373.75
317.51 1
Country: Afghanistan, Sunshine Hours: 317.51
220.74 1
Country: Azerbaijan, Sunshine Hours: 220.74
206.6 1
Country: Bangladesh, Sunshine Hours: 206.6
1091.49 5
Country: China, Sunshine Hours: 218.3
973.66 4
Country: India, Sunshine Hours: 243.41
298.33000000000004 1
Country: Indonesia, Sunshine Hours: 298.33
282.61 1
Country: Iran, Sunshine Hours: 282.61
324.08000000000004 1
Country: Iraq, Sunshine Hours: 324.08
331.1 1
Country: Israel, Sunshine Hours: 331.1
361.71000000000004 2
Country: Japan, Sunshine Hours: 180.86
486.29999999999995 2
Country: Kazakhstan, Sunshine Hours: 243.15
279.15 1
Country: Mongolia, Sunshine Hours: 279.15
249.2 1
Country: North Korea, Sunshine Hours: 249.2
349.33000000000004 1
Country: Oman, Sunshine Hours: 349.33
598.4300000000001 2
Country: Pakistan, Sunshine Hours: 299.22
210.31 1
Country: Philippines, Sunshine Hours: 210.31
1578.2299999999998 8
Country: Russia, Sunshine Hours: 197.28
647.3 2
Country: Saudi Arabia, Sunshine Hours: 323.65
202.24 1
Country: Singapore, Sunshine Hours: 202.24
439.33000000000004 2
Country: South Korea, Sunshine Hours: 219.67
870.0099999999999 4
Country: Thailand, Sunshine Hours: 217.5
466.76 2
Country: Turkey, Sunshine Hours: 233.38
282.39 1
Country: Uzbekistan, Sunshine Hours: 282.39
849.4 4
Country: Vietnam, Sunshine Hours: 212.35
254.4 1
Country: Albania, Sunshine Hours: 254.4
247.4 1
Country: Armenia, Sunshine Hours: 247.4
188.4 1
Country: Austria, Sunshine Hours: 188.4
180.7 1
Country: Belarus, Sunshine Hours: 180.7
154.6 1
Country: Belgium, Sunshine Hours: 154.6
176.9 1
Country: Bosnia and Herzegovina, Sunshine Hours: 176.9
217.7 1
Country: Bulgaria, Sunshine Hours: 217.7
191.3 1
Country: Croatia, Sunshine Hours: 191.3
166.8 1
Country: Czech Republic, Sunshine Hours: 166.8
331.40999999999997 1
Country: Cyprus, Sunshine Hours: 331.41
173.9 1
Country: Denmark, Sunshine Hours: 173.9
182.6 1
Country: Estonia, Sunshine Hours: 182.6
185.8 1
Country: Finland, Sunshine Hours: 185.8
449.8 2
Country: France, Sunshine Hours: 224.9
204.6 1
Country: Georgia, Sunshine Hours: 204.6
328.79999999999995 2
Country: Germany, Sunshine Hours: 164.4
595.0 2
Country: Greece, Sunshine Hours: 297.5
198.8 1
Country: Hungary, Sunshine Hours: 198.8
132.6 1
Country: Iceland, Sunshine Hours: 132.6
145.3 1
Country: Ireland, Sunshine Hours: 145.3
438.8 2
Country: Italy, Sunshine Hours: 219.4
175.4 1
Country: Latvia, Sunshine Hours: 175.4
169.1 1
Country: Lithuania, Sunshine Hours: 169.1
305.4 1
Country: Malta, Sunshine Hours: 305.4
212.6 1
Country: Moldova, Sunshine Hours: 212.6
166.2 1
Country: Netherlands, Sunshine Hours: 166.2
166.8 1
Country: Norway, Sunshine Hours: 166.8
157.1 1
Country: Poland, Sunshine Hours: 157.1
280.6 1
Country: Portugal, Sunshine Hours: 280.6
211.5 1
Country: Romania, Sunshine Hours: 211.5
203.8 1
Country: Slovakia, Sunshine Hours: 203.8
197.4 1
Country: Slovenia, Sunshine Hours: 197.4
826.6 3
Country: Spain, Sunshine Hours: 275.53
374.29999999999995 2
Country: Sweden, Sunshine Hours: 187.15
156.6 1
Country: Switzerland, Sunshine Hours: 156.6
195.5 1
Country: Ukraine, Sunshine Hours: 195.5
306.0 2
Country: United Kingdom, Sunshine Hours: 153.0
1825.24 9
Country: Canada, Sunshine Hours: 202.8
225.98000000000002 1
Country: Honduras, Sunshine Hours: 225.98
1038.5 4
Country: Mexico, Sunshine Hours: 259.62
275.99 1
Country: Nicaragua, Sunshine Hours: 275.99
174.35 1
Country: Panama, Sunshine Hours: 174.35
295.7 1
Country: El Salvador, Sunshine Hours: 295.7
15218.579999999998 54
Country: United States, Sunshine Hours: 281.83
1149.52 5
Country: Argentina, Sunshine Hours: 229.9
228.89000000000001 1
Country: Bolivia, Sunshine Hours: 228.89
1322.58 6
Country: Brazil, Sunshine Hours: 220.43
953.81 6
Country: Colombia, Sunshine Hours: 158.97
1324.27 5
Country: Chile, Sunshine Hours: 264.85
381.90999999999997 2
Country: Ecuador, Sunshine Hours: 190.95
280.3 1
Country: Paraguay, Sunshine Hours: 280.3
604.0 3
Country: Peru, Sunshine Hours: 201.33
248.14000000000001 1
Country: Uruguay, Sunshine Hours: 248.14
579.0899999999999 2
Country: Venezuela, Sunshine Hours: 289.54
2553.15 9
Country: Australia, Sunshine Hours: 283.68
192.2 1
Country: Fiji, Sunshine Hours: 192.2
613.1999999999999 3
Country: New Zealand, Sunshine Hours: 204.4
246.3 1
Country: Papua New Guinea, Sunshine Hours: 246.3
233.0 1
Country: Solomon Islands, Sunshine Hours: 233.0
# Finally, take the average monthly sunshine hours over each country
for country in country_sunshine:
country_sunshine[country] = round(np.average(country_sunshine[country]))
%% Cell type:code id: tags:
return pd.DataFrame.from_dict(country_sunshine, orient='index', columns = ['Sunshine Hours/Month'])
``` python
df2 = pd.DataFrame.from_dict(country_suns,orient='index', columns = ['Sunshine Hours/Year'])
df = df1.join(df2)
df.info()
df_sunshine = extract_monthly_sunshine_hours(sunshine_soup)
print(f'Extracted sunshine data for {df_sunshine.shape[0]} countries')
display(df_sunshine.head())
```
%% Output
<class 'pandas.core.frame.DataFrame'>
Index: 192 entries, United States to Japan
Data columns (total 3 columns):
Rank 192 non-null int64
DALY rate 192 non-null float64
Sunshine Hours/Year 122 non-null float64
dtypes: float64(2), int64(1)
memory usage: 11.0+ KB
%% Cell type:markdown id: tags:
%% Cell type:code id: tags:
``` python
df.dropna(inplace=True)
```
## 3 - Compare depression to sunshine
%% Cell type:code id: tags:
``` python
df.info()
import matplotlib.pyplot as plt
import seaborn as sns
sns.scatterplot('Rank', 'Sunshine Hours/Year', data=df)
df_joined = df_depression.join(df_sunshine)
df_joined = df_joined[~df_joined.isnull().any(axis=1)]
print(f'Having both depression and sunshine information for {df_joined.shape[0]} countries')
display(df_joined.head())
```
%% Output
<class 'pandas.core.frame.DataFrame'>
Index: 122 entries, United States to Japan
Data columns (total 3 columns):
Rank 122 non-null int64
DALY rate 122 non-null float64
Sunshine Hours/Year 122 non-null float64
dtypes: float64(2), int64(1)
memory usage: 8.8+ KB
<matplotlib.axes._subplots.AxesSubplot at 0x1a1a728410>
%% Cell type:code id: tags:
``` python
df.corr()
correlation = df_joined.corr().iloc[0,1]
sns.scatterplot(
data=df_joined,
x='DALY rate',
y='Sunshine Hours/Month'
).set_title(f'Pearson correlation : {correlation: 5.2f}');
```
%% Output
Rank DALY rate Sunshine Hours/Year
Rank 1.000000 -0.963597 0.346623
DALY rate -0.963597 1.000000 -0.285906
Sunshine Hours/Year 0.346623 -0.285906 1.000000
%% Cell type:code id: tags:
``` python
```
......
0% Chargement en cours ou .
You are about to add 0 people to the discussion. Proceed with caution.
Terminez d'abord l'édition de ce message.
Veuillez vous inscrire ou vous pour commenter