Skip to content
Extraits de code Groupes Projets
Valider ef3c0dae rédigé par Corentin Vande Kerckhove's avatar Corentin Vande Kerckhove
Parcourir les fichiers

rename data for ex 3

parent 935ce792
Aucune branche associée trouvée
Aucune étiquette associée trouvée
Aucune requête de fusion associée trouvée
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# Exercise 1 - Parsing HTML # Exercise 1 - Parsing HTML
The following notebook is inspired by https://github.com/khuyentran1401/Web-Scrapping-Wikipedia The following notebook is inspired by https://github.com/khuyentran1401/Web-Scrapping-Wikipedia
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
### Packages , Paths and Functions ### Packages , Paths and Functions
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
import time import time
import re import re
from pathlib import Path from pathlib import Path
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import seaborn as sns import seaborn as sns
DATA_PATH = Path('../data/') DATA_PATH = Path('../data/')
# Epidemiology webpage : https://en.wikipedia.org/wiki/Epidemiology_of_depression # Epidemiology webpage : https://en.wikipedia.org/wiki/Epidemiology_of_depression
DEPRESSION_FILENAME = 'a1_epidemiology_of_depression.html' # Stored locally DEPRESSION_FILENAME = 'epidemiology_of_depression.html' # Stored locally
# Sunshine webpage : https://en.wikipedia.org/wiki/List_of_cities_by_sunshine_duration # Sunshine webpage : https://en.wikipedia.org/wiki/List_of_cities_by_sunshine_duration
SUNSHINE_FILENAME = 'a1_city_sunshine_duration.html' # Stored locally SUNSHINE_FILENAME = 'city_sunshine_duration.html' # Stored locally
def read_html_file(path: Path, filename: str) -> str:
    """Read an HTML file stored locally and return its content as text.

    Args:
        path: Directory that contains the file.
        filename: Name of the HTML file inside ``path``.

    Returns:
        The whole content of the file as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8: without an encoding, ``open`` falls back to
    # the platform locale encoding, which can fail on (or garble) non-ASCII
    # text. NOTE(review): assumes the saved pages are UTF-8 — confirm.
    with open(path / filename, "r", encoding="utf-8") as file:
        return file.read()
def process_num(string_number: str) -> float:
    """Convert a string number that uses commas as thousands separators.

    Bracketed markers such as ``[1]`` are dropped before conversion, and a
    leading minus sign (ASCII ``-`` or Unicode ``\u2212``) is preserved.

    Example : 1,823.0 -> 1823.0

    Args:
        string_number: Textual number, e.g. ``"1,823.0"``.

    Returns:
        The parsed value as a float.

    Raises:
        ValueError: If what remains after cleaning is not a valid number.
    """
    # Remove whole bracketed markers first; stripping only the brackets
    # would glue the marker's digits onto the number ("2,000[1]" -> 20001).
    text = re.sub(r'\[[^\]]*\]', '', string_number).strip()
    text = text.replace('\u2212', '-')

    # The punctuation filter below also deletes '-', so remember the sign.
    sign = '-' if text.startswith('-') else ''

    # Keep only word characters, whitespace and the decimal point.
    digits = re.sub(r'[^\w\s.]', '', text).strip()
    return float(sign + digits)
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## 1 - Create depression table ## 1 - Create depression table
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
depression_html = read_html_file(DATA_PATH, DEPRESSION_FILENAME) depression_html = read_html_file(DATA_PATH, DEPRESSION_FILENAME)
depression_soup = BeautifulSoup(depression_html, 'html.parser') depression_soup = BeautifulSoup(depression_html, 'html.parser')
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
depression_rates = [] # preparing list to contain the different depression rates depression_rates = [] # preparing list to contain the different depression rates
depression_countries = [] # preparing list to contain the different country names depression_countries = [] # preparing list to contain the different country names
COUNTRY_POSITION_IN_DEP_TABLE = 0 COUNTRY_POSITION_IN_DEP_TABLE = 0
RATE_POSITION_IN_DEP_TABLE = 2 RATE_POSITION_IN_DEP_TABLE = 2
def extract_depression_rates(depression_soup: BeautifulSoup) -> pd.DataFrame:
    """Extract depression rates from a soup built from the Wikipedia depression page.

    Only the first ``<table>`` of the soup is read. For every row that has
    enough ``<td>`` cells, the country name and the rate are taken from the
    cells at ``COUNTRY_POSITION_IN_DEP_TABLE`` and ``RATE_POSITION_IN_DEP_TABLE``.

    Args:
        depression_soup: Parsed HTML of the depression page.

    Returns:
        A DataFrame indexed by country name with a single ``'DALY rate'``
        column holding each rate rounded to the nearest integer.

    Raises:
        IndexError: If the soup contains no ``<table>``.
        ValueError: If a rate cell does not contain a plain number.
    """
    # Collect into locals so that calling the function again does not
    # append to the results of a previous call.
    rates = []
    countries = []

    # Extract the table from the soup
    tables = depression_soup.find_all('table')
    depression_table = tables[0]  # ignore the glossary at the end

    # A row is usable only if it reaches both the country and the rate cell.
    min_cells = max(COUNTRY_POSITION_IN_DEP_TABLE, RATE_POSITION_IN_DEP_TABLE) + 1

    # Loop over rows
    for table_row in depression_table.find_all('tr'):
        table_cells = table_row.find_all('td')
        if len(table_cells) < min_cells:
            # Rows without enough <td> cells carry no data.
            continue
        country = table_cells[COUNTRY_POSITION_IN_DEP_TABLE]
        countries.append(country.text.strip())
        rate = table_cells[RATE_POSITION_IN_DEP_TABLE]
        rates.append(round(float(rate.text.strip())))

    return pd.DataFrame(rates, index=countries, columns=['DALY rate'])
df_depression = extract_depression_rates(depression_soup) df_depression = extract_depression_rates(depression_soup)
print(f'Extracted depression data for {df_depression.shape[0]} countries') print(f'Extracted depression data for {df_depression.shape[0]} countries')
display(df_depression.head()) display(df_depression.head())
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## 2 - Create sunshine table ## 2 - Create sunshine table
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
sunshine_html = read_html_file(DATA_PATH, SUNSHINE_FILENAME) sunshine_html = read_html_file(DATA_PATH, SUNSHINE_FILENAME)
sunshine_soup = BeautifulSoup(sunshine_html, 'html.parser') sunshine_soup = BeautifulSoup(sunshine_html, 'html.parser')
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Dictionary to hold the name of each country and its corresponding sunshine hours # Dictionary to hold the name of each country and its corresponding sunshine hours
country_sunshine = {} country_sunshine = {}
COUNTRY_POSITION_IN_SUN_TABLE = 0 COUNTRY_POSITION_IN_SUN_TABLE = 0
SUNSHINE_POSITION_IN_SUN_TABLE = -2 SUNSHINE_POSITION_IN_SUN_TABLE = -2
def extract_monthly_sunshine_hours(sunshine_soup: BeautifulSoup) -> pd.DataFrame:
    """Extract average monthly sunshine hours from a soup built from the Wikipedia sunshine page.

    Every ``<table>`` of the soup is scanned. In each data row the country is
    read from the cell at ``COUNTRY_POSITION_IN_SUN_TABLE`` and the yearly
    sunshine hours from the cell at ``SUNSHINE_POSITION_IN_SUN_TABLE``; the
    yearly figure is divided by 12 and then averaged over all rows (cities)
    of the same country.

    Args:
        sunshine_soup: Parsed HTML of the sunshine-duration page.

    Returns:
        A DataFrame indexed by country name with a single
        ``'Sunshine Hours/Month'`` column holding the rounded average.

    Raises:
        ValueError: If a yearly-hours cell cannot be parsed by ``process_num``.
    """
    # Collected locally so the function can be called more than once: a
    # shared dict would already hold averaged scalars on the second call.
    hours_by_country = {}

    # Loop over tables
    for table in sunshine_soup.find_all('table'):
        # NOTE(review): ``len(table)`` presumably counts the tag's direct
        # children, so this skips near-empty tables — confirm against bs4.
        if len(table) <= 1:
            continue

        # Loop over rows
        table_rows = table.find_all('tr')
        for table_row in table_rows[1:]:  # skip the first row (header)
            table_cells = table_row.find_all('td')
            if len(table_cells) < 2:
                # Not enough cells to hold both a country and a yearly total.
                continue

            # Extract country and sunshine hours
            country = table_cells[COUNTRY_POSITION_IN_SUN_TABLE].text.strip()
            yearly_sun_hours = process_num(
                table_cells[SUNSHINE_POSITION_IN_SUN_TABLE].text.strip()
            )
            monthly_sun_hours = yearly_sun_hours / 12

            # Record hours for every city in the country
            hours_by_country.setdefault(country, []).append(monthly_sun_hours)

    # Finally, take the average sunshine hours over each country
    average_by_country = {
        country: round(np.average(city_hours))
        for country, city_hours in hours_by_country.items()
    }

    return pd.DataFrame.from_dict(
        average_by_country, orient='index', columns=['Sunshine Hours/Month']
    )
df_sunshine = extract_monthly_sunshine_hours(sunshine_soup) df_sunshine = extract_monthly_sunshine_hours(sunshine_soup)
print(f'Extracted sunshine data for {df_sunshine.shape[0]} countries') print(f'Extracted sunshine data for {df_sunshine.shape[0]} countries')
display(df_sunshine.head()) display(df_sunshine.head())
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## 3 - Compare depression to sunshine ## 3 - Compare depression to sunshine
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
df_joined = df_depression.join(df_sunshine) df_joined = df_depression.join(df_sunshine)
df_joined = df_joined[~df_joined.isnull().any(axis=1)] df_joined = df_joined[~df_joined.isnull().any(axis=1)]
print(f'Having both depression and sunshine information for {df_joined.shape[0]} countries') print(f'Having both depression and sunshine information for {df_joined.shape[0]} countries')
display(df_joined.head()) display(df_joined.head())
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
correlation = df_joined.corr().iloc[0,1] correlation = df_joined.corr().iloc[0,1]
sns.scatterplot( sns.scatterplot(
data=df_joined, data=df_joined,
x='DALY rate', x='DALY rate',
y='Sunshine Hours/Month' y='Sunshine Hours/Month'
).set_title(f'Pearson correlation : {correlation: 5.2f}'); ).set_title(f'Pearson correlation : {correlation: 5.2f}');
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
``` ```
......
Fichier déplacé
0% Chargement en cours ou .
You are about to add 0 people to the discussion. Proceed with caution.
Terminez d'abord l'édition de ce message.
Veuillez vous inscrire ou vous pour commenter