rename data for ex 3

ef3c0dae · Corentin Vande Kerckhove · 935ce792 · ef3c0dae · ef3c0dae · ef3c0dae
--- a/a-data-collection/exercise3.ipynb
+++ b/a-data-collection/exercise3.ipynb
@@ -33,10 +33,10 @@
    "DATA_PATH = Path('../data/')\n",
    "\n",
    "# Epidemiology webpage : https://en.wikipedia.org/wiki/Epidemiology_of_depression\n",
-    "DEPRESSION_FILENAME = 'a1_epidemiology_of_depression.html' # Stored locally\n",
+    "DEPRESSION_FILENAME = 'epidemiology_of_depression.html' # Stored locally\n",
    "\n",
    "# Epidemiology webpage : https://en.wikipedia.org/wiki/List_of_cities_by_sunshine_duration\n",
-    "SUNSHINE_FILENAME = 'a1_city_sunshine_duration.html' # Stored locally\n",
+    "SUNSHINE_FILENAME = 'city_sunshine_duration.html' # Stored locally\n",
    "\n",
    "def read_html_file(path: Path, filename: str) -> str:\n",
    "    \"\"\"Read an HTML stored locally\"\"\"\n",

 %% Cell type:markdown id: tags:
 # Exercise 1 - Parsing HTML
 The following notebook is inspired by https://github.com/khuyentran1401/Web-Scrapping-Wikipedia
 %% Cell type:markdown id: tags:
 ### Packages, Paths and Functions
 %% Cell type:code id: tags:
 ``` python
 import time
 import re
 from pathlib import Path
 import numpy as np
 import pandas as pd
 from bs4 import BeautifulSoup
 import seaborn as sns
 DATA_PATH = Path('../data/')
 # Epidemiology webpage : https://en.wikipedia.org/wiki/Epidemiology_of_depression
-DEPRESSION_FILENAME = 'a1_epidemiology_of_depression.html' # Stored locally
+DEPRESSION_FILENAME = 'epidemiology_of_depression.html' # Stored locally
 # Sunshine duration webpage : https://en.wikipedia.org/wiki/List_of_cities_by_sunshine_duration
-SUNSHINE_FILENAME = 'a1_city_sunshine_duration.html' # Stored locally
+SUNSHINE_FILENAME = 'city_sunshine_duration.html' # Stored locally
 def read_html_file(path: Path, filename: str) -> str:
    """Read an HTML file stored locally and return its content as text.

    Args:
        path: Directory that contains the file.
        filename: Name of the file inside ``path``.

    Returns:
        The whole content of the file as a single string.

    Raises:
        FileNotFoundError: If ``path / filename`` does not exist.
    """
    # Decode explicitly as UTF-8: without an encoding, open() falls back to
    # the platform's locale encoding, so the same saved page could be read
    # differently (or fail to decode) from one machine to another.
    with open(path / filename, "r", encoding="utf-8") as file:
        return file.read()
 def process_num(string_number : str) -> float:
    """Parse a number written with thousands separators into a float.

    Every character that is not a word character, whitespace or a dot is
    removed before the conversion, so commas disappear (and so does a
    leading sign).
    Example : 1,823.0 -> 1823.0
    """
    digits_only = re.sub(r'[^\w\s.]', '', string_number)
    return float(digits_only)
 ```
 %% Cell type:markdown id: tags:
 ## 1 - Create depression table
 %% Cell type:code id: tags:
 ``` python
 depression_html = read_html_file(DATA_PATH, DEPRESSION_FILENAME)
 depression_soup = BeautifulSoup(depression_html, 'html.parser')
 ```
 %% Cell type:code id: tags:
 ``` python
 depression_rates = []  # preparing list to contain the different depression rates
 depression_countries = [] # preparing list to contain the different country names
 COUNTRY_POSITION_IN_DEP_TABLE = 0
 RATE_POSITION_IN_DEP_TABLE = 2
 def extract_depression_rates(depression_soup: BeautifulSoup) -> pd.DataFrame:
    """Extract depression rates from a soup built from the Wikipedia depression table.

    Only the first <table> found in the soup is read. For every row holding
    more than one cell, the country name (cell COUNTRY_POSITION_IN_DEP_TABLE)
    and the rate (cell RATE_POSITION_IN_DEP_TABLE, parsed as a float and
    rounded) are appended to the module-level lists `depression_countries`
    and `depression_rates`.

    NOTE: those two lists live outside the function, so calling it again
    without resetting them appends the same rows a second time.

    Exercise: the two `@COMPLETE` markers below must be filled in before the
    function can run (`table_rows` and `table_cells` are otherwise undefined).

    Returns:
        A DataFrame indexed by country name with a single 'DALY rate' column.
    """
    # Extract the table from the soup
    tables = depression_soup.find_all('table')
    depression_table = tables[0]  # ignore the glossary at the end
    # Loop over rows
    ## @COMPLETE : extract all the rows
    # table_rows = ...
    for table_row in table_rows:
        ## @COMPLETE : extract all the cells
        # table_cells = ...
        # Rows with at most one cell are skipped (presumably header or
        # separator rows; confirm against the saved page)
        if len(table_cells) > 1:
            country = table_cells[COUNTRY_POSITION_IN_DEP_TABLE]
            depression_countries.append(country.text.strip())
            rate = table_cells[RATE_POSITION_IN_DEP_TABLE]
            # The rate text is parsed as a float, then rounded
            depression_rates.append(round(float(rate.text.strip())))
    return pd.DataFrame(depression_rates, index= depression_countries, columns = ['DALY rate'])
 df_depression = extract_depression_rates(depression_soup)
 print(f'Extracted depression data for {df_depression.shape[0]} countries')
 display(df_depression.head())
 ```
 %% Cell type:markdown id: tags:
 ## 2 - Create sunshine table
 %% Cell type:code id: tags:
 ``` python
 sunshine_html = read_html_file(DATA_PATH, SUNSHINE_FILENAME)
 sunshine_soup = BeautifulSoup(sunshine_html, 'html.parser')
 ```
 %% Cell type:code id: tags:
 ``` python
 # Dictionary mapping each country name to its sunshine hours (a list of monthly values, later replaced by their average)
 country_sunshine = {}
 COUNTRY_POSITION_IN_SUN_TABLE = 0
 SUNSHINE_POSITION_IN_SUN_TABLE = -2
 def extract_monthly_sunshine_hours(sunshine_soup: BeautifulSoup) -> pd.DataFrame:
    """Extract average monthly sunshine hours from a soup built from the Wikipedia sunshine table.

    Every <table> of the soup is scanned. For each row after the first one,
    the country name (cell COUNTRY_POSITION_IN_SUN_TABLE) and the yearly
    sunshine hours (cell SUNSHINE_POSITION_IN_SUN_TABLE, parsed with
    `process_num`) are read; the yearly value is divided by 12 and stored in
    the module-level dict `country_sunshine`, one entry per row. Each country's
    list is finally replaced by its rounded average.

    NOTE: `country_sunshine` lives outside the function and ends up holding
    numbers instead of lists, so it must be reset to an empty dict before
    the function is called a second time.

    Exercise: the two `@COMPLETE` markers below must be filled in before the
    function can run (`table_rows` and `table_cells` are otherwise undefined).

    Returns:
        A DataFrame indexed by country name with a single
        'Sunshine Hours/Month' column.
    """
    sunshine_tables = sunshine_soup.find_all('table')
    # Loop over tables
    for table in sunshine_tables:
        # Skip tables with at most one child node (presumably empty or
        # layout-only tables; confirm against the saved page)
        if len(table) >1:
            # Loop over rows
            ## @COMPLETE : extract all the rows
            # table_rows = ...
            for table_row in table_rows[1:]: # skip the first row (header)
                ## @COMPLETE : extract all the cells
                # table_cells = ...
                # Extract country and sunshine hours
                country = table_cells[COUNTRY_POSITION_IN_SUN_TABLE].text.strip()
                yearly_sun_hours = table_cells[SUNSHINE_POSITION_IN_SUN_TABLE].text.strip()
                yearly_sun_hours = process_num(yearly_sun_hours)
                # Convert the yearly total into a monthly average
                monthly_sun_hours = yearly_sun_hours/12
                # Record hours for every city in the country
                if country in country_sunshine:
                    country_sunshine[country].append(monthly_sun_hours)
                else:
                    country_sunshine[country] = [monthly_sun_hours]
    # Finally, take the average sunshine hours over each country
    for country in country_sunshine:
        country_sunshine[country] = round(np.average(country_sunshine[country]))
    return pd.DataFrame.from_dict(country_sunshine, orient='index', columns = ['Sunshine Hours/Month'])
 df_sunshine = extract_monthly_sunshine_hours(sunshine_soup)
 print(f'Extracted sunshine data for {df_sunshine.shape[0]} countries')
 display(df_sunshine.head())
 ```
 %% Cell type:markdown id: tags:
 ## 3 - Compare depression to sunshine
 %% Cell type:code id: tags:
 ``` python
 # Join the two tables on their country index, then keep only the countries
 # that have a value in every column (both a DALY rate and sunshine hours).
 df_joined = df_depression.join(df_sunshine).dropna(how='any')
 print(f'Having both depression and sunshine information for {df_joined.shape[0]} countries')
 display(df_joined.head())
 ```
 %% Cell type:code id: tags:
 ``` python
 # df_joined has two columns, so entry [0, 1] of its correlation matrix is
 # the correlation between the DALY rate and the monthly sunshine hours.
 correlation = df_joined.corr().iloc[0, 1]
 # Scatter plot of both variables, with the correlation shown in the title.
 sns.scatterplot(data=df_joined, x='DALY rate', y='Sunshine Hours/Month').set_title(
    f'Pearson correlation : {correlation: 5.2f}'
 );
 ```
 %% Cell type:code id: tags:
 ``` python
 ```

--- a/data/a1_city_sunshine_duration.html
+++ b/data/a1_city_sunshine_duration.html
--- a/data/a1_epidemiology_of_depression.html
+++ b/data/a1_epidemiology_of_depression.html