Skip to content
Extraits de code Groupes Projets
exercise3.ipynb 7,66 ko
Newer Older
  • Learn to ignore specific revisions
  • {
     "cells": [
    
      {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
        "# Exercise 1 - Parsing HTML\n",
    
        "The following notebook is inspired by https://github.com/khuyentran1401/Web-Scrapping-Wikipedia"
    
       "cell_type": "markdown",
    
       "metadata": {},
       "source": [
    
        "### Packages , Paths and Functions "
    
       ]
      },
      {
       "cell_type": "code",
    
       "execution_count": null,
    
       "metadata": {},
       "outputs": [],
       "source": [
    
        "import time\n",
        "import re\n",
        "from pathlib import Path\n",
        "\n",
        "import numpy as np\n",
        "import pandas as pd\n",
        "from bs4 import BeautifulSoup\n",
        "import seaborn as sns\n",
        "\n",
        "DATA_PATH = Path('../data/')\n",
        "\n",
        "# Epidemiology webpage : https://en.wikipedia.org/wiki/Epidemiology_of_depression\n",
    
        "DEPRESSION_FILENAME = 'epidemiology_of_depression.html' # Stored locally\n",
    
        "\n",
        "# Epidemiology webpage : https://en.wikipedia.org/wiki/List_of_cities_by_sunshine_duration\n",
    
        "SUNSHINE_FILENAME = 'city_sunshine_duration.html' # Stored locally\n",
    
        "\n",
        "def read_html_file(path: Path, filename: str) -> str:\n",
        "    \"\"\"Read an HTML stored locally\"\"\"\n",
        "    with open(path / filename, \"r\") as file:\n",
        "        return file.read()\n",
        "\n",
        "def process_num(string_number : str) -> float:\n",
        "    \"\"\"Convert a string number formatted with a comma to separate thousands\n",
        "    \n",
        "    Example : 1,823.0 -> 1823.0\"\"\"\n",
        "    return float(re.sub(r'[^\\w\\s.]','', string_number))"
    
       "cell_type": "markdown",
    
       "metadata": {},
       "source": [
    
        "## 1 - Create depression table"
    
       ]
      },
      {
       "cell_type": "code",
    
       "execution_count": null,
    
       "metadata": {},
       "outputs": [],
       "source": [
    
        "depression_html = read_html_file(DATA_PATH, DEPRESSION_FILENAME)\n",
        "depression_soup = BeautifulSoup(depression_html, 'html.parser')"
    
       ]
      },
      {
       "cell_type": "code",
    
       "execution_count": null,
    
       "metadata": {},
       "outputs": [],
       "source": [
    
        "depression_rates = []  # preparing list to contain the different depression rates\n",
        "depression_countries = [] # preparing list to contain the different country names\n",
        "\n",
        "COUNTRY_POSITION_IN_DEP_TABLE = 0\n",
        "RATE_POSITION_IN_DEP_TABLE = 2\n",
        "\n",
        "def extract_depression_rates(depression_soup: BeautifulSoup) -> pd.DataFrame:\n",
        "    \"\"\"Extract depression rates from soup build from Wikipedia depression table\n",
        "    \"\"\"\n",
        "    \n",
        "    # Extract the table from the soup\n",
        "    tables = depression_soup.find_all('table')\n",
        "    depression_table = tables[0]  # ignore the glossary at the end\n",
        "    \n",
        "    # Loop over rows\n",
        "    ## @COMPLETE : extract all the rows\n",
        "    # table_rows = ...\n",
        "    for table_row in table_rows:\n",
        "        ## @COMPLETE : extract all the cells\n",
        "        # table_cells = ...\n",
        "\n",
        "        if len(table_cells) > 1:\n",
        "            country = table_cells[COUNTRY_POSITION_IN_DEP_TABLE]\n",
        "            depression_countries.append(country.text.strip())\n",
        "\n",
        "            rate = table_cells[RATE_POSITION_IN_DEP_TABLE]\n",
        "            depression_rates.append(round(float(rate.text.strip())))\n",
        "    return pd.DataFrame(depression_rates, index= depression_countries, columns = ['DALY rate'])\n",
        "\n",
        "df_depression = extract_depression_rates(depression_soup)\n",
        "print(f'Extracted depression data for {df_depression.shape[0]} countries')\n",
        "display(df_depression.head())"
    
       "cell_type": "markdown",
    
       "metadata": {},
       "source": [
    
        "## 2 - Create sunshine table"
    
       ]
      },
      {
       "cell_type": "code",
    
       "execution_count": null,
    
       "metadata": {},
    
       "source": [
    
        "sunshine_html = read_html_file(DATA_PATH, SUNSHINE_FILENAME)\n",
        "sunshine_soup = BeautifulSoup(sunshine_html, 'html.parser')"
    
       ]
      },
      {
       "cell_type": "code",
    
       "execution_count": null,
    
       "metadata": {},
    
       "source": [
        "#Dictionary to hold the name of the country and its corresponding temperature\n",
    
        "country_sunshine = {}\n",
        "\n",
        "COUNTRY_POSITION_IN_SUN_TABLE = 0\n",
        "SUNSHINE_POSITION_IN_SUN_TABLE = -2\n",
    
        "def extract_monthly_sunshine_hours(sunshine_soup: BeautifulSoup) -> pd.DataFrame:\n",
        "    \"\"\"Extract average monthly sunshine hours from soup build from Wikipedia sunshine table\n",
        "    \"\"\"\n",
        "    sunshine_tables = sunshine_soup.find_all('table')\n",
        "    \n",
        "    # Loop over tables\n",
        "    for table in sunshine_tables:\n",
        "        if len(table) >1:\n",
    
        "            # Loop over rows\n",
        "            ## @COMPLETE : extract all the rows\n",
        "            # table_rows = ...\n",
        "            for table_row in table_rows[1:]: # skip the first row (header)\n",
        "                ## @COMPLETE : extract all the cells\n",
        "                # table_cells = ...\n",
    
        "                # Extract country and sunshine hours\n",
        "                country = table_cells[COUNTRY_POSITION_IN_SUN_TABLE].text.strip()\n",
        "                yearly_sun_hours = table_cells[SUNSHINE_POSITION_IN_SUN_TABLE].text.strip()\n",
        "                yearly_sun_hours = process_num(yearly_sun_hours)\n",
        "                monthly_sun_hours = yearly_sun_hours/12\n",
        "\n",
        "                # Record hours for every city in the country\n",
        "                if country in country_sunshine:\n",
        "                    country_sunshine[country].append(monthly_sun_hours)\n",
    
        "                else:\n",
    
        "                    country_sunshine[country] = [monthly_sun_hours]\n",
    
        "                    \n",
        "\n",
    
        "    # Finally, take the average temperature over each country\n",
        "    for country in country_sunshine:\n",
        "        country_sunshine[country] = round(np.average(country_sunshine[country]))\n",
        "    \n",
        "    return pd.DataFrame.from_dict(country_sunshine, orient='index', columns = ['Sunshine Hours/Month'])\n",
    
        "df_sunshine = extract_monthly_sunshine_hours(sunshine_soup)\n",
        "print(f'Extracted sunshine data for {df_sunshine.shape[0]} countries')\n",
        "display(df_sunshine.head())"
    
       "cell_type": "markdown",
    
       "metadata": {},
       "source": [
    
        "## 3 - Compare depression to sunshine\n"
    
       ]
      },
      {
       "cell_type": "code",
    
       "execution_count": null,
    
       "metadata": {},
    
       "source": [
    
        "df_joined = df_depression.join(df_sunshine)\n",
        "df_joined = df_joined[~df_joined.isnull().any(axis=1)]\n",
        "print(f'Having both depression and sunshine information for {df_joined.shape[0]} countries')\n",
        "display(df_joined.head())"
    
       ]
      },
      {
       "cell_type": "code",
    
       "execution_count": null,
    
       "metadata": {},
    
       "source": [
    
        "correlation = df_joined.corr().iloc[0,1]\n",
        "sns.scatterplot(\n",
        "    data=df_joined,\n",
        "    x='DALY rate',\n",
        "    y='Sunshine Hours/Month'\n",
        ").set_title(f'Pearson correlation : {correlation: 5.2f}');"
    
       ]
      },
      {
       "cell_type": "code",
       "execution_count": null,
       "metadata": {},
       "outputs": [],
       "source": []
      }
     ],
     "metadata": {
      "kernelspec": {
    
       "display_name": "Python 3 (ipykernel)",
    
       "language": "python",
       "name": "python3"
      },
      "language_info": {
       "codemirror_mode": {
        "name": "ipython",
        "version": 3
       },
       "file_extension": ".py",
       "mimetype": "text/x-python",
       "name": "python",
       "nbconvert_exporter": "python",
       "pygments_lexer": "ipython3",
    
       "version": "3.9.9"
    
      }
     },
     "nbformat": 4,
     "nbformat_minor": 2