finish to clean first exercise

861b9f4e · Corentin Vande Kerckhove · a97cdcda · 861b9f4e
--- a/a-data-collection/exercise1.ipynb
+++ b/a-data-collection/exercise1.ipynb
@@ -5,688 +5,208 @@
   "metadata": {},
   "source": [
    "# Exercise 1 - Parsing HTML\n",
-    "The following notebook is greatly inspired by https://github.com/khuyentran1401/Web-Scrapping-Wikipedia"
+    "The following notebook is inspired by https://github.com/khuyentran1401/Web-Scrapping-Wikipedia"
   ]
  },
  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import requests\n",
-    "import urllib.request\n",
-    "import time\n",
-    "from bs4 import BeautifulSoup\n",
-    "import numpy as np\n",
-    "import pandas as pd\n",
-    "from urllib.request import urlopen\n",
-    "\n",
-    "DATA_REPO = "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
+   "cell_type": "markdown",
   "metadata": {},
-   "outputs": [],
   "source": [
-    "url = 'https://en.wikipedia.org/wiki/Epidemiology_of_depression'"
+    "### Packages, Paths and Functions"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
-    "html = urlopen(url)"
+    "import time\n",
+    "import re\n",
+    "from pathlib import Path\n",
+    "\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from bs4 import BeautifulSoup\n",
+    "import seaborn as sns\n",
+    "\n",
+    "DATA_PATH = Path('../data/')\n",
+    "\n",
+    "# Epidemiology webpage : https://en.wikipedia.org/wiki/Epidemiology_of_depression\n",
+    "DEPRESSION_FILENAME = 'a1_epidemiology_of_depression.html' # Stored locally\n",
+    "\n",
+    "# Sunshine duration webpage : https://en.wikipedia.org/wiki/List_of_cities_by_sunshine_duration\n",
+    "SUNSHINE_FILENAME = 'a1_city_sunshine_duration.html' # Stored locally\n",
+    "\n",
+    "def read_html_file(path: Path, filename: str) -> str:\n",
+    "    \"\"\"Read an HTML file stored locally and return its content as a string\"\"\"\n",
+    "    with open(path / filename, \"r\") as file:\n",
+    "        return file.read()\n",
+    "\n",
+    "def process_num(string_number : str) -> float:\n",
+    "    \"\"\"Convert a number string that uses commas as thousands separators to a float\n",
+    "    \n",
+    "    Example : 1,823.0 -> 1823.0\"\"\"\n",
+    "    return float(re.sub(r'[^\\w\\s.]','', string_number))"
   ]
  },
  {
-   "cell_type": "code",
-   "execution_count": 4,
+   "cell_type": "markdown",
   "metadata": {},
-   "outputs": [],
   "source": [
-    "soup = BeautifulSoup(html, 'html.parser')"
+    "## 1 - Create depression table"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
-    "tables = soup.find_all('table')"
+    "depression_html = read_html_file(DATA_PATH, DEPRESSION_FILENAME)\n",
+    "depression_soup = BeautifulSoup(depression_html, 'html.parser')"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
-    "#convert number as string to integer\n",
-    "#re.sub() returns the substring that match the regrex\n",
-    "import re\n",
-    "def process_num(num):\n",
-    "    return float(re.sub(r'[^\\w\\s.]','',num))\n"
+    "depression_rates = []  # preparing list to contain the different depression rates\n",
+    "depression_countries = [] # preparing list to contain the different country names\n",
+    "\n",
+    "COUNTRY_POSITION_IN_DEP_TABLE = 0\n",
+    "RATE_POSITION_IN_DEP_TABLE = 2\n",
+    "\n",
+    "def extract_depression_rates(depression_soup: BeautifulSoup) -> pd.DataFrame:\n",
+    "    \"\"\"Extract depression rates from a soup built from the Wikipedia depression table\n",
+    "    \"\"\"\n",
+    "    \n",
+    "    # Extract the table from the soup\n",
+    "    tables = depression_soup.find_all('table')\n",
+    "    depression_table = tables[0]  # ignore the glossary at the end\n",
+    "    \n",
+    "    # Loop over rows\n",
+    "    ## @COMPLETE : extract all the rows\n",
+    "    # table_rows = ...\n",
+    "    for table_row in table_rows:\n",
+    "        ## @COMPLETE : extract all the cells\n",
+    "        # table_cells = ...\n",
+    "\n",
+    "        if len(table_cells) > 1:\n",
+    "            country = table_cells[COUNTRY_POSITION_IN_DEP_TABLE]\n",
+    "            depression_countries.append(country.text.strip())\n",
+    "\n",
+    "            rate = table_cells[RATE_POSITION_IN_DEP_TABLE]\n",
+    "            depression_rates.append(round(float(rate.text.strip())))\n",
+    "    return pd.DataFrame(depression_rates, index= depression_countries, columns = ['DALY rate'])\n",
+    "\n",
+    "df_depression = extract_depression_rates(depression_soup)\n",
+    "print(f'Extracted depression data for {df_depression.shape[0]} countries')\n",
+    "display(df_depression.head())"
   ]
  },
  {
-   "cell_type": "code",
-   "execution_count": 7,
+   "cell_type": "markdown",
   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'1156.30'"
-      ]
-     },
-     "execution_count": 7,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
   "source": [
-    "num1 = re.sub(r'[^\\w\\s.]','','1,156.30')\n",
-    "num1"
+    "## 2 - Create sunshine table"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>Rank</th>\n",
-       "      <th>DALY rate</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <td>United States</td>\n",
-       "      <td>1</td>\n",
-       "      <td>1454.74</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>Nepal</td>\n",
-       "      <td>2</td>\n",
-       "      <td>1424.48</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>East Timor</td>\n",
-       "      <td>3</td>\n",
-       "      <td>1404.10</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>Bangladesh</td>\n",
-       "      <td>4</td>\n",
-       "      <td>1401.53</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>India</td>\n",
-       "      <td>5</td>\n",
-       "      <td>1400.84</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>Pakistan</td>\n",
-       "      <td>6</td>\n",
-       "      <td>1400.42</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>Brazil</td>\n",
-       "      <td>7</td>\n",
-       "      <td>1396.10</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>Maldives</td>\n",
-       "      <td>8</td>\n",
-       "      <td>1391.61</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>Bhutan</td>\n",
-       "      <td>9</td>\n",
-       "      <td>1385.53</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>Afghanistan</td>\n",
-       "      <td>10</td>\n",
-       "      <td>1385.14</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "               Rank  DALY rate\n",
-       "United States     1    1454.74\n",
-       "Nepal             2    1424.48\n",
-       "East Timor        3    1404.10\n",
-       "Bangladesh        4    1401.53\n",
-       "India             5    1400.84\n",
-       "Pakistan          6    1400.42\n",
-       "Brazil            7    1396.10\n",
-       "Maldives          8    1391.61\n",
-       "Bhutan            9    1385.53\n",
-       "Afghanistan      10    1385.14"
-      ]
-     },
-     "execution_count": 9,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
-    "ranks = []\n",
-    "rates = []\n",
-    "countries = []\n",
-    "links = []\n",
-    "\n",
-    "for table in tables:\n",
-    "    rows = table.find_all('tr')\n",
-    "    \n",
-    "    for row in rows:\n",
-    "        cells = row.find_all('td')\n",
-    "        \n",
-    "        if len(cells) > 1:\n",
-    "            rank = cells[0]\n",
-    "            ranks.append(int(rank.text))\n",
-    "            \n",
-    "            country = cells[1]\n",
-    "            countries.append(country.text.strip())\n",
-    "            \n",
-    "            rate = cells[2]\n",
-    "            rates.append(process_num(rate.text.strip()))\n",
-    "            \n",
-    "            link = cells[1].find('a').get('href')\n",
-    "            links.append('https://en.wikipedia.org/'+ link)\n",
-    "            \n",
-    "df1 = pd.DataFrame(ranks, index= countries, columns = ['Rank'])\n",
-    "df1['DALY rate'] = rates\n",
-    "\n",
-    "df1.head(10)"
+    "sunshine_html = read_html_file(DATA_PATH, SUNSHINE_FILENAME)\n",
+    "sunshine_soup = BeautifulSoup(sunshine_html, 'html.parser')"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "789.14 3\n",
-      "Country: Benin, Sunshine Hours: 263.05\n",
-      "515.99 2\n",
-      "Country: Togo, Sunshine Hours: 258.0\n",
-      "710.25 3\n",
-      "Country: Ghana, Sunshine Hours: 236.75\n",
-      "866.0500000000001 4\n",
-      "Country: Cameroon, Sunshine Hours: 216.51\n",
-      "344.03999999999996 2\n",
-      "Country: Gabon, Sunshine Hours: 172.02\n",
-      "1334.54 5\n",
-      "Country: Nigeria, Sunshine Hours: 266.91\n",
-      "711.91 2\n",
-      "Country: Sudan, Sunshine Hours: 355.95\n",
-      "336.1 1\n",
-      "Country: Eritrea, Sunshine Hours: 336.1\n",
-      "641.8 2\n",
-      "Country: Burkina Faso, Sunshine Hours: 320.9\n",
-      "320.32 1\n",
-      "Country: Niger, Sunshine Hours: 320.32\n",
-      "670.6400000000001 2\n",
-      "Country: Chad, Sunshine Hours: 335.32\n",
-      "307.0 1\n",
-      "Country: Gambia, Sunshine Hours: 307.0\n",
-      "629.2 2\n",
-      "Country: Senegal, Sunshine Hours: 314.6\n",
-      "620.5999999999999 2\n",
-      "Country: Somalia, Sunshine Hours: 310.3\n",
-      "327.9 1\n",
-      "Country: Djibouti, Sunshine Hours: 327.9\n",
-      "964.0099999999999 3\n",
-      "Country: Mali, Sunshine Hours: 321.34\n",
-      "653.3 2\n",
-      "Country: Algeria, Sunshine Hours: 326.65\n",
-      "609.99 2\n",
-      "Country: Tunisia, Sunshine Hours: 305.0\n",
-      "946.64 3\n",
-      "Country: Morocco, Sunshine Hours: 315.55\n",
-      "2253.8500000000004 6\n",
-      "Country: Egypt, Sunshine Hours: 375.64\n",
-      "635.6199999999999 2\n",
-      "Country: Libya, Sunshine Hours: 317.81\n",
-      "1212.01 4\n",
-      "Country: Kenya, Sunshine Hours: 303.0\n",
-      "234.1 1\n",
-      "Country: Angola, Sunshine Hours: 234.1\n",
-      "1213.1399999999999 4\n",
-      "Country: Tanzania, Sunshine Hours: 303.28\n",
-      "556.97 2\n",
-      "Country: Ethiopia, Sunshine Hours: 278.49\n",
-      "666.5 2\n",
-      "Country: Mauritania, Sunshine Hours: 333.25\n",
-      "1884.79 6\n",
-      "Country: South Africa, Sunshine Hours: 314.13\n",
-      "1028.0 3\n",
-      "Country: Botswana, Sunshine Hours: 342.67\n",
-      "889.6400000000001 3\n",
-      "Country: Zambia, Sunshine Hours: 296.55\n",
-      "613.08 2\n",
-      "Country: Zimbabwe, Sunshine Hours: 306.54\n",
-      "838.76 3\n",
-      "Country: Malawi, Sunshine Hours: 279.59\n",
-      "1718.66 6\n",
-      "Country: Madagascar, Sunshine Hours: 286.44\n",
-      "283.8 1\n",
-      "Country: Mozambique, Sunshine Hours: 283.8\n",
-      "681.8 3\n",
-      "Country: Uganda, Sunshine Hours: 227.27\n",
-      "237.34 1\n",
-      "Country: Burundi, Sunshine Hours: 237.34\n",
-      "488.0 2\n",
-      "Country: Guinea, Sunshine Hours: 244.0\n",
-      "270.7 1\n",
-      "Country: Guinea-Bissau, Sunshine Hours: 270.7\n",
-      "309.79 2\n",
-      "Country: Equatorial Guinea, Sunshine Hours: 154.9\n",
-      "747.5 2\n",
-      "Country: Namibia, Sunshine Hours: 373.75\n",
-      "317.51 1\n",
-      "Country: Afghanistan, Sunshine Hours: 317.51\n",
-      "220.74 1\n",
-      "Country: Azerbaijan, Sunshine Hours: 220.74\n",
-      "206.6 1\n",
-      "Country: Bangladesh, Sunshine Hours: 206.6\n",
-      "1091.49 5\n",
-      "Country: China, Sunshine Hours: 218.3\n",
-      "973.66 4\n",
-      "Country: India, Sunshine Hours: 243.41\n",
-      "298.33000000000004 1\n",
-      "Country: Indonesia, Sunshine Hours: 298.33\n",
-      "282.61 1\n",
-      "Country: Iran, Sunshine Hours: 282.61\n",
-      "324.08000000000004 1\n",
-      "Country: Iraq, Sunshine Hours: 324.08\n",
-      "331.1 1\n",
-      "Country: Israel, Sunshine Hours: 331.1\n",
-      "361.71000000000004 2\n",
-      "Country: Japan, Sunshine Hours: 180.86\n",
-      "486.29999999999995 2\n",
-      "Country: Kazakhstan, Sunshine Hours: 243.15\n",
-      "279.15 1\n",
-      "Country: Mongolia, Sunshine Hours: 279.15\n",
-      "249.2 1\n",
-      "Country: North Korea, Sunshine Hours: 249.2\n",
-      "349.33000000000004 1\n",
-      "Country: Oman, Sunshine Hours: 349.33\n",
-      "598.4300000000001 2\n",
-      "Country: Pakistan, Sunshine Hours: 299.22\n",
-      "210.31 1\n",
-      "Country: Philippines, Sunshine Hours: 210.31\n",
-      "1578.2299999999998 8\n",
-      "Country: Russia, Sunshine Hours: 197.28\n",
-      "647.3 2\n",
-      "Country: Saudi Arabia, Sunshine Hours: 323.65\n",
-      "202.24 1\n",
-      "Country: Singapore, Sunshine Hours: 202.24\n",
-      "439.33000000000004 2\n",
-      "Country: South Korea, Sunshine Hours: 219.67\n",
-      "870.0099999999999 4\n",
-      "Country: Thailand, Sunshine Hours: 217.5\n",
-      "466.76 2\n",
-      "Country: Turkey, Sunshine Hours: 233.38\n",
-      "282.39 1\n",
-      "Country: Uzbekistan, Sunshine Hours: 282.39\n",
-      "849.4 4\n",
-      "Country: Vietnam, Sunshine Hours: 212.35\n",
-      "254.4 1\n",
-      "Country: Albania, Sunshine Hours: 254.4\n",
-      "247.4 1\n",
-      "Country: Armenia, Sunshine Hours: 247.4\n",
-      "188.4 1\n",
-      "Country: Austria, Sunshine Hours: 188.4\n",
-      "180.7 1\n",
-      "Country: Belarus, Sunshine Hours: 180.7\n",
-      "154.6 1\n",
-      "Country: Belgium, Sunshine Hours: 154.6\n",
-      "176.9 1\n",
-      "Country: Bosnia and Herzegovina, Sunshine Hours: 176.9\n",
-      "217.7 1\n",
-      "Country: Bulgaria, Sunshine Hours: 217.7\n",
-      "191.3 1\n",
-      "Country: Croatia, Sunshine Hours: 191.3\n",
-      "166.8 1\n",
-      "Country: Czech Republic, Sunshine Hours: 166.8\n",
-      "331.40999999999997 1\n",
-      "Country: Cyprus, Sunshine Hours: 331.41\n",
-      "173.9 1\n",
-      "Country: Denmark, Sunshine Hours: 173.9\n",
-      "182.6 1\n",
-      "Country: Estonia, Sunshine Hours: 182.6\n",
-      "185.8 1\n",
-      "Country: Finland, Sunshine Hours: 185.8\n",
-      "449.8 2\n",
-      "Country: France, Sunshine Hours: 224.9\n",
-      "204.6 1\n",
-      "Country: Georgia, Sunshine Hours: 204.6\n",
-      "328.79999999999995 2\n",
-      "Country: Germany, Sunshine Hours: 164.4\n",
-      "595.0 2\n",
-      "Country: Greece, Sunshine Hours: 297.5\n",
-      "198.8 1\n",
-      "Country: Hungary, Sunshine Hours: 198.8\n",
-      "132.6 1\n",
-      "Country: Iceland, Sunshine Hours: 132.6\n",
-      "145.3 1\n",
-      "Country: Ireland, Sunshine Hours: 145.3\n",
-      "438.8 2\n",
-      "Country: Italy, Sunshine Hours: 219.4\n",
-      "175.4 1\n",
-      "Country: Latvia, Sunshine Hours: 175.4\n",
-      "169.1 1\n",
-      "Country: Lithuania, Sunshine Hours: 169.1\n",
-      "305.4 1\n",
-      "Country: Malta, Sunshine Hours: 305.4\n",
-      "212.6 1\n",
-      "Country: Moldova, Sunshine Hours: 212.6\n",
-      "166.2 1\n",
-      "Country: Netherlands, Sunshine Hours: 166.2\n",
-      "166.8 1\n",
-      "Country: Norway, Sunshine Hours: 166.8\n",
-      "157.1 1\n",
-      "Country: Poland, Sunshine Hours: 157.1\n",
-      "280.6 1\n",
-      "Country: Portugal, Sunshine Hours: 280.6\n",
-      "211.5 1\n",
-      "Country: Romania, Sunshine Hours: 211.5\n",
-      "203.8 1\n",
-      "Country: Slovakia, Sunshine Hours: 203.8\n",
-      "197.4 1\n",
-      "Country: Slovenia, Sunshine Hours: 197.4\n",
-      "826.6 3\n",
-      "Country: Spain, Sunshine Hours: 275.53\n",
-      "374.29999999999995 2\n",
-      "Country: Sweden, Sunshine Hours: 187.15\n",
-      "156.6 1\n",
-      "Country: Switzerland, Sunshine Hours: 156.6\n",
-      "195.5 1\n",
-      "Country: Ukraine, Sunshine Hours: 195.5\n",
-      "306.0 2\n",
-      "Country: United Kingdom, Sunshine Hours: 153.0\n",
-      "1825.24 9\n",
-      "Country: Canada, Sunshine Hours: 202.8\n",
-      "225.98000000000002 1\n",
-      "Country: Honduras, Sunshine Hours: 225.98\n",
-      "1038.5 4\n",
-      "Country: Mexico, Sunshine Hours: 259.62\n",
-      "275.99 1\n",
-      "Country: Nicaragua, Sunshine Hours: 275.99\n",
-      "174.35 1\n",
-      "Country: Panama, Sunshine Hours: 174.35\n",
-      "295.7 1\n",
-      "Country: El Salvador, Sunshine Hours: 295.7\n",
-      "15218.579999999998 54\n",
-      "Country: United States, Sunshine Hours: 281.83\n",
-      "1149.52 5\n",
-      "Country: Argentina, Sunshine Hours: 229.9\n",
-      "228.89000000000001 1\n",
-      "Country: Bolivia, Sunshine Hours: 228.89\n",
-      "1322.58 6\n",
-      "Country: Brazil, Sunshine Hours: 220.43\n",
-      "953.81 6\n",
-      "Country: Colombia, Sunshine Hours: 158.97\n",
-      "1324.27 5\n",
-      "Country: Chile, Sunshine Hours: 264.85\n",
-      "381.90999999999997 2\n",
-      "Country: Ecuador, Sunshine Hours: 190.95\n",
-      "280.3 1\n",
-      "Country: Paraguay, Sunshine Hours: 280.3\n",
-      "604.0 3\n",
-      "Country: Peru, Sunshine Hours: 201.33\n",
-      "248.14000000000001 1\n",
-      "Country: Uruguay, Sunshine Hours: 248.14\n",
-      "579.0899999999999 2\n",
-      "Country: Venezuela, Sunshine Hours: 289.54\n",
-      "2553.15 9\n",
-      "Country: Australia, Sunshine Hours: 283.68\n",
-      "192.2 1\n",
-      "Country: Fiji, Sunshine Hours: 192.2\n",
-      "613.1999999999999 3\n",
-      "Country: New Zealand, Sunshine Hours: 204.4\n",
-      "246.3 1\n",
-      "Country: Papua New Guinea, Sunshine Hours: 246.3\n",
-      "233.0 1\n",
-      "Country: Solomon Islands, Sunshine Hours: 233.0\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
-    "sun_url = urlopen('https://en.wikipedia.org/wiki/List_of_cities_by_sunshine_duration')\n",
-    "sun = BeautifulSoup(sun_url, 'html.parser')\n",
-    "tables = sun.find_all('table')\n",
-    "\n",
    "#Dictionary to hold the name of the country and its corresponding temperature\n",
-    "country_suns = {}\n",
+    "country_sunshine = {}\n",
+    "\n",
+    "COUNTRY_POSITION_IN_SUN_TABLE = 0\n",
+    "SUNSHINE_POSITION_IN_SUN_TABLE = -2\n",
    "\n",
-    "#Dictionary to hold the country and its frequency in the table\n",
-    "count = {}\n",
-    "for table in tables:\n",
-    "    if len(table) >1:\n",
-    "        rows = table.find_all('tr')\n",
-    "        \n",
-    "        #Skip the first row, which is the name of the columns\n",
-    "        for row in rows[1:]:\n",
-    "            cells = row.find_all('td')\n",
-    "            country = cells[0].text.strip()\n",
+    "def extract_monthly_sunshine_hours(sunshine_soup: BeautifulSoup) -> pd.DataFrame:\n",
+    "    \"\"\"Extract average monthly sunshine hours from a soup built from the Wikipedia sunshine table\n",
+    "    \"\"\"\n",
+    "    sunshine_tables = sunshine_soup.find_all('table')\n",
+    "    \n",
+    "    # Loop over tables\n",
+    "    for table in sunshine_tables:\n",
+    "        if len(table) >1:\n",
    "            \n",
-    "            #If country in the list of country we found previously\n",
-    "            #append the country to the dictionary\n",
-    "            if country in countries:\n",
+    "            # Loop over rows\n",
+    "            ## @COMPLETE : extract all the rows\n",
+    "            # table_rows = ...\n",
+    "            for table_row in table_rows[1:]: # skip the first row (header)\n",
+    "                ## @COMPLETE : extract all the cells\n",
+    "                # table_cells = ...\n",
    "                \n",
-    "                sun = cells[-2].text.strip()\n",
-    "                sun = process_num(sun)/10\n",
-    "                \n",
-    "                #If country is already in the dictionary\n",
-    "                #add to the existing sun hours of that country and the count to keep track of how many times we add\n",
-    "                if country in country_suns:\n",
-    "                    count[country] += 1\n",
-    "                    country_suns[country] += sun\n",
-    "                    \n",
+    "                # Extract country and sunshine hours\n",
+    "                country = table_cells[COUNTRY_POSITION_IN_SUN_TABLE].text.strip()\n",
+    "                yearly_sun_hours = table_cells[SUNSHINE_POSITION_IN_SUN_TABLE].text.strip()\n",
+    "                yearly_sun_hours = process_num(yearly_sun_hours)\n",
+    "                monthly_sun_hours = yearly_sun_hours/12\n",
+    "\n",
+    "                # Record hours for every city in the country\n",
+    "                if country in country_sunshine:\n",
+    "                    country_sunshine[country].append(monthly_sun_hours)\n",
    "                else:\n",
-    "                    count[country] = 1\n",
-    "                    country_suns[country] = sun\n",
+    "                    country_sunshine[country] = [monthly_sun_hours]\n",
    "                    \n",
    "\n",
-    "#Find the average temperature of each country\n",
-    "for country in country_suns:\n",
-    "    print(country_suns[country],count[country])\n",
-    "    country_suns[country] = round(country_suns[country]/count[country],2)\n",
-    "    print('Country: {}, Sunshine Hours: {}'.format(country,country_suns[country]))\n",
-    "                "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 20,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "<class 'pandas.core.frame.DataFrame'>\n",
-      "Index: 192 entries, United States to Japan\n",
-      "Data columns (total 3 columns):\n",
-      "Rank                   192 non-null int64\n",
-      "DALY rate              192 non-null float64\n",
-      "Sunshine Hours/Year    122 non-null float64\n",
-      "dtypes: float64(2), int64(1)\n",
-      "memory usage: 11.0+ KB\n"
-     ]
-    }
-   ],
-   "source": [
-    "df2 = pd.DataFrame.from_dict(country_suns,orient='index', columns = ['Sunshine Hours/Year'])\n",
-    "\n",
-    "df = df1.join(df2)\n",
+    "    # Finally, average the monthly sunshine hours over the cities of each country\n",
+    "    for country in country_sunshine:\n",
+    "        country_sunshine[country] = round(np.average(country_sunshine[country]))\n",
+    "    \n",
+    "    return pd.DataFrame.from_dict(country_sunshine, orient='index', columns = ['Sunshine Hours/Month'])\n",
    "\n",
-    "df.info()\n"
+    "df_sunshine = extract_monthly_sunshine_hours(sunshine_soup)\n",
+    "print(f'Extracted sunshine data for {df_sunshine.shape[0]} countries')\n",
+    "display(df_sunshine.head())"
   ]
  },
  {
-   "cell_type": "code",
-   "execution_count": 21,
+   "cell_type": "markdown",
   "metadata": {},
-   "outputs": [],
   "source": [
-    "df.dropna(inplace=True)"
+    "## 3 - Compare depression to sunshine\n"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 38,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "<class 'pandas.core.frame.DataFrame'>\n",
-      "Index: 122 entries, United States to Japan\n",
-      "Data columns (total 3 columns):\n",
-      "Rank                   122 non-null int64\n",
-      "DALY rate              122 non-null float64\n",
-      "Sunshine Hours/Year    122 non-null float64\n",
-      "dtypes: float64(2), int64(1)\n",
-      "memory usage: 8.8+ KB\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "<matplotlib.axes._subplots.AxesSubplot at 0x1a1a728410>"
-      ]
-     },
-     "execution_count": 38,
-     "metadata": {},
-     "output_type": "execute_result"
-    },
-    {
-     "data": {
-      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYwAAAEGCAYAAAB2EqL0AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAgAElEQVR4nO3de5QcdZ338fc3k9uQRBLIqNlcTMDIiqxyiega1wWveFkRdRV3V1B8BB900eOFeGMXj/LsQUWP+giIIhJ1RRRZEHWVRfASBU0iN0UUBB8CMURMwpjECZP5Pn9UNek03T3V1VVdv6r6vM6Zk0lN98y3q6vr+7t861fm7oiIiExmStEBiIhIOShhiIhIIkoYIiKSiBKGiIgkooQhIiKJTC06gH7Mnz/fly5dWnQYIiKlsm7duj+6+0ivzyt1wli6dClr164tOgwRkVIxs9+neZ6GpEREJBElDBERSUQJQ0REElHCEBGRRJQwREQkkVJXSYn0amLCeWD7LnaN72b61CH2nzWdKVOs6LBESkEJQ2pjYsK5fdMob1y9lg1bdrJo3jCfPWEFBz1mjpKGSAIakpLaeGD7roeTBcCGLTt54+q1PLB9V8GRSRVMTDibR8e4d8sONo+OMTFRvVtHqIchtbFrfPfDyaJhw5ad7BrfXVBEUhV16b2qhyG1MX3qEIvmDe+1bdG8YaZPHSooIqmKuvRelTCkNvafNZ3PnrDi4aTRaAXuP2t6wZFJ2dWl96ohKamNKVOMgx4zh8tPXakqKclUo/fanDSq2HtVD0NqZcoUY2TODBbO24eROTOULCQTdem9qochItKnuvRelTBERDLQ6L1WmYakREQkESUMERFJRAlDREQSUcIQEZFElDBERCQRJQwREUlEZbUiIh3o/il7U8IQEWmjLivQ9kJDUiIibdRlBdpeKGGIiLRRlxVoe6GEISLShu6f8ki5JQwzm2lmPzOzm8zsl2b2gXj7F8zsLjO7Mf46NN5uZvZJM7vDzG42s8Pzik1EZDJ1WYG2F3lOeo8Bz3b3P5vZNODHZvad+Gfvcvevtzz+hcDy+OtpwHnxvyKSkKp6slOXFWh7kVvCcHcH/hz/d1r81e2u6McCq+PnXW9mc81sgbtvzCtGkSpRVU/26rACbS9yncMwsyEzuxG4H7ja3W+If3RWPOz0cTNrvBsLgXuanr4h3tb6O082s7Vmtnbz5s15hi9SKqrqkbzlmjDcfbe7HwosAo40s0OA9wB/DTwV2A9YFT+8XRPoET0Sd7/A3Ve4+4qRkZGcIhcpH1X1SN4GUiXl7luB64Bj3H2jR8aAi4Aj44dtABY3PW0RcN8g4hOpAlX1SN7yrJIaMbO58ffDwHOBX5vZgnibAS8Dbo2fciVwQlwt9XRgm+YvRJJTVU95TEw4m0fHuHfLDjaPjjEx0W16Nxx5VkktAC42syGixHSpu19lZt83sxGiIagbgTfFj/828CLgDmAH8PocYxOpHFX1lEOZixMsKkoqpxUrVvjatWuLDkNEJLHNo2Mcd+6aveabFs0b5vJTVw6sIsvM1rn7il6fpyu9RUQGqMzFCUoYIiIDVObiBCUMEZGU0kxel7k4QffDEJFghbzUSdrJ6zIXJ6iHISJBapyQjzt3DSvPvpbjzl3D7ZtGgylB7efK+saSIwvn7cPInBmlSBaghCEigcp6qZOsr30o8+R1WhqSSinkrrJE9B6VW5Yn5DyufWhMXreWx5Zh8jot9TBSCL2rLHqPqiDLaqI8FmYs8+R1WkoYKWhV0PDpPSq/LE/IeQwfNU9er1l1NJefurIUV2v3Q0NSKdRx7LJs9B6VX5bVRJMNH6Udvqzb/TLUw0ihzBfe1IXeo2rIqpqoW28lyfBlWRcLzJrWkkqhzIuH1YXeI2nVqRcx2dpOVTyW0q4lpSGpFMp84U1ddHuPVD1VT52GjyYbvuw0HzbIxQJ
DoYSRUt3GLsuo3XtUtdaikl//Jpvf0HzYHprDkFqpUvVUGUuHQ5wLmKwaS/Nhe6iHIbVSpdZi2YZK2vXuVp90JLNnTuWh8YnCekiTDTE3Ekoj7ucf/Gje/+KD2TW+m82jY7Xq1SlhSK1U6ercsiW/1gQ3MnsGmx78Cyd8/ubChwe7DTE3J5SJiQn+uH0X//S5GwqPuQgakpJayePq3KKGWco2VNKa4N501IG86+s3l2J4sJFQpkyZwilfXFeKmPOgHkYPNMFYfllXuBU5id46VBL60hStvbu5w9NK1UOC8vXqsqaEkVDVqmvqpF2iz2qMv8h5hLKVd7cmuB27dpdueLBKQ5ppaEgqoSpV19RJ3pVERbc4y3Rfhda1l56yeN9Uw4NFVlolHdIMsRosC117GGZmwGPdfeOA4glW0ScGSSfvHkDdW5y9ap1cnjs8vaceUtE9/SS9uqJjzFPXHoZH64ZcNaBYgla2CUaJ5J3o67jEdZZ67SGF0NOfLOYQYsxLkjmMn5nZ4e6+PvdoAla2CUaJ5N0DKNs8Qtk1NwAOWzyXNx11IHOHp7FrfDcTEx7Efq/yaESShPFM4I1mdiewHTCizsfhuUYWGJ0YymkQiV7LxAxOowEwMnsG73zBQay6rPhrODrFWMVhyklXqzWzA9ttd/c7c4moB0WtVivlonLo6mjMD/xh218444pbO64wW6QyzGHktlptIzGY2X7AzBSxiRRKPYDqaPT0Z80YSjTsU0RjodNoBMDm0bFSN1wmTRhm9mLg48Ai4AHgr4DfAn+db2giUnZ5nLCnTDGGp02ddNinyJZ+ayOlDL2OJJJch3EWsBK43d0XA8cA1+UZlMggVbVmvmh5XgOTpDotpGqlkGLpR5JJ73F332xmU8zM3P1qMzsr98hEBqAqLb8Q5XkNTJIilJCqlZLEUoa5tiQ9jG1mNgtYA6w2s3OAiXzDEhmMqrT8QpT3CXuy6yFCuXZqYsLZPeFdYynLvU2SJIyXAX8B3ko0FHUv8A85xiQyMCG1Qqum6BN2KBdVPrB9Fx/61q84+xVP3iuWz7z2iIdjKUvDJUmV1KiZLQKWu/uFZjYTmPQdjx/3Q2BG/He+7u7/bmbLgEuA/YD1wGvdfZeZzQBWA0cQTa6/2t3vTvm6UitDt1CyU+Wa+aIVfbFrKNdO7Rrfzfd+dT+bR3dxxksOZu7wNLbufIj5TbGUpeGSpErqJOAtwL7AgcAS4FzguZM8dQx4trv/2cymAT82s+8Abwc+7u6XmNn5wBuA8+J/t7j7483seOBs4NUpX1cqGs+un3YntdUnHYnj3LtlhxoNfQjhhB1CSXWjUfKLe7ZyyhfXAXuuGWl9TOgNlyQX7t0IHAnc4O6Hxdtucfe/SfxHzPYBfgz8b+BbRAsajpvZ3wJnuvsLzOy78fc/NbOpwB+AEe8SYNYX7m0eHeO4c9cEeTGQ5Ke5Vzk8fYhND46p0SCZSdIQnewxWY985HbhHvCXeMio8YcSp7z4seuAxwOfBu4Etrr7ePyQDcDC+PuFwD0AcTLZBuwP/LHld54MnAywZMmSpKEkUpZuYScaTkunuRW6eXQs19VtpX5ae1rD04cYn3A2btu51+e0U28spJGPJJPea8zsdGCmmR0NfJWEK9i6+253P5Toor8jgSe2e1j8b7tX/ojehbtf4O4r3H3FyMhIkjASK3qSrh9lqbIIXdkbDRKmRqNkwb7DbHpwjJef+5NHfE47VX2FNCGeJGGcDowCvyaqlLoGeF8vf8TdtxJVWD0dmBsPOUGUSO6Lv98ALAaIf74v8Kde/k6/QqmqSCOkg6rMytxokPCl+ZyG1IjpOCQV9yrOcffdRJPS5/Xyi81sBHjI3bea2TDRJPnZwLXAK4kqpU4EroifcmX8/5/GP/9+t/mLPIQwSZdWSAdVmRVd2SPVluZzGtKEeLc5jMcB68zsze6+JsXvXgBcHM9jTAEudferzOxXwCVm9iHgF8CF8eMvBL5
oZncQ9SyOT/E3+xZCVUUaIR1UZVbmRoOEL83nNKRGTNcqKTM7HPgU0XDUeTRd4R3CDZW0vPkeg5oY08S6SHppP6ehVEklKas9CrgMuIU9k9Du7s/u9Y9lTQljb3mfzEOq1hApqxAaXZmX1ZrZo4FzgAOILsC7qY/4ZADyHk7LczE5kboo67A3dK+S+inwI+CZShYCmliXcGmJ+sHoNun9A2AzMJuorFZqThPrEiINlQ5Otx7GBcBTgG+b2TVmtsrMnjKguCRAZb5ORfJVZAs/7TVI6pX0rmMPw92vB64HzjSz/YHnA+8wsycTrTL73+5+6WDClFYh3au4Sq24ECYky6boFn6aodKiYy6rJFd64+4PuPtX3P2EeKmPTwPL8w0tH1VoVRS5DMhkN60pMy2vkk7RqwykuTq/6JjLatKEYWZvNbNHWeRzZrYemO/upbtNa1VOCGU42MuYmMuwX0NUdDFEmqHSPGPO4tgP9fOTZLXak9z9E2b2AuDRwOuBi4Dv5hpZDqpSFlr0B3QyZe3uh75f0xjEEFvRxRBphkrzijmLC/NCXmI/yZBUI8IXARfFJbbhfuq7qMoJIfQF8sraUg99v/ZqUD3qEIoheh0qzSvmNMd+6/t00z3bgv38JOlhrDOz7wHLgPeY2Ryalggpk6JbQlm19kJaW6adsibm0PdrrwbVoy5jMUReMac59lvfp32mDwX7+emaMCy6a9K/ASPA79x9R1wx9fpBBJe1Ik8IWQ7ThP4BLToxpxX6fu3VIBN3Ga9eziPmNMd+6/u0dedDbX/H8PQhNo+OFXpsdh2SipcX/y93Xx/f06JRMXXzQKLLWPMJYc2qo7n81JUDGxfMepgm5GqlEIYo0gp5v/aqakNsZZDm2G99n86/7k4+8son7/U7Vp90JJseHCu8YCfJ4oOfBr7g7j8fTEjJlWnxwXu37GDl2dc+YvuaVUezcN4+BUSUL13PULyyFh+UXa/Hfrv3afVJRzJ75lQeGp9g+tQhHOfl5/7kEb2OtMOLed7T+2jgFDP7PbCdaMLb3f3Jvf6xOivrME1aZRyiqJqqDbGVRa/HfpL36d4tO4KY10iSMF6YexQ1ULUJVSmHMibuOvZOJ3ufQmlwJkkYYVwxUnJq7YlMTsNo7YXS4Ewyh9G4cZIBM4nKa2939yflH153ZZrDEJHJbR6NJnazGquvkix7XrnNYbj737T8ocOBU3r9QyIik+lWClzHoapmIQwvJhmS2ou7rzezp+YRTBXU/aAW6Uensfrh6UMaqgrApAnDzN7e9N8pwOFEN1aSFhp/FelPp7H68QmvxDpwZZekhzGn6ftx4FvAZfmEU25VWdxQpCidikM2btvZV1mpev7ZSDKH8QGAeA0pd/c/5x5VSZV1DSWRkLQbq++nrFQ9/+wkuR/GIWb2C+BW4Jdmts7MDsk/tPLRUgwi+ehnuZmyrp4coiRDUhcAb3f3awHM7Kh42zNyjKuUQqmVFilSHsM//VzHpJ5/dpIkjFmNZAHg7teZ2awcYyotXZwndZfX8E8/SSiUq6SrIMkNlH5nZmeY2dL46/3AXXkHVlZVWu1UpFd5DP/0eyOoMq+eHJpEt2gFPgB8g+hq7x9S0vthiEi+8hj+6bf6UD3/7CSpktoCnDaAWKQGVN5YbXkM/2SRhEK4SroKOiYMM/smXRYedPeX5hKRVJbKG6svj8IPzUGEo+Pig2b2941vgc8C/6v55+7+g3xDm5wWHywXLSxXD1n3ItXQyF7miw82JwQz+3OvCcLMFgOrgccCE8AF7v4JMzsTeCN7lhd5r7t/O37Oe4A3ALuB09z9u738TQmbyhvrIevhH81BhCPp4oNp7okxDrwjXqxwDrDOzK6Of/Zxd/9o84PN7GDgeOBJwF8B/2NmT3B3nU0qQkMLg1Wl+SLNQYShY1mtme3X+AKGzGxey7au3H2ju6+Pvx8FbgMWdnnKscAl7j7m7ncBdwBH9vRqpCcTE87m0THu3bK
DzaNjud9QXuWNg9NvKapIO916GOvYc+MkgPVNP3PggKR/xMyWAocBNwArgbeY2QnAWqJeyBaiZHJ909M20CbBmNnJwMkAS5YsSRpCKQyyRVjEuHCdhxYG3drXQpiSh25zGMuy+ANmNptoddu3ufuDZnYe8EGipPNB4Byiaz3afXoe0Rxy9wuIliZhxYoVlWkuDfoEXtQJpY5DC0UkZ80XSR6SXOmdmplNI0oWX3b3bwC4+yZ33+3uE0TVV41hpw3A4qanLwLuyzO+kAx6gTSdUAaniMXvtBBm7wY9RFtGuSUMMzPgQuA2d/9Y0/YFTQ87jmgVXIArgePNbIaZLQOWAz/LK752ijxgBn0C1wllcIpIzknmi3SC3ENzPsn0fIvWHqwEXgvcYmY3xtveC7zGzA4lGm66m/j+4O7+SzO7FPgVUYXVmwdZIVV0rfegK4i0su7gFFEdNtl8UdHHe2g055NMxwv39nqQ2TOB5e5+kZmNALPjSqZCZXnhXtEXlRXxAa5S2WXIQjw5F328h+beLTtYefa1j9i+ZtXRLJy3TwER5SvzC/eafvG/AyuAg4CLgGnAl4h6EJVR9Jh+ERVEdZyALkKI1WFFH++h0TVCySSZwzgOeCmwHcDd72Pv+3xXQghj+loavbpCe29DON5DomuEkkkyh7HL3d3MHKCqN0/SmL7UiY73vSXpBTYP4Q5PH2J8wnlofCKIHuOgTDqHYWbvJKpYeh7wH0TXTPynu38q//C6y3rxQY3pS53oeE+ueR5qZPYMTj/mIN719ZuDmZPqVdo5jKST3s8Dnk90cd133f3qSZ4yEFqtVqS8ypSwmosEPvPaI/jgVb8qdcFAbpPeAHGCCCJJSP2U6cSSpSq/7hArx7ppLhKYOzyttgUDk056m9nLzey3ZrbNzB40s1Eze3AQwYnU9YKqqr/uIq5+70dzkcDWnQ/VtmAgSZXUh4GXuvu+7v4od5/j7o/KOzARKN+JJStVf91lK+ttrqI6/7o7+cgrn1zLiqokQ1Kb3P223COR4IQwJFK2E0tWqv66y3bdQ2sV1fD0Ib5x6jNqVyWVJGGsNbOvAv8FjDU2NhYTlMmFcOLtVShjzGU7sWSl6q+7jGW9utA1WVntRW02u7uflE9IyZWhSiqUE2+vQlk6oqz7r191eN1lbEgNwiD2S65ltaEqQ8II4cSb5gAMaW2dup5Y6vq662xQDYXMy2rN7HR3/7CZfYr2NzI6rdc/VnXtPuBFj0WnPQBDGhKp61BAXV93nYW+am63KqnGRPdaotu1tn5Jk05lkNOmTim0BC9ttY3W1hEZvKIbmJPpdovWb8b/Xjy4cMqr04n5yresLHRyL+0BGOIKqyJVF1LPvp0ky5s/AXgnsLT58e7+7PzCKl6v48edTsw7d+3u6cSb9bh1PweghkREBiv06rEkZbVfA84HPgeE0S/KWZpx/24n5qQn3jwmvEI/AGUwiphA16R970Lv2Scpq13n7kcMKJ6e5FUllaayKYuTfV4VVfrg1lsvx2ZWx0odyoLLLI8qqf3ib79pZqcCl7P3hXt/6jnKkkgz7p9FyyCvCa8yDS2FlNxCiqUfSStvsjzJh17tI+l0G5JaR1RO2zhS3tX0MwcOyCuooqUd9+/3xBz6hFfeQmqVhhRLv5I2RLI8yRdZ7VOVRB+ijmW17r7M3Q+I/239qmyygOJKSuteyhrSgnshxdKvpLdjzfIkX9QtYKu+ym/RklRJ/SPw3+4+ambvBw4HPujuv8g9uoIUNfEU+oRX3kKqQQ8pln4lLXzIsodbVLGFhsLylaRK6gx3/5qZPRN4AfBRoqqpp+UaWcGKGvcv03xD1kIakssylqKHSJI2RJKc5JO+lqIaP1VK9CFKkjAae/rFwHnufoWZnZlfSFJHExPO0BT4zL8cwSlfWld4CXBWLeRQ5kKSNEQmO8n3+lqKaPyE1OiooiRltVcB9wLPBY4AdgI/c/en5B9ed2VYfFAm13wiGpk
9g9Oes5xl82exz4wh5s+aEWyVVJLWdgiLT2alDK8llAQdujzv6f0q4Bjgo+6+1cwWsHfFlEhfmsedN2zZyeu/8POHT0RFfsi7tZCTnpiqNERShtdS93nAvE16i1Z33wFcAWw3syXANODXeQcm9VGGE1GrpFVURVUL5aEsr6WR6BfO24eROcX1UKto0oRhZv8KbAKuBr4Vf12Vc1wDNTHhbB4d494tO9g8OqYSvAEry4moWdIkV6VS6Sq9FkknyZDUW4GD3P2BvIMpgsY8B6PbeH8Z17tKOrlapSGSKr0WSSfJpPe1wPPcfXwwISWXxaR3GSbyym6ypDwx4WzduYudu3az22HmtCmFTnYnUeeGRtFlwtK/PCe9fwdcZ2bfYu+1pD7W6x8L0aDHz+v4Yet2MdX+s6a3PfHOnxV2sm5tbQ9PH2J8wtm4bWel39c6J0pJMIcB/D+i+YvpwJymr0oY5Ph567IF77v8ZjZs2VH5uZNuSbnMS3A0JlcX7DvMpgfHePm5P6n8chRlfr+kf5P2MNz9A2l+sZktBlYDjwUmgAvc/RPxKrhfJboh093Aq9x9i5kZ8AngRcAO4HXuvj7N3+7FIMfPmz9shy2ey4nPWMY/fe6GyrfUuo33l7FCqlWdlqOowvsl6SWpkrrWzL7f+pXgd48D73D3JwJPB95sZgcD7waucfflwDXx/wFeCCyPv04GzkvxenrWPLSwZtXRXH7qytxO2s0ftjcddSCrLru5Fi21btU1ZayQalWnk2jr+3XY4rlc9Lqnstu90r1kiSSZw3hn0/czgVcQJYOu3H0jsDH+ftTMbgMWAscCR8UPuxi4DlgVb1/t0Sz89WY218wWxL8nV/0sYdDLnERzS3vu8LTanGS6VdeUsUKqVdWWo0ha0TYyewanH3MQ7/r6zZXvJUskyZDUupZNa8zsB738ETNbChwG3AA8ppEE3H2jmT06fthC4J6mp22It+2VMMzsZKIeCEuWLOkljMz1OgHY/GHbuvOhSp1kJtMpKVehVLMKSa9hsmO69f169QXX12IoTiJJymr3a/rvFKL1pD7p7gcl+gNms4EfAGe5+zfMbKu7z236+RZ3nxdXYf2Hu/843n4NcHqbhPWwoteSSnsr1we272JiYoI/bt/FKV9cp9ZZBZS1+q01bsd5+bk/SXRM37tlByvPvvYRv3PNqqNZOG+f3GOX9PIsq22+8944cBfwhoRBTQMuA77s7t+IN29qDDXF61LdH2/fACxuevoi4L4kf6coaW/l2vjgjcyZWeqWtexRxmXp2/UmvvSGpyU+pqs2FCeTS7KWVPOd95a7+/MbvYBu4qqnC4HbWq7ZuBI4Mf7+RKJ1qhrbT7DI04Ftg5i/6Ee/E7Za86beil6Spl11111/3J74mNZSIfXTcUjKzJ4K3OPuf4j/fwLRhPfvgTPd/U9df3F0w6UfAbcQldUCvJdoHuNSYAnRNR7/6O5/ihPM/yVaGXcH8Hp37zreVPSQVD8XMTUPBTQu+npofEI9jZoI4QK4dkNKhy2ey4dedsgj7knSKa6yDsXVXdohqW4JYz3w3Phk/izgEuBfgUOBJ7r7K/sJOAtFJwxI94Fpvf+DKk3qJ4QlaTrFcOVbVrJ7AiWBCkubMLoNSQ019SJeTXTh3WXufgbw+DRBVlGaYaXmoYA3HXXgw8kCqn09huwRwrUbnYaU5g5P11CptNVt0nvIzKbGiw4+h7iUNcHzZBLNJ4ssrsfQsED5hDBhXIWSZhmsbj2MrwA/MLMriG7L+iMAM3s8sG0AsVVW82R543qMZr2cOFrXpwpxHaOiJ3dDFMqEsQovpBddr8OIq5UWAN9z9+3xticAswexztNkipjDyKI1n+UcRghj4d2EMLkbKvUMpSiZT3qXwaATRpYnv6yqpEK/eCr0hCZSR3lMekuLLJd2bh4K2G/WDB49Z2aqYYHQF+8LYXJXJqdhQ0lCCaMHIZ78QhkL7yT0hCblmAeTMKjaqQc
hVLa0Cr3SpUoL81VVp56zrscIU5FzX0oYPQj15BfyOkahJzRp33MemT2DjVv/kviKbxmMootINOndI1W2SNW0K0y46HVP5YwrblWxQmCyKiLRpPeAqG49G5pkDUe7ebBl82cFN18nxc+jakhKBq7obrXsrd2woeN7zdcdtngupz1n+cO3YlXPuhhFz6Oqh1EyobbMe4kry/JkyUZrz3n+rBkP9zoOWzyX0485iDOuuJVnffg6VVEVqOiqSPUwSiTUlnmvcRXdrZbJ6VasYSq6iEQ9jBIJtWXea1y6NqMcmqvvqprgQ+2xd1PkPKoSRomE2jLvNa6iu9XSm6omeF2w2DsljBIJ9YPba1zN3eo1q47m8lNXFj6sJu1NTDhDU+Az/3JE5RJ8qD32kGkOo0RCvXAwTVwhX2wokdZVlT947CEsmz+LfWYMMX9W+UvKQ+2xh0wJo0SKnvAqMq5+L5jUBZe9a26Bb9iyk9d/4ecPXyRWhX1XdIlqGSlhlEyoLfM84+q3OizU6rKiJE2eVW+Bh9pjD5kShgSv01hzu7LOdifDXp5fdb0kz6q3wEPtsYdMk94SvKQt3U5VL1VvKfeil4neOlSzaamf3qiHIcFL2tLtdDK89JS/rXRLuRe9JE+1wKWVehgSvKQt3U4nwyGj8i3lpNKUQKsFLg3qYVRUlaqCkrZ0O/VEpkyZopZyTBO90g/dD6OC6loVVNfX3asqNSYknbT3w1DCCFQ/H+qsbrKStzxOXDoZikwubcLQkFSA+m0pl6EqKK/eQKjXqYhUgSa9M5D1ipf9rnET6ppTzbSOj0ymjCvJVp16GH3Ko6Xcbw+hDBObaV+jhpzqQfNRYVLC6FMeVxH3e4VtGern07xGnUTqQ1fnh0lDUn3KY74giytsQ6+fT/MaNYxVH2WYh6uj3HoYZvZ54CXA/e5+SLztTOCNwOb4Ye9192/HP3sP8AZgN3Cau383r9iylMd6O2XoIfQrzWvUSaQ+qr6OVVnl2cP4AnBMm+0fd/dD469GsjgYOB54Uvycc82sFEdGXuvthN5DyEKvr7EMk/mSjTqsY1VGufUw3P2HZrY04cOPBS5x9zHgLjO7AzgS+GlO4WWmDr2BUJRhMl+yoc9VmIqY9H6LmXF0H/AAAAe/SURBVJ0ArAXe4e5bgIXA9U2P2RBvewQzOxk4GWDJkiU5h5qMav8HQyeRetHnKjyDnvQ+DzgQOBTYCJwTb2/3iW9bdO3uF7j7CndfMTIykk+UEqzGSWTBvtFQxcZtO1WjLzIgA+1huPumxvdm9lngqvi/G4DFTQ9dBNw3wNCkRFReK1KMgfYwzGxB03+PA26Nv78SON7MZpjZMmA58LNBxiblofJakWLkWVb7FeAoYL6ZbQD+HTjKzA4lGm66GzgFwN1/aWaXAr8CxoE3u7tqJaUtldeKFCPPKqnXtNl8YZfHnwWclVc8Uh2q0Rcphq70ltJRjb5IMbSWlJSOymtFiqGEIaWkGn2RwdOQlIiIJKKEISIiiWhISipBN1YSyZ8ShpServwWGQwNSUnp6cpvkcFQwpDS05XfIoOhhCGlpxsriQyGEoaUnq78FhkMTXpL6enKb5HBUMKQStCV3yL505CUiIgkooQhIiKJKGGIiEgiShgiIpKIEoaIiCRi7l50DKmZ2Wbg9ymeOh/4Y8bhZEnxpRdybKD4+qX4+tOI73HuPtLrk0udMNIys7XuvqLoODpRfOmFHBsovn4pvv70G5+GpEREJBElDBERSaSuCeOCogOYhOJLL+TYQPH1S/H1p6/4ajmHISIivatrD0NERHqkhCEiIonUKmGY2TFmdruZ3WFm7w4gnsVmdq2Z3WZmvzSzt8bbzzSze83sxvjrRQXGeLeZ3RLHsTbetp+ZXW1mv43/nVdQbAc17aMbzexBM3tbkfvPzD5vZveb2a1N29ruL4t8Mj4ebzazwwuK7yNm9us4hsvNbG68famZ7Wzaj+cXEFvH99LM3hPvu9v
N7AV5xtYlvq82xXa3md0Ybx/ovov/ZqfzSXbHn7vX4gsYAu4EDgCmAzcBBxcc0wLg8Pj7OcBvgIOBM4F3Fr3P4rjuBua3bPsw8O74+3cDZwcQ5xDwB+BxRe4/4FnA4cCtk+0v4EXAdwADng7cUFB8zwemxt+f3RTf0ubHFRRb2/cy/pzcBMwAlsWf7aFBx9fy83OAfyti38V/s9P5JLPjr049jCOBO9z9d+6+C7gEOLbIgNx9o7uvj78fBW4DFhYZU0LHAhfH318MvKzAWBqeA9zp7mmu/M+Mu/8Q+FPL5k7761hgtUeuB+aa2YJBx+fu33P38fi/1wOL8oyhkw77rpNjgUvcfczd7wLuIPqM56ZbfGZmwKuAr+QZQzddzieZHX91ShgLgXua/r+BgE7OZrYUOAy4Id70lrib+PmihnxiDnzPzNaZ2cnxtse4+0aIDlLg0YVFt8fx7P1hDWX/Qef9FeIxeRJRq7NhmZn9wsx+YGZ/V1BM7d7L0Pbd3wGb3P23TdsK23ct55PMjr86JYx29+sMoqbYzGYDlwFvc/cHgfOAA4FDgY1EXd2irHT3w4EXAm82s2cVGEtbZjYdeCnwtXhTSPuvm6COSTN7HzAOfDnetBFY4u6HAW8H/tPMHjXgsDq9l0HtO+A17N1gKWzftTmfdHxom21d92GdEsYGYHHT/xcB9xUUy8PMbBrRm/tld/8GgLtvcvfd7j4BfJacu9rduPt98b/3A5fHsWxqdF3jf+8vKr7YC4H17r4Jwtp/sU77K5hj0sxOBF4C/LPHA9zxcM8D8ffriOYJnjDIuLq8lyHtu6nAy4GvNrYVte/anU/I8PirU8L4ObDczJbFLdLjgSuLDCge97wQuM3dP9a0vXkc8Tjg1tbnDoKZzTKzOY3viSZHbyXabyfGDzsRuKKI+Jrs1boLZf816bS/rgROiKtVng5sawwdDJKZHQOsAl7q7juato+Y2VD8/QHAcuB3A46t03t5JXC8mc0ws2VxbD8bZGxNngv82t03NDYUse86nU/I8vgb5Cx+0V9EVQG/Icr27wsgnmcSdQFvBm6Mv14EfBG4Jd5+JbCgoPgOIKpEuQn4ZWOfAfsD1wC/jf/dr8B9uA/wALBv07bC9h9R4toIPETUgntDp/1FNCTw6fh4vAVYUVB8dxCNZTeOwfPjx74ift9vAtYD/1BAbB3fS+B98b67HXhhEfsu3v4F4E0tjx3ovov/ZqfzSWbHn5YGERGRROo0JCUiIn1QwhARkUSUMEREJBElDBERSUQJQ0REElHCEEnIzHbHK4/eambftHhV15S/6zozW5FlfCJ5U8IQSW6nux/q7ocQLUL35qIDEhkkJQyRdH5KvFCbmc02s2vMbL1F9w45Nt6+NL43wWfj+xN8z8yGm3+JmU0xs4vN7EMFvAaRnihhiPQoXvLhOexZWuYvwHEeLdJ4NHBOvEwDREtCfNrdnwRsJboCuGEq0UJ/v3H39w8keJE+KGGIJDcc31HtAWA/4Op4uwH/x8xuBv6HqOfxmPhnd7n7jfH364hurNPwGaKb7JyVd+AiWVDCEElup7sfSnRXv+nsmcP4Z2AEOCL++SZgZvyzsabn7ybqVTT8BDjazGYiUgJKGCI9cvdtwGnAO+PlpPcF7nf3h8zsaKKEksSFwLeBr8VLZIsETQlDJAV3/wXRSqTHE81DrDCztUS9jV/38Hs+RrSa6RfNTJ9HCZpWqxURkUTUohERkUSUMEREJBElDBERSUQJQ0REElHCEBGRRJQwREQkESUMERFJ5P8DcMUIv5Egn1AAAAAASUVORK5CYII=\n",
-      "text/plain": [
-       "<Figure size 432x288 with 1 Axes>"
-      ]
-     },
-     "metadata": {
-      "needs_background": "light"
-     },
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
   "source": [
-    "df.info()\n",
-    "\n",
-    "import matplotlib.pyplot as plt\n",
-    "import seaborn as sns\n",
-    "\n",
-    "sns.scatterplot('Rank', 'Sunshine Hours/Year', data=df)"
+    "df_joined = df_depression.join(df_sunshine)\n",
+    "df_joined = df_joined[~df_joined.isnull().any(axis=1)]\n",
+    "print(f'Having both depression and sunshine information for {df_joined.shape[0]} countries')\n",
+    "display(df_joined.head())"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 36,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>Rank</th>\n",
-       "      <th>DALY rate</th>\n",
-       "      <th>Sunshine Hours/Year</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <td>Rank</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>-0.963597</td>\n",
-       "      <td>0.346623</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>DALY rate</td>\n",
-       "      <td>-0.963597</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>-0.285906</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>Sunshine Hours/Year</td>\n",
-       "      <td>0.346623</td>\n",
-       "      <td>-0.285906</td>\n",
-       "      <td>1.000000</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                         Rank  DALY rate  Sunshine Hours/Year\n",
-       "Rank                 1.000000  -0.963597             0.346623\n",
-       "DALY rate           -0.963597   1.000000            -0.285906\n",
-       "Sunshine Hours/Year  0.346623  -0.285906             1.000000"
-      ]
-     },
-     "execution_count": 36,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
-    "df.corr()"
+    "correlation = df_joined.corr().iloc[0,1]\n",
+    "sns.scatterplot(\n",
+    "    data=df_joined,\n",
+    "    x='DALY rate',\n",
+    "    y='Sunshine Hours/Month'\n",
+    ").set_title(f'Pearson correlation : {correlation: 5.2f}');"
   ]
  },
  {

 %% Cell type:markdown id: tags:

 # Exercise 1 - Parsing HTML
-The following notebook is greatly inspired by https://github.com/khuyentran1401/Web-Scrapping-Wikipedia
+The following notebook is inspired by https://github.com/khuyentran1401/Web-Scrapping-Wikipedia
+
+%% Cell type:markdown id: tags:
+
+### Packages, Paths and Functions

 %% Cell type:code id: tags:

 ``` python
-import requests
-import urllib.request
 import time
-from bs4 import BeautifulSoup
+import re
+from pathlib import Path
+
 import numpy as np
 import pandas as pd
-from urllib.request import urlopen
+from bs4 import BeautifulSoup
+import seaborn as sns

-DATA_REPO =
-```
+DATA_PATH = Path('../data/')

-%% Cell type:code id: tags:
+# Epidemiology webpage : https://en.wikipedia.org/wiki/Epidemiology_of_depression
+DEPRESSION_FILENAME = 'a1_epidemiology_of_depression.html' # Stored locally

-``` python
-url = 'https://en.wikipedia.org/wiki/Epidemiology_of_depression'
-```
+# Sunshine duration webpage : https://en.wikipedia.org/wiki/List_of_cities_by_sunshine_duration
+SUNSHINE_FILENAME = 'a1_city_sunshine_duration.html' # Stored locally

-%% Cell type:code id: tags:
+def read_html_file(path: Path, filename: str) -> str:
+    """Read an HTML stored locally"""
+    with open(path / filename, "r") as file:
+        return file.read()

-``` python
-html = urlopen(url)
-```
+def process_num(string_number : str) -> float:
+    """Convert a string number formatted with a comma to separate thousands

-%% Cell type:code id: tags:
-
-``` python
-soup = BeautifulSoup(html, 'html.parser')
+    Example : 1,823.0 -> 1823.0"""
+    return float(re.sub(r'[^\w\s.]','', string_number))
 ```

-%% Cell type:code id: tags:
+%% Cell type:markdown id: tags:

-``` python
-tables = soup.find_all('table')
-```
+## 1 - Create depression table

 %% Cell type:code id: tags:

 ``` python
-#convert number as string to integer
-#re.sub() returns the substring that match the regrex
-import re
-def process_num(num):
-    return float(re.sub(r'[^\w\s.]','',num))
+depression_html = read_html_file(DATA_PATH, DEPRESSION_FILENAME)
+depression_soup = BeautifulSoup(depression_html, 'html.parser')
 ```

 %% Cell type:code id: tags:

 ``` python
-num1 = re.sub(r'[^\w\s.]','','1,156.30')
-num1
-```
+depression_rates = []  # preparing list to contain the different depression rates
+depression_countries = [] # preparing list to contain the different country names

-%% Output
+COUNTRY_POSITION_IN_DEP_TABLE = 0
+RATE_POSITION_IN_DEP_TABLE = 2

-    '1156.30'
+def extract_depression_rates(depression_soup: BeautifulSoup) -> pd.DataFrame:
+    """Extract depression rates from soup build from Wikipedia depression table
+    """

-%% Cell type:code id: tags:
+    # Extract the table from the soup
+    tables = depression_soup.find_all('table')
+    depression_table = tables[0]  # ignore the glossary at the end

-``` python
-ranks = []
-rates = []
-countries = []
-links = []
+    # Loop over rows
+    ## @COMPLETE : extract all the rows
+    # table_rows = ...
+    for table_row in table_rows:
+        ## @COMPLETE : extract all the cells
+        # table_cells = ...

-for table in tables:
-    rows = table.find_all('tr')
+        if len(table_cells) > 1:
+            country = table_cells[COUNTRY_POSITION_IN_DEP_TABLE]
+            depression_countries.append(country.text.strip())

-    for row in rows:
-        cells = row.find_all('td')
+            rate = table_cells[RATE_POSITION_IN_DEP_TABLE]
+            depression_rates.append(round(float(rate.text.strip())))
+    return pd.DataFrame(depression_rates, index= depression_countries, columns = ['DALY rate'])

-        if len(cells) > 1:
-            rank = cells[0]
-            ranks.append(int(rank.text))
-
-            country = cells[1]
-            countries.append(country.text.strip())
+df_depression = extract_depression_rates(depression_soup)
+print(f'Extracted depression data for {df_depression.shape[0]} countries')
+display(df_depression.head())
+```

-            rate = cells[2]
-            rates.append(process_num(rate.text.strip()))
+%% Cell type:markdown id: tags:

-            link = cells[1].find('a').get('href')
-            links.append('https://en.wikipedia.org/'+ link)
+## 2 - Create sunshine table

-df1 = pd.DataFrame(ranks, index= countries, columns = ['Rank'])
-df1['DALY rate'] = rates
+%% Cell type:code id: tags:

-df1.head(10)
+``` python
+sunshine_html = read_html_file(DATA_PATH, SUNSHINE_FILENAME)
+sunshine_soup = BeautifulSoup(sunshine_html, 'html.parser')
 ```

-%% Output
-
-                   Rank  DALY rate
-    United States     1    1454.74
-    Nepal             2    1424.48
-    East Timor        3    1404.10
-    Bangladesh        4    1401.53
-    India             5    1400.84
-    Pakistan          6    1400.42
-    Brazil            7    1396.10
-    Maldives          8    1391.61
-    Bhutan            9    1385.53
-    Afghanistan      10    1385.14
-
 %% Cell type:code id: tags:

 ``` python
-sun_url = urlopen('https://en.wikipedia.org/wiki/List_of_cities_by_sunshine_duration')
-sun = BeautifulSoup(sun_url, 'html.parser')
-tables = sun.find_all('table')
-
 #Dictionary to hold the name of the country and its corresponding temperature
-country_suns = {}
+country_sunshine = {}

-#Dictionary to hold the country and its frequency in the table
-count = {}
-for table in tables:
-    if len(table) >1:
-        rows = table.find_all('tr')
-
-        #Skip the first row, which is the name of the columns
-        for row in rows[1:]:
-            cells = row.find_all('td')
-            country = cells[0].text.strip()
-
-            #If country in the list of country we found previously
-            #append the country to the dictionary
-            if country in countries:
-
-                sun = cells[-2].text.strip()
-                sun = process_num(sun)/10
-
-                #If country is already in the dictionary
-                #add to the existing sun hours of that country and the count to keep track of how many times we add
-                if country in country_suns:
-                    count[country] += 1
-                    country_suns[country] += sun
+COUNTRY_POSITION_IN_SUN_TABLE = 0
+SUNSHINE_POSITION_IN_SUN_TABLE = -2

+def extract_monthly_sunshine_hours(sunshine_soup: BeautifulSoup) -> pd.DataFrame:
+    """Extract average monthly sunshine hours from soup build from Wikipedia sunshine table
+    """
+    sunshine_tables = sunshine_soup.find_all('table')
+
+    # Loop over tables
+    for table in sunshine_tables:
+        if len(table) >1:
+
+            # Loop over rows
+            ## @COMPLETE : extract all the rows
+            # table_rows = ...
+            for table_row in table_rows[1:]: # skip the first row (header)
+                ## @COMPLETE : extract all the cells
+                # table_cells = ...
+
+                # Extract country and sunshine hours
+                country = table_cells[COUNTRY_POSITION_IN_SUN_TABLE].text.strip()
+                yearly_sun_hours = table_cells[SUNSHINE_POSITION_IN_SUN_TABLE].text.strip()
+                yearly_sun_hours = process_num(yearly_sun_hours)
+                monthly_sun_hours = yearly_sun_hours/12
+
+                # Record hours for every city in the country
+                if country in country_sunshine:
+                    country_sunshine[country].append(monthly_sun_hours)
                else:
-                    count[country] = 1
-                    country_suns[country] = sun
-
-
-#Find the average temperature of each country
-for country in country_suns:
-    print(country_suns[country],count[country])
-    country_suns[country] = round(country_suns[country]/count[country],2)
-    print('Country: {}, Sunshine Hours: {}'.format(country,country_suns[country]))
+                    country_sunshine[country] = [monthly_sun_hours]

-```
-
-%% Output

-    789.14 3
-    Country: Benin, Sunshine Hours: 263.05
-    515.99 2
-    Country: Togo, Sunshine Hours: 258.0
-    710.25 3
-    Country: Ghana, Sunshine Hours: 236.75
-    866.0500000000001 4
-    Country: Cameroon, Sunshine Hours: 216.51
-    344.03999999999996 2
-    Country: Gabon, Sunshine Hours: 172.02
-    1334.54 5
-    Country: Nigeria, Sunshine Hours: 266.91
-    711.91 2
-    Country: Sudan, Sunshine Hours: 355.95
-    336.1 1
-    Country: Eritrea, Sunshine Hours: 336.1
-    641.8 2
-    Country: Burkina Faso, Sunshine Hours: 320.9
-    320.32 1
-    Country: Niger, Sunshine Hours: 320.32
-    670.6400000000001 2
-    Country: Chad, Sunshine Hours: 335.32
-    307.0 1
-    Country: Gambia, Sunshine Hours: 307.0
-    629.2 2
-    Country: Senegal, Sunshine Hours: 314.6
-    620.5999999999999 2
-    Country: Somalia, Sunshine Hours: 310.3
-    327.9 1
-    Country: Djibouti, Sunshine Hours: 327.9
-    964.0099999999999 3
-    Country: Mali, Sunshine Hours: 321.34
-    653.3 2
-    Country: Algeria, Sunshine Hours: 326.65
-    609.99 2
-    Country: Tunisia, Sunshine Hours: 305.0
-    946.64 3
-    Country: Morocco, Sunshine Hours: 315.55
-    2253.8500000000004 6
-    Country: Egypt, Sunshine Hours: 375.64
-    635.6199999999999 2
-    Country: Libya, Sunshine Hours: 317.81
-    1212.01 4
-    Country: Kenya, Sunshine Hours: 303.0
-    234.1 1
-    Country: Angola, Sunshine Hours: 234.1
-    1213.1399999999999 4
-    Country: Tanzania, Sunshine Hours: 303.28
-    556.97 2
-    Country: Ethiopia, Sunshine Hours: 278.49
-    666.5 2
-    Country: Mauritania, Sunshine Hours: 333.25
-    1884.79 6
-    Country: South Africa, Sunshine Hours: 314.13
-    1028.0 3
-    Country: Botswana, Sunshine Hours: 342.67
-    889.6400000000001 3
-    Country: Zambia, Sunshine Hours: 296.55
-    613.08 2
-    Country: Zimbabwe, Sunshine Hours: 306.54
-    838.76 3
-    Country: Malawi, Sunshine Hours: 279.59
-    1718.66 6
-    Country: Madagascar, Sunshine Hours: 286.44
-    283.8 1
-    Country: Mozambique, Sunshine Hours: 283.8
-    681.8 3
-    Country: Uganda, Sunshine Hours: 227.27
-    237.34 1
-    Country: Burundi, Sunshine Hours: 237.34
-    488.0 2
-    Country: Guinea, Sunshine Hours: 244.0
-    270.7 1
-    Country: Guinea-Bissau, Sunshine Hours: 270.7
-    309.79 2
-    Country: Equatorial Guinea, Sunshine Hours: 154.9
-    747.5 2
-    Country: Namibia, Sunshine Hours: 373.75
-    317.51 1
-    Country: Afghanistan, Sunshine Hours: 317.51
-    220.74 1
-    Country: Azerbaijan, Sunshine Hours: 220.74
-    206.6 1
-    Country: Bangladesh, Sunshine Hours: 206.6
-    1091.49 5
-    Country: China, Sunshine Hours: 218.3
-    973.66 4
-    Country: India, Sunshine Hours: 243.41
-    298.33000000000004 1
-    Country: Indonesia, Sunshine Hours: 298.33
-    282.61 1
-    Country: Iran, Sunshine Hours: 282.61
-    324.08000000000004 1
-    Country: Iraq, Sunshine Hours: 324.08
-    331.1 1
-    Country: Israel, Sunshine Hours: 331.1
-    361.71000000000004 2
-    Country: Japan, Sunshine Hours: 180.86
-    486.29999999999995 2
-    Country: Kazakhstan, Sunshine Hours: 243.15
-    279.15 1
-    Country: Mongolia, Sunshine Hours: 279.15
-    249.2 1
-    Country: North Korea, Sunshine Hours: 249.2
-    349.33000000000004 1
-    Country: Oman, Sunshine Hours: 349.33
-    598.4300000000001 2
-    Country: Pakistan, Sunshine Hours: 299.22
-    210.31 1
-    Country: Philippines, Sunshine Hours: 210.31
-    1578.2299999999998 8
-    Country: Russia, Sunshine Hours: 197.28
-    647.3 2
-    Country: Saudi Arabia, Sunshine Hours: 323.65
-    202.24 1
-    Country: Singapore, Sunshine Hours: 202.24
-    439.33000000000004 2
-    Country: South Korea, Sunshine Hours: 219.67
-    870.0099999999999 4
-    Country: Thailand, Sunshine Hours: 217.5
-    466.76 2
-    Country: Turkey, Sunshine Hours: 233.38
-    282.39 1
-    Country: Uzbekistan, Sunshine Hours: 282.39
-    849.4 4
-    Country: Vietnam, Sunshine Hours: 212.35
-    254.4 1
-    Country: Albania, Sunshine Hours: 254.4
-    247.4 1
-    Country: Armenia, Sunshine Hours: 247.4
-    188.4 1
-    Country: Austria, Sunshine Hours: 188.4
-    180.7 1
-    Country: Belarus, Sunshine Hours: 180.7
-    154.6 1
-    Country: Belgium, Sunshine Hours: 154.6
-    176.9 1
-    Country: Bosnia and Herzegovina, Sunshine Hours: 176.9
-    217.7 1
-    Country: Bulgaria, Sunshine Hours: 217.7
-    191.3 1
-    Country: Croatia, Sunshine Hours: 191.3
-    166.8 1
-    Country: Czech Republic, Sunshine Hours: 166.8
-    331.40999999999997 1
-    Country: Cyprus, Sunshine Hours: 331.41
-    173.9 1
-    Country: Denmark, Sunshine Hours: 173.9
-    182.6 1
-    Country: Estonia, Sunshine Hours: 182.6
-    185.8 1
-    Country: Finland, Sunshine Hours: 185.8
-    449.8 2
-    Country: France, Sunshine Hours: 224.9
-    204.6 1
-    Country: Georgia, Sunshine Hours: 204.6
-    328.79999999999995 2
-    Country: Germany, Sunshine Hours: 164.4
-    595.0 2
-    Country: Greece, Sunshine Hours: 297.5
-    198.8 1
-    Country: Hungary, Sunshine Hours: 198.8
-    132.6 1
-    Country: Iceland, Sunshine Hours: 132.6
-    145.3 1
-    Country: Ireland, Sunshine Hours: 145.3
-    438.8 2
-    Country: Italy, Sunshine Hours: 219.4
-    175.4 1
-    Country: Latvia, Sunshine Hours: 175.4
-    169.1 1
-    Country: Lithuania, Sunshine Hours: 169.1
-    305.4 1
-    Country: Malta, Sunshine Hours: 305.4
-    212.6 1
-    Country: Moldova, Sunshine Hours: 212.6
-    166.2 1
-    Country: Netherlands, Sunshine Hours: 166.2
-    166.8 1
-    Country: Norway, Sunshine Hours: 166.8
-    157.1 1
-    Country: Poland, Sunshine Hours: 157.1
-    280.6 1
-    Country: Portugal, Sunshine Hours: 280.6
-    211.5 1
-    Country: Romania, Sunshine Hours: 211.5
-    203.8 1
-    Country: Slovakia, Sunshine Hours: 203.8
-    197.4 1
-    Country: Slovenia, Sunshine Hours: 197.4
-    826.6 3
-    Country: Spain, Sunshine Hours: 275.53
-    374.29999999999995 2
-    Country: Sweden, Sunshine Hours: 187.15
-    156.6 1
-    Country: Switzerland, Sunshine Hours: 156.6
-    195.5 1
-    Country: Ukraine, Sunshine Hours: 195.5
-    306.0 2
-    Country: United Kingdom, Sunshine Hours: 153.0
-    1825.24 9
-    Country: Canada, Sunshine Hours: 202.8
-    225.98000000000002 1
-    Country: Honduras, Sunshine Hours: 225.98
-    1038.5 4
-    Country: Mexico, Sunshine Hours: 259.62
-    275.99 1
-    Country: Nicaragua, Sunshine Hours: 275.99
-    174.35 1
-    Country: Panama, Sunshine Hours: 174.35
-    295.7 1
-    Country: El Salvador, Sunshine Hours: 295.7
-    15218.579999999998 54
-    Country: United States, Sunshine Hours: 281.83
-    1149.52 5
-    Country: Argentina, Sunshine Hours: 229.9
-    228.89000000000001 1
-    Country: Bolivia, Sunshine Hours: 228.89
-    1322.58 6
-    Country: Brazil, Sunshine Hours: 220.43
-    953.81 6
-    Country: Colombia, Sunshine Hours: 158.97
-    1324.27 5
-    Country: Chile, Sunshine Hours: 264.85
-    381.90999999999997 2
-    Country: Ecuador, Sunshine Hours: 190.95
-    280.3 1
-    Country: Paraguay, Sunshine Hours: 280.3
-    604.0 3
-    Country: Peru, Sunshine Hours: 201.33
-    248.14000000000001 1
-    Country: Uruguay, Sunshine Hours: 248.14
-    579.0899999999999 2
-    Country: Venezuela, Sunshine Hours: 289.54
-    2553.15 9
-    Country: Australia, Sunshine Hours: 283.68
-    192.2 1
-    Country: Fiji, Sunshine Hours: 192.2
-    613.1999999999999 3
-    Country: New Zealand, Sunshine Hours: 204.4
-    246.3 1
-    Country: Papua New Guinea, Sunshine Hours: 246.3
-    233.0 1
-    Country: Solomon Islands, Sunshine Hours: 233.0
+    # Finally, take the average monthly sunshine hours over each country
+    for country in country_sunshine:
+        country_sunshine[country] = round(np.average(country_sunshine[country]))

-%% Cell type:code id: tags:
+    return pd.DataFrame.from_dict(country_sunshine, orient='index', columns = ['Sunshine Hours/Month'])

-``` python
-df2 = pd.DataFrame.from_dict(country_suns,orient='index', columns = ['Sunshine Hours/Year'])
-
-df = df1.join(df2)
-
-df.info()
+df_sunshine = extract_monthly_sunshine_hours(sunshine_soup)
+print(f'Extracted sunshine data for {df_sunshine.shape[0]} countries')
+display(df_sunshine.head())
 ```

-%% Output
-
-    <class 'pandas.core.frame.DataFrame'>
-    Index: 192 entries, United States to Japan
-    Data columns (total 3 columns):
-    Rank                   192 non-null int64
-    DALY rate              192 non-null float64
-    Sunshine Hours/Year    122 non-null float64
-    dtypes: float64(2), int64(1)
-    memory usage: 11.0+ KB
+%% Cell type:markdown id: tags:

-%% Cell type:code id: tags:
-
-``` python
-df.dropna(inplace=True)
-```
+## 3 - Compare depression to sunshine

 %% Cell type:code id: tags:

 ``` python
-df.info()
-
-import matplotlib.pyplot as plt
-import seaborn as sns
-
-sns.scatterplot('Rank', 'Sunshine Hours/Year', data=df)
+df_joined = df_depression.join(df_sunshine)
+df_joined = df_joined[~df_joined.isnull().any(axis=1)]
+print(f'Having both depression and sunshine information for {df_joined.shape[0]} countries')
+display(df_joined.head())
 ```

-%% Output
-
-    <class 'pandas.core.frame.DataFrame'>
-    Index: 122 entries, United States to Japan
-    Data columns (total 3 columns):
-    Rank                   122 non-null int64
-    DALY rate              122 non-null float64
-    Sunshine Hours/Year    122 non-null float64
-    dtypes: float64(2), int64(1)
-    memory usage: 8.8+ KB
-
-    <matplotlib.axes._subplots.AxesSubplot at 0x1a1a728410>
-
-
-
 %% Cell type:code id: tags:

 ``` python
-df.corr()
+correlation = df_joined.corr().iloc[0,1]
+sns.scatterplot(
+    data=df_joined,
+    x='DALY rate',
+    y='Sunshine Hours/Month'
+).set_title(f'Pearson correlation : {correlation: 5.2f}');
 ```

-%% Output
-
-                             Rank  DALY rate  Sunshine Hours/Year
-    Rank                 1.000000  -0.963597             0.346623
-    DALY rate           -0.963597   1.000000            -0.285906
-    Sunshine Hours/Year  0.346623  -0.285906             1.000000
-
 %% Cell type:code id: tags:

 ``` python
 ```