diff --git a/a-data-collection/exercise2.ipynb b/a-data-collection/exercise2.ipynb index 4e3d76fcaff710702969ca20b1758a2a281a5bb7..a1caee89d6bbd525325c7e352c5351d6625c23c4 100644 --- a/a-data-collection/exercise2.ipynb +++ b/a-data-collection/exercise2.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 23, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -11,13 +11,12 @@ "import numpy\n", "import os\n", "import bs4\n", - "import httplib2\n", "import requests" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -60,7 +59,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -108,7 +107,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -152,6 +151,8 @@ " \"\"\"\n", "\n", " self.output = output + os.sep\n", + " if not os.path.isdir(self.output):\n", + " os.makedirs(self.output)\n", " self.basic_url = basic_url\n", " self.actors = dict()\n", " self.actors_by_name = dict()\n", @@ -179,10 +180,13 @@ " doc_id = len(self.movies) + 1 # First doc_id = 1\n", " movie = Movie(doc_id, movie)\n", " self.movies[doc_id] = movie\n", + " \n", + " # Download the HTML using the requests library, check the status-code and extract the text\n", + " ## @COMPLETE : use the requests library here, get the response and extract the content\n", + " # response = ...\n", + " # content = ...\n", "\n", " # Download the HTML and parse it through Beautifulsoup\n", - " h = httplib2.Http(\"./docs/.cache\") \n", - " resp, content = h.request(url, \"GET\")\n", " soup = bs4.BeautifulSoup(content, \"html.parser\")\n", " \n", " # Extract infos\n", @@ -193,10 +197,9 @@ " def extract_summary(self, movie, soup) -> None:\n", " \"\"\"\n", " This function extracts the summary from a movie/tv-show\n", - " Loop over the divs of BeautifulSoup to find the 
\"overview\" class\n", + " It uses the find_all method of BeautifulSoup to find the \"overview\" class\n", " \"\"\"\n", - " ## @COMPLETE : find all the divs\n", - " # divs = ...\n", + " divs = soup.find_all(\"div\")\n", " for div in divs:\n", " div_class = div.get(\"class\")\n", " if div_class is not None:\n", @@ -212,10 +215,8 @@ " It uses the select method of BeautifulSoup to extract actors displayed on the page.\n", " Actors are defined in people scroller cards\n", " \"\"\"\n", - " \n", - " ## @COMPLETE : find the selector string\n", - " # selector = \"\"\n", - " soup_results = soup.select(selector)\n", + "\n", + " soup_results = soup.select(\"ol[class='people scroller'] li[class='card'] p a\")\n", " actors = [soup_result.text for soup_result in soup_results]\n", " print(actors)\n", "\n", @@ -280,7 +281,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 10, "metadata": {}, "outputs": [ {