diff --git a/a-data-collection/docextraction.ipynb b/a-data-collection/docextraction.ipynb index f94bd643b68678b00936d3c4eeb018d9b81a4087..8d19a88292bb8f28cb765d2d0b412570b0b7c349 100644 --- a/a-data-collection/docextraction.ipynb +++ b/a-data-collection/docextraction.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ @@ -11,12 +11,13 @@ "import numpy\n", "import os\n", "import bs4\n", - "import httplib2" + "import httplib2\n", + "import requests" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -59,7 +60,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -107,7 +108,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -180,68 +181,51 @@ " self.movies[doc_id] = movie\n", "\n", " # Download the HTML and parse it through Beautifulsoup\n", - " h = httplib2.Http(\"./docs/.cache\")\n", + " h = httplib2.Http(\"./docs/.cache\") \n", " resp, content = h.request(url, \"GET\")\n", " soup = bs4.BeautifulSoup(content, \"html.parser\")\n", - "\n", - " # Extract the content\n", + " \n", + " # Extract information\n", + " self.extract_summary(movie, soup)\n", + " self.extract_actors(movie, soup)\n", + " \n", + " # -------------------------------------------------------------------------\n", + " def extract_summary(self, movie, soup) -> None:\n", + " \"\"\"\n", + " This function extracts the summary of a movie/tv-show\n", + " It uses the find_all method of BeautifulSoup to find the div with the \"overview\" class\n", + " \"\"\"\n", " divs = soup.find_all(\"div\")\n", " for div in divs:\n", " div_class = div.get(\"class\")\n", - " if str(div_class)[:15] == \"['GenresAndPlot\":\n", - " spans=div.find_all(\"span\")\n", - " for span in spans:\n", - " span_data=span.get(\"data-testid\")\n", 
- " if span_data==\"plot-xs_to_m\":\n", - " try:\n", - " movie.summary = span.string.strip()\n", - " except:\n", - " movie.summary = span.contents[0]\n", - " \n", - " print(movie.summary)\n", - " elif div_class == ['ipc-shoveler', 'title-cast__grid']:\n", - " self.extract_actors(movie, div)\n", - "\n", + " if div_class is not None:\n", + " if 'overview' in div_class:\n", + " movie.summary = div.text\n", + " print(movie.summary)\n", + " \n", + " \n", " # -------------------------------------------------------------------------\n", - " def extract_actors(self, movie, div) -> None:\n", + " def extract_actors(self, movie, soup) -> None:\n", " \"\"\"\n", - " This function takes the content of a \"div\" tag to determined if it contains actors. In practice, the following\n", - " steps are executed:\n", - "\n", - " 1. Look if there is a \"h4\" tag that contains \"Stars:\".\n", - "\n", - " 2. Extract all the links that begins with \"/name\". These are links to actor pages and the name of\n", - " the actor is extracted.\n", - "\n", - " 3. 
Each actor extracted to the global list of actors and the list of actors for the analyzed movie_to_analyse.\n", - "\n", - " :param movie: Analyzed movie_to_analyse.\n", - " :param div: A \"div\" tag that could contain the actors.\n", + " This function extracts the list of actors displayed for a specific movie/tv-show\n", + " It uses the select method of BeautifulSoup to extract actors displayed on the page.\n", + " Actors are defined in people scroller cards\n", " \"\"\"\n", "\n", - " # Look this are the actors\n", - " divs= div.find_all(\"div\")\n", - " if (divs is None) :\n", - " return\n", + " soup_results = soup.select(\"ol[class='people scroller'] li[class='card'] p a\")\n", + " actors = [soup_result.text for soup_result in soup_results]\n", + " print(actors)\n", + "\n", + " # Store actors in class dictionaries\n", + " for actor in actors:\n", + " if actor not in self.actors_by_name.keys():\n", + " actor_id = len(self.actors) + 1 # First actor_id = 1\n", + " new_actor = Actor(actor_id, actor)\n", + " self.actors[actor] = new_actor\n", + " self.actors_by_name[actor] = new_actor\n", + " self.actors_by_name[actor].movies.append(movie)\n", + " movie.actors.append(self.actors_by_name[actor])\n", "\n", - " # Extract all the text of the links beginning with \"name\"\n", - " for div in divs:\n", - " div_class = div.get(\"data-testid\")\n", - " if div_class==\"title-cast-item\":\n", - " for link in div.find_all(\"a\"):\n", - " href = link[\"href\"]\n", - " if href[:5] != \"/name\" or link[\"class\"] == [\"ipc-lockup-overlay\", \"ipc-focusable\"]:\n", - " continue\n", - " actor = link.string.strip()\n", - "\n", - " # Add the fact that the current actor is in the current movie_to_analyse\n", - " if actor not in self.actors_by_name.keys():\n", - " actor_id = len(self.actors) + 1 # First actor_id = 1\n", - " new_actor = Actor(actor_id, actor)\n", - " self.actors[actor] = new_actor\n", - " self.actors_by_name[actor] = new_actor\n", - " 
self.actors_by_name[actor].movies.append(movie)\n", - " movie.actors.append(self.actors_by_name[actor])\n", "\n", " # -------------------------------------------------------------------------\n", " def write_files(self) -> None:\n", @@ -253,7 +237,8 @@ "\n", " 2. Create a CSV file \"actors.txt\" with all the actors and their identifiers.\n", "\n", - " 3. Build a matrix actors/actors which elements represent the number of times two actors are playing in the same\n", + " 3. Build a matrix actors/actors which elements represent the number of times \n", + " two actors are playing in the same\n", " movie_to_analyse.\n", "\n", " 4. Create a CSV file \"links.txt\" that contains all the pairs of actors having played together.\n", @@ -261,7 +246,7 @@ "\n", " # Write the clean text\n", " for movie in self.movies.values():\n", - " movie_file = codecs.open(self.output + 'doc' + str(movie.movie_id) + \".txt\", 'w', \"utf-8\")\n", + " movie_file = codecs.open(self.output + 'doc_' + str(movie.movie_id) + \".txt\", 'w', \"utf-8\")\n", " movie_file.write(movie.summary + \"\\n\")\n", " for actor in movie.actors:\n", " movie_file.write(actor.name + \"\\n\")\n", @@ -292,33 +277,45 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "A U.S. Army officer serving in Vietnam is tasked with assassinating a renegade Special Forces Colonel who sees himself as a god.\n", - "The Godfather follows Vito Corleone Don of the Corleone family as he passes the mantel to his son Michael\n", - "A C.I.A. Agent tries to infiltrate Soviet intelligence to stop a murderous diabolical plot.\n", - "Years after a friend and fellow 00 agent is killed on a joint mission, a secret space based weapons program known as \"GoldenEye\" is stolen. James Bond sets out to stop a Russian crime syndic...\n" + "\n", + "Harry Potter has lived under the stairs at his aunt and uncle's house his whole life. 
But on his 11th birthday, he learns he's a powerful wizard—with a place waiting for him at the Hogwarts School of Witchcraft and Wizardry. As he learns to harness his newfound powers with the help of the school's kindly headmaster, Harry uncovers the truth about his parents' deaths—and about the villain who's to blame.\n", + "\n", + "['Daniel Radcliffe', 'Rupert Grint', 'Emma Watson', 'Richard Harris', 'Tom Felton', 'Alan Rickman', 'Robbie Coltrane', 'Maggie Smith', 'Richard Griffiths']\n", + "\n", + "101-year-old Rose DeWitt Bukater tells the story of her life aboard the Titanic, 84 years later. A young Rose boards the ship with her mother and fiancé. Meanwhile, Jack Dawson and Fabrizio De Rossi win third-class tickets aboard the ship. Rose tells the whole story from Titanic's departure through to its death—on its first and last voyage—on April 15, 1912.\n", + "\n", + "['Leonardo DiCaprio', 'Kate Winslet', 'Billy Zane', 'Gloria Stuart', 'Kathy Bates', 'Frances Fisher', 'Bill Paxton', 'Bernard Hill', 'David Warner']\n", + "\n", + "A New York stockbroker refuses to cooperate in a large securities fraud case involving corruption on Wall Street, corporate banking world and mob infiltration. 
Based on Jordan Belfort's autobiography.\n", + "\n", + "['Leonardo DiCaprio', 'Jonah Hill', 'Margot Robbie', 'Matthew McConaughey', 'Kyle Chandler', 'Rob Reiner', 'Jon Bernthal', 'Jean Dujardin', 'Kenneth Choi']\n" ] } ], "source": [ - "# ----------------------------------------------------------------------------------------------------------------------\n", + "# ----------------------------------------------------------------------------------------\n", "# Initialize a list of movies to download\n", - "movies = [\"0078788\", \"0068646\", \"0083891\",\"0113189\"]\n", - "basic_url_to_analyze = \"https://www.imdb.com/title/tt\"\n", + "movies = [\n", + " (\"Harry Potter 1\",\"671\"),\n", + " (\"Titanic\",\"597\"),\n", + " (\"The Wolf of Wall Street\", \"106646\"),\n", + "]\n", + "basic_url_to_analyze = 'https://www.themoviedb.org/movie/'\n", "dir_docs = \"./docs\"\n", "\n", "\n", - "# ----------------------------------------------------------------------------------------------------------------------\n", + "# -----------------------------------------------------------------------------------------\n", "# Use our custom parser to download each HTML page and save the actors and the links\n", "parser = Parser(dir_docs, basic_url_to_analyze)\n", - "for movie_to_analyse in movies:\n", - " parser.extract_data(movie_to_analyse)\n", + "for movie_label, movie_id in movies:\n", + " parser.extract_data(movie_id)\n", "parser.write_files()" ] }, @@ -332,9 +329,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Web Mining", "language": "python", - "name": "python3" + "name": "web-mining" }, "language_info": { "codemirror_mode": { @@ -346,7 +343,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.6" + "version": "3.9.9" } }, "nbformat": 4,