exercise2.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "from typing import List, Dict\n",
    "import codecs\n",
    "import numpy\n",
    "import os\n",
    "import bs4\n",
    "import requests"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Actor:\n",
    "    \"\"\"\n",
    "    This class represents an actor.\n",
    "\n",
    "    |\n",
    "\n",
    "    The instance attributes are:\n",
    "\n",
    "    actor_id:\n",
    "        Identifier of the actor.\n",
    "\n",
    "    name:\n",
    "        Name of the actor.\n",
    "\n",
    "    movies:\n",
    "        List of movies in which the actor has played.\n",
    "    \"\"\"\n",
    "\n",
    "    # -------------------------------------------------------------------------\n",
    "    actor_id: int\n",
    "    name: str\n",
    "    movies: List[\"Movie\"]\n",
    "\n",
    "    # -------------------------------------------------------------------------\n",
    "    def __init__(self, actor_id: int, name: str):\n",
    "        \"\"\"\n",
    "        Constructor.\n",
    "\n",
    "        :param actor_id: Identifier of the actor.\n",
    "        :param name: Name of the actor.\n",
    "        \"\"\"\n",
    "\n",
    "        self.actor_id = actor_id\n",
    "        self.name = name\n",
    "        self.movies = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Movie:\n",
    "    \"\"\"\n",
    "    This class represents a movie_to_analyse.\n",
    "\n",
    "    |\n",
    "\n",
    "    The instance attributes are:\n",
    "\n",
    "    movie_id:\n",
    "        Identifier of the movie_to_analyse.\n",
    "\n",
    "    name:\n",
    "        Name of the movie_to_analyse in the IMDb database.\n",
    "\n",
    "    actors:\n",
    "        List of actors who have played in the movie_to_analyse.\n",
    "\n",
    "    summary:\n",
    "        Summary of the movie_to_analyse.\n",
    "    \"\"\"\n",
    "\n",
    "    # -------------------------------------------------------------------------\n",
    "    movie_id: int\n",
    "    name: str\n",
    "    actors: List[Actor]\n",
    "    summary: str\n",
    "\n",
    "    # -------------------------------------------------------------------------\n",
    "    def __init__(self, movie_id: int, name: str):\n",
    "        \"\"\"\n",
    "        Constructor.\n",
    "\n",
    "        :param movie_id: Identifier of the movie_to_analyse.\n",
    "        :param name: Name fo the movie_to_analyse.\n",
    "        \"\"\"\n",
    "\n",
    "        self.movie_id = movie_id\n",
    "        self.name = name\n",
    "        self.actors = []\n",
    "        self.summary = \"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Parser:\n",
    "    \"\"\"\n",
    "\n",
    "    |\n",
    "\n",
    "    The instance attributes are:\n",
    "\n",
    "    output:\n",
    "        Directory where to store the resulting data.\n",
    "\n",
    "    basic_url:\n",
    "        Begin of the URL used to retrieve the HTML page of a movie_to_analyse.\n",
    "\n",
    "    actors:\n",
    "        Dictionary of actors (the identifiers are the key).\n",
    "\n",
    "    actors:\n",
    "        Dictionary of actors (the names are the key).\n",
    "\n",
    "    movies:\n",
    "        Dictionary of movies (the identifiers are the key).\n",
    "    \"\"\"\n",
    "\n",
    "    # -------------------------------------------------------------------------\n",
    "    output: str\n",
    "    basic_url: str\n",
    "    actors: Dict[int, Actor]\n",
    "    actors_by_name: Dict[str, Actor]\n",
    "    movies: Dict[int, Movie]\n",
    "\n",
    "    # -------------------------------------------------------------------------\n",
    "    def __init__(self, output: str, basic_url: str) -> None:\n",
    "        \"\"\"\n",
    "        Initialize the parser.\n",
    "\n",
    "        :param output: Directory where to store the results.\n",
    "        :param basic_url: Beginning part of the URL of a movie_to_analyse page.\n",
    "        \"\"\"\n",
    "\n",
    "        self.output = output + os.sep\n",
    "        if not os.path.isdir(self.output):\n",
    "            os.makedirs(self.output)\n",
    "        self.basic_url = basic_url\n",
    "        self.actors = dict()\n",
    "        self.actors_by_name = dict()\n",
    "        self.movies = dict()\n",
    "\n",
    "    # -------------------------------------------------------------------------\n",
    "    def extract_data(self, movie: str) -> None:\n",
    "        \"\"\"\n",
    "        Extract the \"useful\" data from the page. In practice, the following steps are executed:\n",
    "\n",
    "        1. Build the URL of the movie_to_analyse page.\n",
    "\n",
    "        2. Create a new Movie instance and add it to the list.\n",
    "\n",
    "        3. Download the HTML page and use an instance of BeautifulSoup to parse.\n",
    "\n",
    "        4. Extract all \"div\" tags and analyze those of the class \"summary_text\" (summary of the movie_to_analyse) and\n",
    "        \"credit_summary_item\" (directors, producers, actors, etc.).\n",
    "\n",
    "        :param movie: Analyzed movie_to_analyse.\n",
    "        \"\"\"\n",
    "\n",
    "        url = self.basic_url + movie\n",
    "\n",
    "        doc_id = len(self.movies) + 1  # First actor_id = 1\n",
    "        movie = Movie(doc_id, movie)\n",
    "        self.movies[doc_id] = movie\n",
    "        \n",
    "        # Download the HTML using the requests library, check the status-code and extract the text\n",
    "        ## @COMPLETE : use the requests library here, get the response and extract the content\n",
    "        # response = ...\n",
    "        # content = ...\n",
    "\n",
    "        # Download the HTML and parse it through Beautifulsoup\n",
    "        soup = bs4.BeautifulSoup(content, \"html.parser\")\n",
    "        \n",
    "        # Extract infos\n",
    "        self.extract_summary(movie, soup)\n",
    "        self.extract_actors(movie, soup)\n",
    "    \n",
    "    # -------------------------------------------------------------------------\n",
    "    def extract_summary(self, movie, soup) -> None:\n",
    "        \"\"\"\n",
    "        This function extract the summary from a movie/tv-show\n",
    "        It use the find_all method of BeautifulSoup to find the \"overview\" class\n",
    "        \"\"\"\n",
    "        divs = soup.find_all(\"div\")\n",
    "        for div in divs:\n",
    "            div_class = div.get(\"class\")\n",
    "            if div_class is not None:\n",
    "                if 'overview' in div_class:\n",
    "                    movie.summary = div.text\n",
    "                    print(movie.summary)\n",
    "        \n",
    "        \n",
    "    # -------------------------------------------------------------------------\n",
    "    def extract_actors(self, movie, soup) -> None:\n",
    "        \"\"\"\n",
    "        This function extract the list of actors displayed for a specific movie/tv-show\n",
    "        It use the select method of BeautifulSoup to extract actors displayed on the page.\n",
    "        Actor are defined in people scroller cards\n",
    "        \"\"\"\n",
    "\n",
    "        soup_results = soup.select(\"ol[class='people scroller'] li[class='card'] p a\")\n",
    "        actors = [soup_result.text for soup_result in soup_results]\n",
    "        print(actors)\n",
    "\n",
    "        # Store actors in class dictionaries\n",
    "        for actor in actors:\n",
    "            if actor not in self.actors_by_name.keys():\n",
    "                actor_id = len(self.actors) + 1  # First actor_id = 1\n",
    "                new_actor = Actor(actor_id, actor)\n",
    "                self.actors[actor] = new_actor\n",
    "                self.actors_by_name[actor] = new_actor\n",
    "            self.actors_by_name[actor].movies.append(movie)\n",
    "            movie.actors.append(self.actors_by_name[actor])\n",
    "\n",
    "\n",
    "    # -------------------------------------------------------------------------\n",
    "    def write_files(self) -> None:\n",
    "        \"\"\"\n",
    "        Write all the file. Three thinks are done:\n",
    "\n",
    "        1. For each document, create a file (doc*.txt) that contains the summary and the name of\n",
    "        the actors.\n",
    "\n",
    "        2. Create a CSV file \"actors.txt\" with all the actors and their identifiers.\n",
    "\n",
    "        3. Build a matrix actors/actors which elements represent the number of times \n",
    "        two actors are playing in the same\n",
    "        movie_to_analyse.\n",
    "\n",
    "        4. Create a CSV file \"links.txt\" that contains all the pairs of actors having played together.\n",
    "        \"\"\"\n",
    "\n",
    "        # Write the clean text\n",
    "        for movie in self.movies.values():\n",
    "            movie_file = codecs.open(self.output + 'doc_' + str(movie.movie_id) + \".txt\", 'w', \"utf-8\")\n",
    "            movie_file.write(movie.summary + \"\\n\")\n",
    "            for actor in movie.actors:\n",
    "                movie_file.write(actor.name + \"\\n\")\n",
    "\n",
    "        # Write the list of actors\n",
    "        actors_file = codecs.open(self.output + \"actors.txt\", 'w', \"utf-8\")\n",
    "        for actor in self.actors.values():\n",
    "            actors_file.write(str(actor.actor_id) + ',\"' + actor.name + '\"\\n')\n",
    "\n",
    "        # Build the matrix actors/actors\n",
    "        matrix = numpy.zeros(shape=(len(self.actors), len(self.actors)))\n",
    "        for movie in self.movies.values():\n",
    "            for i in range(0, len(movie.actors) - 1):\n",
    "                for j in range(i + 1, len(movie.actors)):\n",
    "                    # ! Matrix begins with 0, actors with 1\n",
    "                    matrix[movie.actors[i].actor_id - 1, movie.actors[j].actor_id - 1] += 1\n",
    "                    matrix[movie.actors[j].actor_id - 1, movie.actors[i].actor_id - 1] += 1\n",
    "\n",
    "        # Write only the positive links\n",
    "        links_file = codecs.open(self.output + \"links.txt\", 'w', \"utf-8\")\n",
    "        for i in range(0, len(self.actors) - 1):\n",
    "            for j in range(i + 1, len(self.actors)):\n",
    "                weight = matrix[i, j]\n",
    "                if weight > 0.0:\n",
    "                    # ! Matrix begins with 0, actors with 1\n",
    "                    links_file.write(str(i + 1) + \",\" + str(j + 1) + \",\" + str(weight) + \"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Harry Potter has lived under the stairs at his aunt and uncle's house his whole life. But on his 11th birthday, he learns he's a powerful wizard—with a place waiting for him at the Hogwarts School of Witchcraft and Wizardry. As he learns to harness his newfound powers with the help of the school's kindly headmaster, Harry uncovers the truth about his parents' deaths—and about the villain who's to blame.\n",
      "\n",
      "['Daniel Radcliffe', 'Rupert Grint', 'Emma Watson', 'Richard Harris', 'Tom Felton', 'Alan Rickman', 'Robbie Coltrane', 'Maggie Smith', 'Richard Griffiths']\n",
      "\n",
      "101-year-old Rose DeWitt Bukater tells the story of her life aboard the Titanic, 84 years later. A young Rose boards the ship with her mother and fiancé. Meanwhile, Jack Dawson and Fabrizio De Rossi win third-class tickets aboard the ship. Rose tells the whole story from Titanic's departure through to its death—on its first and last voyage—on April 15, 1912.\n",
      "\n",
      "['Leonardo DiCaprio', 'Kate Winslet', 'Billy Zane', 'Gloria Stuart', 'Kathy Bates', 'Frances Fisher', 'Bill Paxton', 'Bernard Hill', 'David Warner']\n",
      "\n",
      "A New York stockbroker refuses to cooperate in a large securities fraud case involving corruption on Wall Street, corporate banking world and mob infiltration. Based on Jordan Belfort's autobiography.\n",
      "\n",
      "['Leonardo DiCaprio', 'Jonah Hill', 'Margot Robbie', 'Matthew McConaughey', 'Kyle Chandler', 'Rob Reiner', 'Jon Bernthal', 'Jean Dujardin', 'Kenneth Choi']\n"
     ]
    }
   ],
   "source": [
    "# ----------------------------------------------------------------------------------------\n",
    "# Initialize a list of movies to download\n",
    "movies = [\n",
    "    (\"Harry Potter 1\",\"671\"),\n",
    "    (\"Titanic\",\"597\"),\n",
    "    (\"The Wolf of Wall Street\", \"106646\"),\n",
    "]\n",
    "basic_url_to_analyze = 'https://www.themoviedb.org/movie/'\n",
    "dir_docs = \"./docs\"\n",
    "\n",
    "\n",
    "# -----------------------------------------------------------------------------------------\n",
    "# Use our custom parser to download each HTML page and save the actors and the links\n",
    "parser = Parser(dir_docs, basic_url_to_analyze)\n",
    "for movie_label, movie_id in movies:\n",
    "    parser.extract_data(movie_id)\n",
    "parser.write_files()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Web Mining",
   "language": "python",
   "name": "web-mining"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}