add scripts from last year

a7fe6da3 · Corentin Vande Kerckhove · fb574560 · a7fe6da3 · a7fe6da3 · a7fe6da3
--- a/data-collection/docextraction.ipynb
+++ b/data-collection/docextraction.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from typing import List, Dict\n",
+    "import codecs\n",
+    "import numpy\n",
+    "import os\n",
+    "import bs4\n",
+    "import httplib2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class Actor:\n",
+    "    \"\"\"\n",
+    "    This class represents an actor.\n",
+    "\n",
+    "    |\n",
+    "\n",
+    "    The instance attributes are:\n",
+    "\n",
+    "    actor_id:\n",
+    "        Identifier of the actor.\n",
+    "\n",
+    "    name:\n",
+    "        Name of the actor.\n",
+    "\n",
+    "    movies:\n",
+    "        List of movies in which the actor has played.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    # -------------------------------------------------------------------------\n",
+    "    actor_id: int\n",
+    "    name: str\n",
+    "    movies: List[\"Movie\"]\n",
+    "\n",
+    "    # -------------------------------------------------------------------------\n",
+    "    def __init__(self, actor_id: int, name: str):\n",
+    "        \"\"\"\n",
+    "        Constructor.\n",
+    "\n",
+    "        :param actor_id: Identifier of the actor.\n",
+    "        :param name: Name of the actor.\n",
+    "        \"\"\"\n",
+    "\n",
+    "        self.actor_id = actor_id\n",
+    "        self.name = name\n",
+    "        self.movies = []"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class Movie:\n",
+    "    \"\"\"\n",
+    "    This class represents a movie_to_analyse.\n",
+    "\n",
+    "    |\n",
+    "\n",
+    "    The instance attributes are:\n",
+    "\n",
+    "    movie_id:\n",
+    "        Identifier of the movie_to_analyse.\n",
+    "\n",
+    "    name:\n",
+    "        Name of the movie_to_analyse in the IMDb database.\n",
+    "\n",
+    "    actors:\n",
+    "        List of actors who have played in the movie_to_analyse.\n",
+    "\n",
+    "    summary:\n",
+    "        Summary of the movie_to_analyse.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    # -------------------------------------------------------------------------\n",
+    "    movie_id: int\n",
+    "    name: str\n",
+    "    actors: List[Actor]\n",
+    "    summary: str\n",
+    "\n",
+    "    # -------------------------------------------------------------------------\n",
+    "    def __init__(self, movie_id: int, name: str):\n",
+    "        \"\"\"\n",
+    "        Constructor.\n",
+    "\n",
+    "        :param movie_id: Identifier of the movie_to_analyse.\n",
+    "        :param name: Name of the movie_to_analyse.\n",
+    "        \"\"\"\n",
+    "\n",
+    "        self.movie_id = movie_id\n",
+    "        self.name = name\n",
+    "        self.actors = []\n",
+    "        self.summary = \"\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class Parser:\n",
+    "    \"\"\"\n",
+    "\n",
+    "    |\n",
+    "\n",
+    "    The instance attributes are:\n",
+    "\n",
+    "    output:\n",
+    "        Directory where to store the resulting data.\n",
+    "\n",
+    "    basic_url:\n",
+    "        Begin of the URL used to retrieve the HTML page of a movie_to_analyse.\n",
+    "\n",
+    "    actors:\n",
+    "        Dictionary of actors (the identifiers are the key).\n",
+    "\n",
+    "    actors_by_name:\n",
+    "        Dictionary of actors (the names are the key).\n",
+    "\n",
+    "    movies:\n",
+    "        Dictionary of movies (the identifiers are the key).\n",
+    "    \"\"\"\n",
+    "\n",
+    "    # -------------------------------------------------------------------------\n",
+    "    output: str\n",
+    "    basic_url: str\n",
+    "    actors: Dict[int, Actor]\n",
+    "    actors_by_name: Dict[str, Actor]\n",
+    "    movies: Dict[int, Movie]\n",
+    "\n",
+    "    # -------------------------------------------------------------------------\n",
+    "    def __init__(self, output: str, basic_url: str) -> None:\n",
+    "        \"\"\"\n",
+    "        Initialize the parser.\n",
+    "\n",
+    "        :param output: Directory where to store the results.\n",
+    "        :param basic_url: Beginning part of the URL of a movie_to_analyse page.\n",
+    "        \"\"\"\n",
+    "\n",
+    "        self.output = output + os.sep\n",
+    "        self.basic_url = basic_url\n",
+    "        self.actors = dict()\n",
+    "        self.actors_by_name = dict()\n",
+    "        self.movies = dict()\n",
+    "\n",
+    "    # -------------------------------------------------------------------------\n",
+    "    def extract_data(self, movie: str) -> None:\n",
+    "        \"\"\"\n",
+    "        Extract the \"useful\" data from the page. In practice, the following steps are executed:\n",
+    "\n",
+    "        1. Build the URL of the movie_to_analyse page.\n",
+    "\n",
+    "        2. Create a new Movie instance and add it to the list.\n",
+    "\n",
+    "        3. Download the HTML page and use an instance of BeautifulSoup to parse.\n",
+    "\n",
+    "        4. Extract all \"div\" tags and analyze those whose class starts with \"GenresAndPlot\" (summary of the\n",
+    "        movie_to_analyse) and those of the classes \"ipc-shoveler\" and \"title-cast__grid\" (cast of the movie_to_analyse).\n",
+    "\n",
+    "        :param movie: Analyzed movie_to_analyse.\n",
+    "        \"\"\"\n",
+    "\n",
+    "        url = self.basic_url + movie\n",
+    "\n",
+    "        doc_id = len(self.movies) + 1  # First movie_id = 1\n",
+    "        movie = Movie(doc_id, movie)\n",
+    "        self.movies[doc_id] = movie\n",
+    "\n",
+    "        # Download the HTML and parse it through Beautifulsoup\n",
+    "        h = httplib2.Http(\"./docs/.cache\")\n",
+    "        resp, content = h.request(url, \"GET\")\n",
+    "        soup = bs4.BeautifulSoup(content, \"html.parser\")\n",
+    "\n",
+    "        # Extract the content\n",
+    "        divs = soup.find_all(\"div\")\n",
+    "        for div in divs:\n",
+    "            div_class = div.get(\"class\")\n",
+    "            if str(div_class)[:15] == \"['GenresAndPlot\":\n",
+    "                spans=div.find_all(\"span\")\n",
+    "                for span in spans:\n",
+    "                    span_data=span.get(\"data-testid\")\n",
+    "                    if span_data==\"plot-xs_to_m\":\n",
+    "                        try:\n",
+    "                            movie.summary = span.string.strip()\n",
+    "                        except:\n",
+    "                            movie.summary = span.contents[0]\n",
+    "                            \n",
+    "                        print(movie.summary)\n",
+    "            elif div_class == ['ipc-shoveler', 'title-cast__grid']:\n",
+    "                self.extract_actors(movie, div)\n",
+    "\n",
+    "    # -------------------------------------------------------------------------\n",
+    "    def extract_actors(self, movie, div) -> None:\n",
+    "        \"\"\"\n",
+    "        This function takes the content of a \"div\" tag to determine whether it contains actors. In practice, the following\n",
+    "        steps are executed:\n",
+    "\n",
+    "        1. Look for the nested \"div\" tags whose \"data-testid\" attribute is \"title-cast-item\".\n",
+    "\n",
+    "        2. Extract all the links that begin with \"/name\". These are links to actor pages and the name of\n",
+    "        the actor is extracted.\n",
+    "\n",
+    "        3. Each extracted actor is added to the global list of actors and to the list of actors of the analyzed\n",
+    "        movie_to_analyse.\n",
+    "\n",
+    "        :param movie: Analyzed movie_to_analyse.\n",
+    "        :param div: A \"div\" tag that could contain the actors.\n",
+    "        \"\"\"\n",
+    "\n",
+    "        # Look whether these are the actors\n",
+    "        divs= div.find_all(\"div\")\n",
+    "        if (divs is None) :\n",
+    "            return\n",
+    "\n",
+    "        # Extract all the text of the links beginning with \"name\"\n",
+    "        for div in divs:\n",
+    "            div_class = div.get(\"data-testid\")\n",
+    "            if div_class==\"title-cast-item\":\n",
+    "                for link in div.find_all(\"a\"):\n",
+    "                    href = link[\"href\"]\n",
+    "                    if href[:5] != \"/name\" or link[\"class\"] == [\"ipc-lockup-overlay\", \"ipc-focusable\"]:\n",
+    "                        continue\n",
+    "                    actor = link.string.strip()\n",
+    "\n",
+    "            # Add the fact that the current actor is in the current movie_to_analyse\n",
+    "                    if actor not in self.actors_by_name.keys():\n",
+    "                        actor_id = len(self.actors) + 1  # First actor_id = 1\n",
+    "                        new_actor = Actor(actor_id, actor)\n",
+    "                        self.actors[actor] = new_actor\n",
+    "                        self.actors_by_name[actor] = new_actor\n",
+    "                    self.actors_by_name[actor].movies.append(movie)\n",
+    "                    movie.actors.append(self.actors_by_name[actor])\n",
+    "\n",
+    "    # -------------------------------------------------------------------------\n",
+    "    def write_files(self) -> None:\n",
+    "        \"\"\"\n",
+    "        Write all the files. Four things are done:\n",
+    "\n",
+    "        1. For each document, create a file (doc*.txt) that contains the summary and the name of\n",
+    "        the actors.\n",
+    "\n",
+    "        2. Create a CSV file \"actors.txt\" with all the actors and their identifiers.\n",
+    "\n",
+    "        3. Build a matrix actors/actors which elements represent the number of times two actors are playing in the same\n",
+    "        movie_to_analyse.\n",
+    "\n",
+    "        4. Create a CSV file \"links.txt\" that contains all the pairs of actors having played together.\n",
+    "        \"\"\"\n",
+    "\n",
+    "        # Write the clean text\n",
+    "        for movie in self.movies.values():\n",
+    "            movie_file = codecs.open(self.output + 'doc' + str(movie.movie_id) + \".txt\", 'w', \"utf-8\")\n",
+    "            movie_file.write(movie.summary + \"\\n\")\n",
+    "            for actor in movie.actors:\n",
+    "                movie_file.write(actor.name + \"\\n\")\n",
+    "\n",
+    "        # Write the list of actors\n",
+    "        actors_file = codecs.open(self.output + \"actors.txt\", 'w', \"utf-8\")\n",
+    "        for actor in self.actors.values():\n",
+    "            actors_file.write(str(actor.actor_id) + ',\"' + actor.name + '\"\\n')\n",
+    "\n",
+    "        # Build the matrix actors/actors\n",
+    "        matrix = numpy.zeros(shape=(len(self.actors), len(self.actors)))\n",
+    "        for movie in self.movies.values():\n",
+    "            for i in range(0, len(movie.actors) - 1):\n",
+    "                for j in range(i + 1, len(movie.actors)):\n",
+    "                    # ! Matrix begins with 0, actors with 1\n",
+    "                    matrix[movie.actors[i].actor_id - 1, movie.actors[j].actor_id - 1] += 1\n",
+    "                    matrix[movie.actors[j].actor_id - 1, movie.actors[i].actor_id - 1] += 1\n",
+    "\n",
+    "        # Write only the positive links\n",
+    "        links_file = codecs.open(self.output + \"links.txt\", 'w', \"utf-8\")\n",
+    "        for i in range(0, len(self.actors) - 1):\n",
+    "            for j in range(i + 1, len(self.actors)):\n",
+    "                weight = matrix[i, j]\n",
+    "                if weight > 0.0:\n",
+    "                    # ! Matrix begins with 0, actors with 1\n",
+    "                    links_file.write(str(i + 1) + \",\" + str(j + 1) + \",\" + str(weight) + \"\\n\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "A U.S. Army officer serving in Vietnam is tasked with assassinating a renegade Special Forces Colonel who sees himself as a god.\n",
+      "The Godfather follows Vito Corleone Don of the Corleone family as he passes the mantel to his son Michael\n",
+      "A C.I.A. Agent tries to infiltrate Soviet intelligence to stop a murderous diabolical plot.\n",
+      "Years after a friend and fellow 00 agent is killed on a joint mission, a secret space based weapons program known as \"GoldenEye\" is stolen. James Bond sets out to stop a Russian crime syndic...\n"
+     ]
+    }
+   ],
+   "source": [
+    "# ----------------------------------------------------------------------------------------------------------------------\n",
+    "# Initialize a list of movies to download\n",
+    "movies = [\"0078788\", \"0068646\", \"0083891\",\"0113189\"]\n",
+    "basic_url_to_analyze = \"https://www.imdb.com/title/tt\"\n",
+    "dir_docs = \"./docs\"\n",
+    "\n",
+    "\n",
+    "# ----------------------------------------------------------------------------------------------------------------------\n",
+    "# Use our custom parser to download each HTML page and save the actors and the links\n",
+    "parser = Parser(dir_docs, basic_url_to_analyze)\n",
+    "for movie_to_analyse in movies:\n",
+    "    parser.extract_data(movie_to_analyse)\n",
+    "parser.write_files()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
+%% Cell type:code id: tags:
+
+``` python
+from typing import List, Dict
+import codecs
+import numpy
+import os
+import bs4
+import httplib2
+```
+
+%% Cell type:code id: tags:
+
+``` python
+class Actor:
+    """
+    An actor and the movies in which this actor has played.
+
+    |
+
+    The instance attributes are:
+
+    actor_id:
+        Identifier of the actor.
+
+    name:
+        Name of the actor.
+
+    movies:
+        List of movies in which the actor has played (empty right after construction).
+    """
+
+    # -------------------------------------------------------------------------
+    actor_id: int
+    name: str
+    movies: List["Movie"]
+
+    # -------------------------------------------------------------------------
+    def __init__(self, actor_id: int, name: str):
+        """
+        Build an actor that is not yet linked to any movie.
+
+        :param actor_id: Identifier of the actor.
+        :param name: Name of the actor.
+        """
+
+        # Every instance gets its own, initially empty, list of movies
+        self.movies = list()
+        self.actor_id, self.name = actor_id, name
+```
+
+%% Cell type:code id: tags:
+
+``` python
+class Movie:
+    """
+    A movie_to_analyse together with its cast and its summary.
+
+    |
+
+    The instance attributes are:
+
+    movie_id:
+        Identifier of the movie_to_analyse.
+
+    name:
+        Name of the movie_to_analyse in the IMDb database.
+
+    actors:
+        List of actors who have played in the movie_to_analyse (empty right after construction).
+
+    summary:
+        Summary of the movie_to_analyse (empty string right after construction).
+    """
+
+    # -------------------------------------------------------------------------
+    movie_id: int
+    name: str
+    actors: List[Actor]
+    summary: str
+
+    # -------------------------------------------------------------------------
+    def __init__(self, movie_id: int, name: str):
+        """
+        Build a movie_to_analyse without any actor and with an empty summary.
+
+        :param movie_id: Identifier of the movie_to_analyse.
+        :param name: Name of the movie_to_analyse.
+        """
+
+        # The cast and the summary are filled in later on
+        self.summary = ""
+        self.actors = list()
+        self.movie_id, self.name = movie_id, name
+```
+
+%% Cell type:code id: tags:
+
+``` python
+class Parser:
+    """
+    Download movie pages, extract the summary and the cast of each movie, and write the results to text files.
+
+    |
+
+    The instance attributes are:
+
+    output:
+        Directory where to store the resulting data (a path separator is appended to it).
+
+    basic_url:
+        Beginning of the URL used to retrieve the HTML page of a movie_to_analyse.
+
+    actors:
+        Dictionary of actors (the identifiers are the key).
+
+    actors_by_name:
+        Dictionary of actors (the names are the key).
+
+    movies:
+        Dictionary of movies (the identifiers are the key).
+    """
+
+    # -------------------------------------------------------------------------
+    output: str
+    basic_url: str
+    actors: Dict[int, Actor]
+    actors_by_name: Dict[str, Actor]
+    movies: Dict[int, Movie]
+
+    # -------------------------------------------------------------------------
+    def __init__(self, output: str, basic_url: str) -> None:
+        """
+        Initialize the parser.
+
+        :param output: Directory where to store the results.
+        :param basic_url: Beginning part of the URL of a movie_to_analyse page.
+        """
+
+        self.output = output + os.sep
+        self.basic_url = basic_url
+        self.actors = dict()
+        self.actors_by_name = dict()
+        self.movies = dict()
+
+    # -------------------------------------------------------------------------
+    def extract_data(self, movie: str) -> None:
+        """
+        Extract the "useful" data from the page. In practice, the following steps are executed:
+
+        1. Build the URL of the movie_to_analyse page.
+
+        2. Create a new Movie instance and add it to the dictionary of movies.
+
+        3. Download the HTML page and use an instance of BeautifulSoup to parse.
+
+        4. Extract all "div" tags and analyze those whose first class starts with "GenresAndPlot" (summary of the
+        movie_to_analyse) and those of the classes "ipc-shoveler" and "title-cast__grid" (cast of the movie_to_analyse).
+
+        Each summary found is also printed on the standard output.
+
+        :param movie: Last part of the URL of the movie_to_analyse page; it is also stored as the name of the movie.
+        """
+
+        url = self.basic_url + movie
+
+        doc_id = len(self.movies) + 1  # First movie_id = 1
+        new_movie = Movie(doc_id, movie)
+        self.movies[doc_id] = new_movie
+
+        # Download the HTML and parse it through Beautifulsoup
+        h = httplib2.Http("./docs/.cache")
+        _, content = h.request(url, "GET")
+        soup = bs4.BeautifulSoup(content, "html.parser")
+
+        # Extract the content
+        for div in soup.find_all("div"):
+            # A tag without "class" attribute gives None: treat it as "no class"
+            classes = div.get("class") or []
+            if classes and classes[0].startswith("GenresAndPlot"):
+                for span in div.find_all("span"):
+                    if span.get("data-testid") != "plot-xs_to_m":
+                        continue
+                    text = span.string
+                    if text is None:
+                        # No single text child (e.g. a truncated summary followed by a link): keep the first child
+                        text = span.contents[0] if span.contents else ""
+                    # str() guarantees that write_files() can concatenate the summary
+                    new_movie.summary = str(text).strip()
+                    print(new_movie.summary)
+            elif classes == ['ipc-shoveler', 'title-cast__grid']:
+                self.extract_actors(new_movie, div)
+
+    # -------------------------------------------------------------------------
+    def extract_actors(self, movie, div) -> None:
+        """
+        This function takes the content of a "div" tag to determine whether it contains actors. In practice, the
+        following steps are executed:
+
+        1. Look for the nested "div" tags whose "data-testid" attribute is "title-cast-item".
+
+        2. Extract all the links that begin with "/name" (the links of the classes "ipc-lockup-overlay" and
+        "ipc-focusable" are ignored). The text of each remaining link is taken as the name of an actor.
+
+        3. Each extracted actor is added to the global dictionaries of actors (if not yet known) and to the list of
+        actors of the analyzed movie_to_analyse.
+
+        :param movie: Analyzed movie_to_analyse.
+        :param div: A "div" tag that could contain the actors.
+        """
+
+        for item in div.find_all("div"):
+            if item.get("data-testid") != "title-cast-item":
+                continue
+
+            # Extract all the text of the links beginning with "/name"
+            for link in item.find_all("a"):
+                # .get() avoids a KeyError on links without "href" or "class" attribute
+                href = link.get("href", "")
+                if not href.startswith("/name") or link.get("class") == ["ipc-lockup-overlay", "ipc-focusable"]:
+                    continue
+                if link.string is None:
+                    # The link holds no plain text: there is no name to record
+                    continue
+                actor = link.string.strip()
+
+                # Add the fact that the current actor is in the current movie_to_analyse
+                if actor not in self.actors_by_name:
+                    actor_id = len(self.actors) + 1  # First actor_id = 1
+                    new_actor = Actor(actor_id, actor)
+                    # "actors" is keyed by identifier, "actors_by_name" by name
+                    self.actors[actor_id] = new_actor
+                    self.actors_by_name[actor] = new_actor
+                self.actors_by_name[actor].movies.append(movie)
+                movie.actors.append(self.actors_by_name[actor])
+
+    # -------------------------------------------------------------------------
+    def write_files(self) -> None:
+        """
+        Write all the files. Four things are done:
+
+        1. For each document, create a file (doc*.txt) that contains the summary and the name of
+        the actors.
+
+        2. Create a CSV file "actors.txt" with all the actors and their identifiers.
+
+        3. Build a matrix actors/actors which elements represent the number of times two actors are playing in the same
+        movie_to_analyse.
+
+        4. Create a CSV file "links.txt" that contains all the pairs of actors having played together.
+
+        Every file is opened in a "with" block so that it is flushed and closed as soon as it is written.
+        """
+
+        # Write the clean text
+        for movie in self.movies.values():
+            with codecs.open(self.output + 'doc' + str(movie.movie_id) + ".txt", 'w', "utf-8") as movie_file:
+                movie_file.write(movie.summary + "\n")
+                for actor in movie.actors:
+                    movie_file.write(actor.name + "\n")
+
+        # Write the list of actors
+        with codecs.open(self.output + "actors.txt", 'w', "utf-8") as actors_file:
+            for actor in self.actors.values():
+                actors_file.write(str(actor.actor_id) + ',"' + actor.name + '"\n')
+
+        # Build the matrix actors/actors
+        nb_actors = len(self.actors)
+        matrix = numpy.zeros(shape=(nb_actors, nb_actors))
+        for movie in self.movies.values():
+            cast = movie.actors
+            for pos, first in enumerate(cast):
+                for second in cast[pos + 1:]:
+                    # ! Matrix begins with 0, actors with 1
+                    row, col = first.actor_id - 1, second.actor_id - 1
+                    matrix[row, col] += 1
+                    matrix[col, row] += 1
+
+        # Write only the positive links (upper triangle only: the matrix is symmetric)
+        with codecs.open(self.output + "links.txt", 'w', "utf-8") as links_file:
+            for i in range(0, nb_actors - 1):
+                for j in range(i + 1, nb_actors):
+                    weight = matrix[i, j]
+                    if weight > 0.0:
+                        # ! Matrix begins with 0, actors with 1
+                        links_file.write(str(i + 1) + "," + str(j + 1) + "," + str(weight) + "\n")
+```
+
+%% Cell type:code id: tags:
+
+``` python
+# ----------------------------------------------------------------------------------------------------------------------
+# Settings: where to store the results, where the movie pages are, and which movies to download
+dir_docs = "./docs"
+basic_url_to_analyze = "https://www.imdb.com/title/tt"
+movies = [
+    "0078788",
+    "0068646",
+    "0083891",
+    "0113189",
+]
+
+
+# ----------------------------------------------------------------------------------------------------------------------
+# Download and analyze each movie page with our custom parser, then save the documents, the actors and the links
+parser = Parser(dir_docs, basic_url_to_analyze)
+for movie_to_analyse in movies:
+    parser.extract_data(movie_to_analyse)
+parser.write_files()
+```
+
+%% Output
+
+    A U.S. Army officer serving in Vietnam is tasked with assassinating a renegade Special Forces Colonel who sees himself as a god.
+    The Godfather follows Vito Corleone Don of the Corleone family as he passes the mantel to his son Michael
+    A C.I.A. Agent tries to infiltrate Soviet intelligence to stop a murderous diabolical plot.
+    Years after a friend and fellow 00 agent is killed on a joint mission, a secret space based weapons program known as "GoldenEye" is stolen. James Bond sets out to stop a Russian crime syndic...
+
+%% Cell type:code id: tags:
+
+``` python
+```
--- a/link-analysis/floyd.py
+++ b/link-analysis/floyd.py
+def floyd(mat):
+    """
+    Compute the costs of the shortest paths between all pairs of nodes with the Floyd-Warshall algorithm.
+
+    The matrix is first passed through ``preprocess`` (see preprocess.py), which replaces every 0 entry by a
+    large "no edge" cost. The work is done on a copy, so the matrix given by the caller is left untouched.
+
+    :param mat: Square 2-D NumPy array of edge costs, in which 0 means "no edge".
+    :return: A new array of the same shape holding the shortest-path costs. The diagonal is 0; a pair of nodes
+        without any connecting path keeps a cost at least as large as the "no edge" value.
+    :raises ValueError: If ``mat`` is not square.
+    """
+    # floyd.py has no import of its own: fetch the helper of the sibling module here
+    from preprocess import preprocess
+
+    x = len(mat)
+    if x == 0:
+        # Nothing to compute for an empty matrix
+        return mat.copy()
+    y = len(mat[0])
+    if x != y:
+        raise ValueError("floyd() expects a square matrix, got %d rows and %d columns" % (x, y))
+
+    # preprocess() works in place: give it a copy to protect the caller's data
+    SP = preprocess(mat.copy())
+    for k in range(x):
+        for i in range(x):
+            for j in range(x):
+                # Keep the path going through k when it is cheaper than the best one known so far
+                SP[i, j] = min((SP[i, k] + SP[k, j], SP[i, j]))
+    # The diagonal was replaced by the "no edge" cost: a node is at distance 0 from itself
+    for i in range(x):
+        SP[i, i] = 0
+    return SP
--- a/link-analysis/preprocess.py
+++ b/link-analysis/preprocess.py
+def preprocess(mat, no_edge=100000):
+    """
+    Replace every 0 entry of a cost matrix by a large "no edge" value.
+
+    The matrix is modified in place and is also returned.
+
+    :param mat: NumPy array of costs (the entries are addressed as ``mat[i, j]``).
+    :param no_edge: Value written in place of each 0 entry.
+    :return: The same array object as ``mat``, once modified.
+    """
+    # A boolean mask updates all the zero entries at once and, unlike reading
+    # len(mat[0]), it also works on a matrix without any row.
+    mat[mat == 0] = no_edge
+    return mat
+
--- a/link-analysis/to_complete.py
+++ b/link-analysis/to_complete.py
+import numpy as np
+from math import *
+
+# Symmetric 10 x 10 matrix of 0/1 integers with a zero diagonal.
+# NOTE(review): presumably the adjacency matrix of an undirected graph with
+# 10 nodes (A[i, j] == 1 when i and j are linked) - confirm against the
+# exercise statement.
+A = np.array([[0, 1, 1, 1, 1, 0, 0, 0, 0, 0],
+              [1, 0, 1, 1, 0, 0, 1, 0, 0, 0],
+              [1, 1, 0, 1, 1, 0, 1, 0, 0, 0],
+              [1, 1, 1, 0, 1, 0, 0, 0, 0, 0],
+              [1, 0, 1, 1, 0, 0, 1, 0, 1, 0],
+              [0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
+              [0, 1, 1, 0, 1, 0, 0, 0, 0, 0],
+              [0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
+              [0, 0, 0, 0, 1, 1, 0, 1, 0, 1],
+              [0, 0, 0, 0, 0, 1, 0, 1, 1, 0]], int)
+
+# Result matrix, of the same shape as A and filled with zeros.
+# NOTE(review): the name suggests a "common neighbours" similarity between
+# nodes - confirm against the exercise statement.
+SimCommon = np.zeros(A.shape)
+
+# Visit every ordered pair (i, k) of row indices of A; the body of the inner
+# loop is not written yet (see the placeholder that follows).
+for i in range(len(A)):
+    for k in range(len(A)):
+        
+XXXX
TO BE COMPLETED
+XXXX
\ No newline at end of file
--- a/text-mining/textanalysis.ipynb
+++ b/text-mining/textanalysis.ipynb