diff --git a/data-collection/docextraction.ipynb b/data-collection/docextraction.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..f94bd643b68678b00936d3c4eeb018d9b81a4087
--- /dev/null
+++ b/data-collection/docextraction.ipynb
@@ -0,0 +1,355 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from typing import List, Dict\n",
+    "import codecs\n",
+    "import numpy\n",
+    "import os\n",
+    "import bs4\n",
+    "import httplib2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class Actor:\n",
+    "    \"\"\"\n",
+    "    This class represents an actor.\n",
+    "\n",
+    "    |\n",
+    "\n",
+    "    The instance attributes are:\n",
+    "\n",
+    "    actor_id:\n",
+    "        Identifier of the actor.\n",
+    "\n",
+    "    name:\n",
+    "        Name of the actor.\n",
+    "\n",
+    "    movies:\n",
+    "        List of movies in which the actor has played.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    # -------------------------------------------------------------------------\n",
+    "    actor_id: int\n",
+    "    name: str\n",
+    "    movies: List[\"Movie\"]\n",
+    "\n",
+    "    # -------------------------------------------------------------------------\n",
+    "    def __init__(self, actor_id: int, name: str):\n",
+    "        \"\"\"\n",
+    "        Constructor.\n",
+    "\n",
+    "        :param actor_id: Identifier of the actor.\n",
+    "        :param name: Name of the actor.\n",
+    "        \"\"\"\n",
+    "\n",
+    "        self.actor_id = actor_id\n",
+    "        self.name = name\n",
+    "        self.movies = []"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class Movie:\n",
+    "    \"\"\"\n",
+    "    This class represents a movie.\n",
+    "\n",
+    "    |\n",
+    "\n",
+    "    The instance attributes are:\n",
+    "\n",
+    "    movie_id:\n",
+    "        Identifier of the movie.\n",
+    "\n",
+    "    name:\n",
+    "        Name of the movie in the IMDb database.\n",
+    "\n",
+    "    actors:\n",
+    "        List of actors who have played in the movie.\n",
+    "\n",
+    "    summary:\n",
+    "        Summary of the movie.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    # -------------------------------------------------------------------------\n",
+    "    movie_id: int\n",
+    "    name: str\n",
+    "    actors: List[Actor]\n",
+    "    summary: str\n",
+    "\n",
+    "    # -------------------------------------------------------------------------\n",
+    "    def __init__(self, movie_id: int, name: str):\n",
+    "        \"\"\"\n",
+    "        Constructor.\n",
+    "\n",
+    "        :param movie_id: Identifier of the movie.\n",
+    "        :param name: Name of the movie.\n",
+    "        \"\"\"\n",
+    "\n",
+    "        self.movie_id = movie_id\n",
+    "        self.name = name\n",
+    "        self.actors = []\n",
+    "        self.summary = \"\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class Parser:\n",
+    "    \"\"\"\n",
+    "    This class downloads the IMDb page of each movie and extracts its summary and its actors.\n",
+    "\n",
+    "    |\n",
+    "\n",
+    "    The instance attributes are:\n",
+    "\n",
+    "    output:\n",
+    "        Directory where to store the resulting data.\n",
+    "\n",
+    "    basic_url:\n",
+    "        Beginning of the URL used to retrieve the HTML page of a movie.\n",
+    "\n",
+    "    actors:\n",
+    "        Dictionary of actors (the identifiers are the key).\n",
+    "\n",
+    "    actors_by_name:\n",
+    "        Dictionary of actors (the names are the key).\n",
+    "\n",
+    "    movies:\n",
+    "        Dictionary of movies (the identifiers are the key).\n",
+    "    \"\"\"\n",
+    "\n",
+    "    # -------------------------------------------------------------------------\n",
+    "    output: str\n",
+    "    basic_url: str\n",
+    "    actors: Dict[int, Actor]\n",
+    "    actors_by_name: Dict[str, Actor]\n",
+    "    movies: Dict[int, Movie]\n",
+    "\n",
+    "    # -------------------------------------------------------------------------\n",
+    "    def __init__(self, output: str, basic_url: str) -> None:\n",
+    "        \"\"\"\n",
+    "        Initialize the parser.\n",
+    "\n",
+    "        :param output: Directory where to store the results.\n",
+    "        :param basic_url: Beginning part of the URL of a movie page.\n",
+    "        \"\"\"\n",
+    "\n",
+    "        self.output = output + os.sep\n",
+    "        self.basic_url = basic_url\n",
+    "        self.actors = dict()\n",
+    "        self.actors_by_name = dict()\n",
+    "        self.movies = dict()\n",
+    "\n",
+    "    # -------------------------------------------------------------------------\n",
+    "    def extract_data(self, movie: str) -> None:\n",
+    "        \"\"\"\n",
+    "        Extract the \"useful\" data from the page. In practice, the following steps are executed:\n",
+    "\n",
+    "        1. Build the URL of the movie page.\n",
+    "\n",
+    "        2. Create a new Movie instance and add it to the list.\n",
+    "\n",
+    "        3. Download the HTML page and use an instance of BeautifulSoup to parse it.\n",
+    "\n",
+    "        4. Extract all \"div\" tags and analyze those of the class \"GenresAndPlot\" (summary of the movie) and\n",
+    "           \"ipc-shoveler title-cast__grid\" (cast of the movie).\n",
+    "\n",
+    "        :param movie: IMDb identifier of the movie to analyze.\n",
+    "        \"\"\"\n",
+    "\n",
+    "        url = self.basic_url + movie\n",
+    "\n",
+    "        doc_id = len(self.movies) + 1  # First movie_id = 1\n",
+    "        movie = Movie(doc_id, movie)\n",
+    "        self.movies[doc_id] = movie\n",
+    "\n",
+    "        # Download the HTML and parse it through BeautifulSoup\n",
+    "        h = httplib2.Http(\"./docs/.cache\")\n",
+    "        resp, content = h.request(url, \"GET\")\n",
+    "        soup = bs4.BeautifulSoup(content, \"html.parser\")\n",
+    "\n",
+    "        # Extract the content\n",
+    "        divs = soup.find_all(\"div\")\n",
+    "        for div in divs:\n",
+    "            div_class = div.get(\"class\")\n",
+    "            if str(div_class)[:15] == \"['GenresAndPlot\":\n",
+    "                spans = div.find_all(\"span\")\n",
+    "                for span in spans:\n",
+    "                    span_data = span.get(\"data-testid\")\n",
+    "                    if span_data == \"plot-xs_to_m\":\n",
+    "                        try:\n",
+    "                            movie.summary = span.string.strip()\n",
+    "                        except AttributeError:\n",
+    "                            movie.summary = span.contents[0]\n",
+    "\n",
+    "                        print(movie.summary)\n",
+    "            elif div_class == ['ipc-shoveler', 'title-cast__grid']:\n",
+    "                self.extract_actors(movie, div)\n",
+    "\n",
+    "    # -------------------------------------------------------------------------\n",
+    "    def extract_actors(self, movie, div) -> None:\n",
+    "        \"\"\"\n",
+    "        This function takes the content of a \"div\" tag to determine whether it contains actors. In practice, the\n",
+    "        following steps are executed:\n",
+    "\n",
+    "        1. Look for the \"div\" tags whose \"data-testid\" attribute is \"title-cast-item\".\n",
+    "\n",
+    "        2. Extract all the links that begin with \"/name\". These are links to actor pages and the name of\n",
+    "           the actor is extracted.\n",
+    "\n",
+    "        3. Each extracted actor is added to the global list of actors and to the list of actors of the analyzed movie.\n",
+    "\n",
+    "        :param movie: Analyzed movie.\n",
+    "        :param div: A \"div\" tag that could contain the actors.\n",
+    "        \"\"\"\n",
+    "\n",
+    "        # Look for the nested \"div\" tags that may contain the actors\n",
+    "        divs = div.find_all(\"div\")\n",
+    "        if not divs:\n",
+    "            return\n",
+    "\n",
+    "        # Extract all the text of the links beginning with \"name\"\n",
+    "        for div in divs:\n",
+    "            div_class = div.get(\"data-testid\")\n",
+    "            if div_class == \"title-cast-item\":\n",
+    "                for link in div.find_all(\"a\"):\n",
+    "                    href = link[\"href\"]\n",
+    "                    if href[:5] != \"/name\" or link[\"class\"] == [\"ipc-lockup-overlay\", \"ipc-focusable\"]:\n",
+    "                        continue\n",
+    "                    actor = link.string.strip()\n",
+    "\n",
+    "                    # Add the fact that the current actor is in the current movie\n",
+    "                    if actor not in self.actors_by_name.keys():\n",
+    "                        actor_id = len(self.actors) + 1  # First actor_id = 1\n",
+    "                        new_actor = Actor(actor_id, actor)\n",
+    "                        self.actors[actor_id] = new_actor\n",
+    "                        self.actors_by_name[actor] = new_actor\n",
+    "                    self.actors_by_name[actor].movies.append(movie)\n",
+    "                    movie.actors.append(self.actors_by_name[actor])\n",
+    "\n",
+    "    # -------------------------------------------------------------------------\n",
+    "    def write_files(self) -> None:\n",
+    "        \"\"\"\n",
+    "        Write all the files. Four things are done:\n",
+    "\n",
+    "        1. For each document, create a file (doc*.txt) that contains the summary and the names of\n",
+    "           the actors.\n",
+    "\n",
+    "        2. Create a CSV file \"actors.txt\" with all the actors and their identifiers.\n",
+    "\n",
+    "        3. Build an actors/actors matrix whose elements represent the number of times two actors have played in the same\n",
+    "           movie.\n",
+    "\n",
+    "        4. Create a CSV file \"links.txt\" that contains all the pairs of actors having played together.\n",
+    "        \"\"\"\n",
+    "\n",
+    "        # Write the clean text\n",
+    "        for movie in self.movies.values():\n",
+    "            movie_file = codecs.open(self.output + 'doc' + str(movie.movie_id) + \".txt\", 'w', \"utf-8\")\n",
+    "            movie_file.write(movie.summary + \"\\n\")\n",
+    "            for actor in movie.actors:\n",
+    "                movie_file.write(actor.name + \"\\n\")\n",
+    "\n",
+    "        # Write the list of actors\n",
+    "        actors_file = codecs.open(self.output + \"actors.txt\", 'w', \"utf-8\")\n",
+    "        for actor in self.actors.values():\n",
+    "            actors_file.write(str(actor.actor_id) + ',\"' + actor.name + '\"\\n')\n",
+    "\n",
+    "        # Build the matrix actors/actors\n",
+    "        matrix = numpy.zeros(shape=(len(self.actors), len(self.actors)))\n",
+    "        for movie in self.movies.values():\n",
+    "            for i in range(0, len(movie.actors) - 1):\n",
+    "                for j in range(i + 1, len(movie.actors)):\n",
+    "                    # ! Matrix begins with 0, actors with 1\n",
+    "                    matrix[movie.actors[i].actor_id - 1, movie.actors[j].actor_id - 1] += 1\n",
+    "                    matrix[movie.actors[j].actor_id - 1, movie.actors[i].actor_id - 1] += 1\n",
+    "\n",
+    "        # Write only the positive links\n",
+    "        links_file = codecs.open(self.output + \"links.txt\", 'w', \"utf-8\")\n",
+    "        for i in range(0, len(self.actors) - 1):\n",
+    "            for j in range(i + 1, len(self.actors)):\n",
+    "                weight = matrix[i, j]\n",
+    "                if weight > 0.0:\n",
+    "                    # ! Matrix begins with 0, actors with 1\n",
+    "                    links_file.write(str(i + 1) + \",\" + str(j + 1) + \",\" + str(weight) + \"\\n\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "A U.S. Army officer serving in Vietnam is tasked with assassinating a renegade Special Forces Colonel who sees himself as a god.\n",
+      "The Godfather follows Vito Corleone Don of the Corleone family as he passes the mantel to his son Michael\n",
+      "A C.I.A. Agent tries to infiltrate Soviet intelligence to stop a murderous diabolical plot.\n",
+      "Years after a friend and fellow 00 agent is killed on a joint mission, a secret space based weapons program known as \"GoldenEye\" is stolen. James Bond sets out to stop a Russian crime syndic...\n"
+     ]
+    }
+   ],
+   "source": [
+    "# ----------------------------------------------------------------------------------------------------------------------\n",
+    "# Initialize a list of movies to download\n",
+    "movies = [\"0078788\", \"0068646\", \"0083891\", \"0113189\"]\n",
+    "basic_url_to_analyze = \"https://www.imdb.com/title/tt\"\n",
+    "dir_docs = \"./docs\"\n",
+    "\n",
+    "\n",
+    "# ----------------------------------------------------------------------------------------------------------------------\n",
+    "# Use our custom parser to download each HTML page and save the actors and the links\n",
+    "parser = Parser(dir_docs, basic_url_to_analyze)\n",
+    "for movie_to_analyse in movies:\n",
+    "    parser.extract_data(movie_to_analyse)\n",
+    "parser.write_files()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/link-analysis/floyd.py b/link-analysis/floyd.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ef4123a5d14a76518646723f2df2e28036cd974
--- /dev/null
+++ b/link-analysis/floyd.py
@@ -0,0 +1,18 @@
+from preprocess import preprocess
+
+
+def floyd(mat):
+    # Floyd-Warshall: compute the shortest-path distance between every pair of nodes
+    x = len(mat)
+    y = len(mat[0])
+    SP = preprocess(mat)
+    if x == y:
+        for k in range(x):
+            for i in range(x):
+                for j in range(x):
+                    SP[i, j] = min(SP[i, k] + SP[k, j], SP[i, j])
+    else:
+        print('error: the matrix must be square')
+    for i in range(x):
+        SP[i, i] = 0
+    return SP
diff --git a/link-analysis/preprocess.py b/link-analysis/preprocess.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d97642b19a82314d92614b6db892cf7226c89f7
--- /dev/null
+++ b/link-analysis/preprocess.py
@@ -0,0 +1,10 @@
+def preprocess(mat):
+    # Replace the zeros (missing edges) by a large value acting as infinity
+    x = len(mat)
+    y = len(mat[0])
+    for i in range(x):
+        for j in range(y):
+            if mat[i, j] == 0:
+                mat[i, j] = 100000
+    return mat
+
diff --git a/link-analysis/to_complete.py b/link-analysis/to_complete.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e6afe29cfa9f7f2257f91fa052f3d58f0d7a7c1
--- /dev/null
+++ b/link-analysis/to_complete.py
@@ -0,0 +1,22 @@
+import numpy as np
+from math import *
+
+A = np.array([[0, 1, 1, 1, 1, 0, 0, 0, 0, 0],
+              [1, 0, 1, 1, 0, 0, 1, 0, 0, 0],
+              [1, 1, 0, 1, 1, 0, 1, 0, 0, 0],
+              [1, 1, 1, 0, 1, 0, 0, 0, 0, 0],
+              [1, 0, 1, 1, 0, 0, 1, 0, 1, 0],
+              [0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
+              [0, 1, 1, 0, 1, 0, 0, 0, 0, 0],
+              [0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
+              [0, 0, 0, 0, 1, 1, 0, 1, 0, 1],
+              [0, 0, 0, 0, 0, 1, 0, 1, 1, 0]], int)
+
+SimCommon = np.zeros(A.shape)
+
+for i in range(len(A)):
+    for k in range(len(A)):
+
+XXXX
+TO COMPLETE
+XXXX
\ No newline at end of file
diff --git a/text-mining/textanalysis.ipynb b/text-mining/textanalysis.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..ee4d16255c538bce246423c8b17bcee820274b25
--- /dev/null
+++ b/text-mining/textanalysis.ipynb
@@ -0,0 +1,598 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from typing import Dict, List, Optional\n",
+    "import nltk\n",
+    "import math\n",
+    "import string\n",
+    "import numpy\n",
+    "from nltk.corpus import stopwords\n",
+    "from gensim.models.doc2vec import TaggedDocument, Doc2Vec\n",
+    "from sklearn.metrics.pairwise import cosine_similarity\n",
+    "from os import walk, sep"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Ensure the needed nltk resources have been downloaded and are up to date\n",
+    "nltk.download('punkt')\n",
+    "nltk.download('stopwords')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class Token:\n",
+    "    \"\"\"\n",
+    "    Class representing a given token. It stores the string representing the token, its identifier and the\n",
+    "    identifiers of the documents containing it.\n",
+    "\n",
+    "    |\n",
+    "\n",
+    "    The instance attributes are:\n",
+    "\n",
+    "    token_id:\n",
+    "        Identifier of the token.\n",
+    "    token:\n",
+    "        String representing the token.\n",
+    "    docs:\n",
+    "        Identifiers of documents containing the token.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    # -------------------------------------------------------------------------\n",
+    "    token_id: int\n",
+    "    token: str\n",
+    "    docs: List[int]\n",
+    "\n",
+    "    # -------------------------------------------------------------------------\n",
+    "    def __init__(self, token_id: int, token: str):\n",
+    "        \"\"\"\n",
+    "        Constructor.\n",
+    "\n",
+    "        :param token_id: Identifier of the token.\n",
+    "        :param token: String representing the token.\n",
+    "        \"\"\"\n",
+    "        self.token_id = token_id\n",
+    "        self.token = token\n",
+    "        self.docs = []\n",
+    "\n",
+    "    # -------------------------------------------------------------------------\n",
+    "    def get_idf(self, nb_docs: int) -> float:\n",
+    "        \"\"\"\n",
+    "        Compute the IDF factor of a token.\n",
+    "\n",
+    "        :param nb_docs: Total number of documents in the corpus.\n",
+    "        :return: IDF factor.\n",
+    "        \"\"\"\n",
+    "\n",
+    "        if len(self.docs) == 0:\n",
+    "            return 0.0\n",
+    "        return math.log(float(nb_docs) / float(len(self.docs)))\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class Doc:\n",
+    "    \"\"\"\n",
+    "    This class represents an instance of a document.\n",
+    "\n",
+    "    |\n",
+    "\n",
+    "    The instance attributes are:\n",
+    "\n",
+    "    url:\n",
+    "        URL of the document (if defined).\n",
+    "    doc_id:\n",
+    "        Identifier of the document.\n",
+    "    text:\n",
+    "        Text of the document to analyse.\n",
+    "    vector:\n",
+    "        Vector representing the document.\n",
+    "    tokens:\n",
+    "        List of tokens in order of appearance. The same token may appear several times.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    # -------------------------------------------------------------------------\n",
+    "    url: Optional[str]\n",
+    "    doc_id: int\n",
+    "    text: str\n",
+    "    vector: Optional[numpy.ndarray]\n",
+    "    tokens: Optional[List[Token]]\n",
+    "\n",
+    "    # -------------------------------------------------------------------------\n",
+    "    def __init__(self, doc_id: int, text: str, url: Optional[str] = None):\n",
+    "        \"\"\"\n",
+    "        Constructor.\n",
+    "\n",
+    "        :param doc_id: Identifier of the document.\n",
+    "        :param text: Text of the document (raw text).\n",
+    "        :param url: URL of the document (if any).\n",
+    "        \"\"\"\n",
+    "        self.url = url\n",
+    "        self.doc_id = doc_id\n",
+    "        self.text = text\n",
+    "        self.vector = None\n",
+    "        self.tokens = None\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class DocCorpus:\n",
+    "    \"\"\"\n",
+    "    This class represents a corpus of documents and the corresponding dictionary of the tokens they contain.\n",
+    "\n",
+    "    |\n",
+    "\n",
+    "    The instance attributes are:\n",
+    "\n",
+    "    docs:\n",
+    "        List of documents.\n",
+    "    tokens:\n",
+    "        Dictionary of tokens (strings are the key).\n",
+    "    ids:\n",
+    "        Dictionary of tokens (identifiers are the key).\n",
+    "    method:\n",
+    "        String representing the method used for the analysis (\"Bag of words\" or \"Doc2Vec\").\n",
+    "    nb_dims:\n",
+    "        Number of dimensions of the semantic space.\n",
+    "    stopwords:\n",
+    "        List of stopwords to eliminate from the analysis. By default, it's the classic English list.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    # -------------------------------------------------------------------------\n",
+    "    docs: List[Doc]\n",
+    "    tokens: Dict[str, Token]\n",
+    "    ids: Dict[int, Token]\n",
+    "    method: str\n",
+    "    nb_dims: int\n",
+    "    stopwords: List[str]\n",
+    "\n",
+    "    # -------------------------------------------------------------------------\n",
+    "    def __init__(self):\n",
+    "        \"\"\"\n",
+    "        Constructor.\n",
+    "        \"\"\"\n",
+    "        self.docs = []\n",
+    "        self.tokens = dict()\n",
+    "        self.ids = dict()\n",
+    "        self.method = \"Doc2Vec\"\n",
+    "        self.nb_dims = 0\n",
+    "        self.stopwords = stopwords.words('english')\n",
+    "        print(self.stopwords)\n",
+    "\n",
+    "    # -------------------------------------------------------------------------\n",
+    "    def set_method(self, name) -> None:\n",
+    "        \"\"\"\n",
+    "        Change the method used to build the vectors.\n",
+    "\n",
+    "        :param name: Name of the method.\n",
+    "        \"\"\"\n",
+    "        self.method = name\n",
+    "\n",
+    "    # -------------------------------------------------------------------------\n",
+    "    def add_doc(self, new_doc: str, url: Optional[str] = None) -> None:\n",
+    "        \"\"\"\n",
+    "        Add a string representing a document to the corpus and provide an\n",
+    "        identifier to the document.\n",
+    "\n",
+    "        :param new_doc: New document.\n",
+    "        :param url: URL of the document (if any).\n",
+    "        \"\"\"\n",
+    "        new_id = len(self.docs)\n",
+    "        self.docs.append(Doc(new_id, new_doc, url))\n",
+    "\n",
+    "    # -------------------------------------------------------------------------\n",
+    "    def add_docs(self, docs: List[str]) -> None:\n",
+    "        \"\"\"\n",
+    "        Add a list of strings representing documents to the corpus. Each document receives an\n",
+    "        identifier.\n",
+    "\n",
+    "        :param docs: List of documents.\n",
+    "        \"\"\"\n",
+    "        for cur_doc in docs:\n",
+    "            self.add_doc(cur_doc)\n",
+    "\n",
+    "    # -------------------------------------------------------------------------\n",
+    "    def build_vectors(self) -> None:\n",
+    "        \"\"\"\n",
+    "        Build the vectors for the documents of the corpus based on the current method.\n",
+    "        \"\"\"\n",
+    "\n",
+    "        if self.method == \"Doc2Vec\":\n",
+    "            self.build_doc2vec()\n",
+    "        elif self.method == \"Bag of words\":\n",
+    "            self.build_bag_of_words()\n",
+    "        else:\n",
+    "            raise ValueError(\"'\" + self.method + \"': Invalid building method\")\n",
+    "\n",
+    "    # -------------------------------------------------------------------------\n",
+    "    def get_doc_token_matrix(self) -> numpy.ndarray:\n",
+    "        \"\"\"\n",
+    "        Build a document-token matrix with the weights as values.\n",
+    "\n",
+    "        :return: Document-token matrix.\n",
+    "        \"\"\"\n",
+    "\n",
+    "        matrix = numpy.zeros(shape=(len(self.docs), self.nb_dims))\n",
+    "        for cur_doc in self.docs:\n",
+    "            i = 0\n",
+    "            for token in cur_doc.vector:\n",
+    "                matrix[cur_doc.doc_id, i] = token\n",
+    "                i += 1\n",
+    "        return matrix\n",
+    "\n",
+    "    # -------------------------------------------------------------------------\n",
+    "    def extract_tokens(self) -> None:\n",
+    "        \"\"\"\n",
+    "        Extract the tokens from the text of the documents. In practice, for each document, the method\n",
+    "        does the following steps:\n",
+    "\n",
+    "        1. The text is transformed into lowercase.\n",
+    "\n",
+    "        2. The text is tokenised.\n",
+    "\n",
+    "        3. Stopwords are removed.\n",
+    "\n",
+    "        The method works incrementally. Once a document is treated, it will not be re-treated in successive\n",
+    "        calls.\n",
+    "        \"\"\"\n",
+    "\n",
+    "        stem = nltk.stem.SnowballStemmer(\"english\")\n",
+    "        for cur_doc in self.docs:\n",
+    "            if cur_doc.tokens is not None:\n",
+    "                continue\n",
+    "            cur_doc.tokens = []\n",
+    "            text = cur_doc.text.lower()\n",
+    "            for extracted_token in nltk.word_tokenize(text):\n",
+    "\n",
+    "                # Retain only the stems of tokens that are neither punctuation nor stopwords\n",
+    "                if extracted_token in string.punctuation: continue\n",
+    "                if extracted_token in self.stopwords: continue\n",
+    "                token_str = stem.stem(extracted_token)\n",
+    "\n",
+    "                # Find the identifier of the current token in the dictionary\n",
+    "                if token_str not in self.tokens.keys():\n",
+    "                    token_id = len(self.tokens)\n",
+    "                    token = Token(token_id, token_str)\n",
+    "                    self.tokens[token_str] = token\n",
+    "                    self.ids[token_id] = token\n",
+    "                    self.nb_dims = len(self.tokens)\n",
+    "                else:\n",
+    "                    token = self.tokens[token_str]\n",
+    "\n",
+    "                # Add the token\n",
+    "                cur_doc.tokens.append(token)\n",
+    "\n",
+    "                # Add a reference to the document if necessary\n",
+    "                if cur_doc.doc_id not in token.docs:\n",
+    "                    token.docs.append(cur_doc.doc_id)\n",
+    "\n",
+    "    # -------------------------------------------------------------------------\n",
+    "    def build_bag_of_words(self) -> None:\n",
+    "        \"\"\"\n",
+    "        Build the vectors of the corpus using the bag of words approach.\n",
+    "        \"\"\"\n",
+    "\n",
+    "        vectors = []\n",
+    "        self.extract_tokens()\n",
+    "\n",
+    "        # Step 1: For each document, compute the relative frequencies of each token (TF).\n",
+    "        for cur_doc in self.docs:\n",
+    "\n",
+    "            vector = dict()  # Dictionary representing a vector of pairs (token_id, nb_occurrences)\n",
+    "            nb_occurrences = 0\n",
+    "\n",
+    "            for token in cur_doc.tokens:\n",
+    "                nb_occurrences += 1\n",
+    "\n",
+    "                # Add an occurrence of the current token in the vector\n",
+    "                if token.token_id not in vector.keys():\n",
+    "                    vector[token.token_id] = 1\n",
+    "                else:\n",
+    "                    vector[token.token_id] += 1\n",
+    "\n",
+    "            # Compute the relative frequencies\n",
+    "            for coord in vector:\n",
+    "                vector[coord] /= float(nb_occurrences)\n",
+    "            vectors.append(vector)\n",
+    "\n",
+    "        # Step 2: Build the vectors by multiplying the relative frequencies by the IDF factor.\n",
+    "        for cur_doc in self.docs:\n",
+    "            cur_doc.vector = numpy.zeros(shape=self.nb_dims)\n",
+    "            vector = vectors[cur_doc.doc_id]\n",
+    "            for token_id in vector:\n",
+    "                weight = vector[token_id] * self.ids[token_id].get_idf(len(self.docs))\n",
+    "                cur_doc.vector[token_id] = weight\n",
+    "\n",
+    "    # -------------------------------------------------------------------------\n",
+    "    def build_doc2vec(self) -> None:\n",
+    "        \"\"\"\n",
+    "        Build the vectors using the doc2vec approach.\n",
+    "        \"\"\"\n",
+    "\n",
+    "        self.extract_tokens()\n",
+    "        corpus = []\n",
+    "        for doc in self.docs:\n",
+    "            tokens = []\n",
+    "            for token in doc.tokens:\n",
+    "                tokens.append(token.token)\n",
+    "            corpus.append(tokens)\n",
+    "        corpus = [\n",
+    "            TaggedDocument(words, ['d{}'.format(idx)])\n",
+    "            for idx, words in enumerate(corpus)\n",
+    "        ]\n",
+    "\n",
+    "        self.nb_dims = 5\n",
+    "        model = Doc2Vec(corpus, vector_size=self.nb_dims, min_count=1)\n",
+    "        for i in range(0, len(self.docs)):\n",
+    "            self.docs[i].vector = model.docvecs[i]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class TokenSorter:\n",
+    "    \"\"\"\n",
+    "    Class to sort a list of tokens by a certain value.\n",
+    "    |\n",
+    "\n",
+    "    The instance attributes are:\n",
+    "\n",
+    "    tokens:\n",
+    "        List of tokens to sort.\n",
+    "    reverse:\n",
+    "        Must the tokens be ranked in descending (True) or ascending (False) order?\n",
+    "    \"\"\"\n",
+    "\n",
+    "    # -------------------------------------------------------------------------\n",
+    "    class TokenRef:\n",
+    "        \"\"\"\n",
+    "        Class to represent a reference to a token.\n",
+    "        \"\"\"\n",
+    "\n",
+    "        # ---------------------------------------------------------------------\n",
+    "        token: Token\n",
+    "        value: float\n",
+    "\n",
+    "        # ---------------------------------------------------------------------\n",
+    "        def __init__(self, token: Token, value: float):\n",
+    "            self.token = token\n",
+    "            self.value = value\n",
+    "\n",
+    "    # -------------------------------------------------------------------------\n",
+    "    tokens: List[TokenRef]\n",
+    "    reverse: bool\n",
+    "\n",
+    "    # -------------------------------------------------------------------------\n",
+    "    def __init__(self):\n",
+    "        \"\"\"\n",
+    "        Constructor.\n",
+    "        \"\"\"\n",
+    "\n",
+    "        self.tokens = []\n",
+    "        self.reverse = False\n",
+    "\n",
+    "    # -------------------------------------------------------------------------\n",
+    "    def build(self, tokens, value, reverse: bool) -> None:\n",
+    "        \"\"\"\n",
+    "        Build the list of tokens to sort.\n",
+    "\n",
+    "        :param tokens: Tokens to sort.\n",
+    "        :param value: Lambda function that will be used to build the value associated with each token to sort.\n",
+    "        :param reverse: Should the tokens be sorted in descending (True) or ascending (False) order?\n",
+    "        \"\"\"\n",
+    "\n",
+    "        for token in tokens.values():\n",
+    "            self.add_token(token, value(token))\n",
+    "        self.reverse = reverse\n",
+    "        self.sort()\n",
+    "\n",
+    "    # -------------------------------------------------------------------------\n",
+    "    def add_token(self, token: Token, value: float) -> None:\n",
+    "        \"\"\"\n",
+    "        Add a token to the list.\n",
+    "\n",
+    "        :param token: Token to add.\n",
+    "        :param value: Value that will be used to sort the tokens.\n",
+    "        \"\"\"\n",
+    "\n",
+    "        self.tokens.append(TokenSorter.TokenRef(token=token, value=float(value)))\n",
+    "\n",
+    "    # -------------------------------------------------------------------------\n",
+    "    def sort(self) -> None:\n",
+    "        \"\"\"\n",
+    "        Sort the tokens.\n",
+    "        \"\"\"\n",
+    "\n",
+    "        self.tokens.sort(reverse=self.reverse, key=lambda token: token.value)\n",
+    "\n",
+    "    # -------------------------------------------------------------------------\n",
+    "    def get_token(self, pos: int) -> str:\n",
+    "        \"\"\"\n",
+    "        Get a given token of the list.\n",
+    "\n",
+    "        :param pos: Position of the token in the list.\n",
+    "        :return: String representing the token.\n",
+    "        \"\"\"\n",
+    "\n",
+    "        return self.tokens[pos].token.token\n",
+    "\n",
+    "    # -------------------------------------------------------------------------\n",
+    "    def get_value(self, pos: int) -> str:\n",
+    "        \"\"\"\n",
+    "        Get the value of a given token in the list.\n",
+    "\n",
+    "        :param pos: Position of the token in the list.\n",
+    "        :return: String representing the value of the token used for the sorting.\n",
+    "        \"\"\"\n",
+    "\n",
+    "        return str(self.tokens[pos].value)\n",
+    "\n",
+    "    # -------------------------------------------------------------------------\n",
+    "    def print(self, title: str, nb: int) -> None:\n",
+    "        \"\"\"\n",
+    "        Print a given number of top ranked tokens with a title and their values.\n",
+    "\n",
+    "        :param title: Title to print.\n",
+    "        :param nb: Number of tokens to print.\n",
+    "        \"\"\"\n",
+    "        print(title)\n",
+    "        if nb > len(self.tokens):\n",
+    "            nb = len(self.tokens)\n",
+    "        for i in range(0, nb):\n",
+    "            print(\" Token: '\" + self.get_token(i) + \"' (\" + self.get_value(i) + \")\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def print_matrix(name: str, matrix: numpy.ndarray) -> None:\n",
+    "    \"\"\"\n",
+    "    Simple function to print a small matrix nicely.\n",
+    "\n",
+    "    :param name: Name of the matrix.\n",
+    "    :param matrix: Matrix to print.\n",
+    "    \"\"\"\n",
+    "    nb_lines = matrix.shape[0]\n",
+    "    nb_cols = matrix.shape[1]\n",
+    "    spaces = \" \" * (len(name) + 1)\n",
+    "    title_line = nb_lines % 2\n",
+    "    for i in range(0, nb_lines):\n",
+    "        if i == title_line:\n",
+    "            print(name + \"=\", end=\"\")\n",
+    "        else:\n",
+    "            print(spaces, end=\"\")\n",
+    "        print(\"( \", end=\"\")\n",
+    "        for j in range(0, nb_cols):\n",
+    "            print(\"{:.3f}\".format(matrix[i, j]), end=\" \")\n",
+    "        print(\")\")\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create a corpus instance\n",
+    "the_corpus = DocCorpus()\n",
+    "\n",
+    "# Look for all the files in a directory\n",
+    "files = []\n",
+    "dir_to_analyse = \"./docs\"\n",
+    "for (_, _, file_names) in walk(dir_to_analyse):\n",
+    "    files.extend(file_names)\n",
+    "    break\n",
+    "\n",
+    "# Add the content of the files to the corpus\n",
+    "for doc_to_analyse in files:\n",
+    "    # Treat only files beginning with \"doc\"\n",
+    "    if doc_to_analyse[:3] != \"doc\":\n",
+    "        continue\n",
+    "\n",
+    "    filename = dir_to_analyse + sep + doc_to_analyse\n",
+    "    with open(file=filename, mode=\"r\", encoding=\"utf-8\") as file:\n",
+    "        the_corpus.add_doc(file.read(), filename)\n",
+    "\n",
+    "# Extract the tokens\n",
+    "the_corpus.extract_tokens()\n",
+    "\n",
+    "\n",
+    "# ----------------------------------------------------------------------------------------------------------------------\n",
+    "# Sort the tokens by the number of documents in which they appear\n",
+    "sort_by_docs = TokenSorter()\n",
+    "sort_by_docs.build(tokens=the_corpus.tokens, value=lambda token: len(token.docs), reverse=True)\n",
+    "sort_by_docs.print(title=\"Most appearing tokens (Nb Documents):\", nb=5)\n",
+    "\n",
+    "# Sort the tokens by their idf factor\n",
+    "sort_by_iDF = TokenSorter()\n",
+    "sort_by_iDF.build(tokens=the_corpus.tokens, value=lambda token: token.get_idf(len(the_corpus.docs)), reverse=True)\n",
+    "sort_by_iDF.print(title=\"Most discriminant tokens (idf):\", nb=5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def corpus_analysis(corpus: DocCorpus, method: str) -> None:\n",
+    "    \"\"\"\n",
+    "    Make a small analysis of a corpus.\n",
+    "\n",
+    "    :param corpus: Corpus to analyse.\n",
+    "    :param method: Method to use for the analysis.\n",
+    "    \"\"\"\n",
+    "    print(\"\\n---- \" + method + \" ----\")\n",
+    "    corpus.set_method(method)\n",
+    "    corpus.build_vectors()\n",
+    "    matrix = corpus.get_doc_token_matrix()\n",
+    "    print_matrix(\"Docs\", matrix)\n",
+    "    for i in range(0, len(corpus.docs) - 1):\n",
+    "        # Take a vector and build the two-dimensional matrix needed by cosine_similarity\n",
+    "        vec1 = matrix[i].reshape(1, -1)\n",
+    "\n",
+    "        for j in range(i + 1, len(corpus.docs)):\n",
+    "            # Take a vector and build the two-dimensional matrix needed by cosine_similarity\n",
+    "            vec2 = matrix[j].reshape(1, -1)\n",
+    "\n",
+    "            # Compute and display the similarity\n",
+    "            print(\"\\tSim(doc\" + str(i) + \",doc\" + str(j) + \")=\" + \"{:.3f}\".format(cosine_similarity(vec1, vec2)[0, 0]))\n",
+    "\n",
+    "\n",
+    "# ----------------------------------------------------------------------------------------------------------------------\n",
+    "corpus_analysis(corpus=the_corpus, method=\"Bag of words\")\n",
+    "corpus_analysis(corpus=the_corpus, method=\"Doc2Vec\")\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}