textanalysis.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from typing import Dict, List, Optional\n",
    "import nltk\n",
    "import math\n",
    "import string\n",
    "import numpy\n",
    "from nltk.corpus import stopwords\n",
    "from gensim.models.doc2vec import TaggedDocument, Doc2Vec\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "from os import walk, sep"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ensure the needed nltk resources have been downloaded and are up to date\n",
    "nltk.download('punkt')\n",
    "nltk.download('stopwords')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Token:\n",
    "    \"\"\"\n",
    "    Class representing a given token. It stores the string representing the token, its identifier and the number of\n",
    "    documents\n",
    "\n",
    "    |\n",
    "\n",
    "    The instance attributes are:\n",
    "\n",
    "    token_id:\n",
    "        Identifier of the token.\n",
    "    token:\n",
    "        String representing the token.\n",
    "    docs:\n",
    "        Identifiers of documents containing the token.\n",
    "    \"\"\"\n",
    "\n",
    "    # -------------------------------------------------------------------------\n",
    "    token_id: int\n",
    "    token: str\n",
    "    docs: List[int]\n",
    "\n",
    "    # -------------------------------------------------------------------------\n",
    "    def __init__(self, token_id: int, token: str):\n",
    "        \"\"\"\n",
    "        Constructor.\n",
    "\n",
    "        :param token_id: Identifier of the token.\n",
    "        :param token: String representing the token.\n",
    "        \"\"\"\n",
    "        self.token_id = token_id\n",
    "        self.token = token\n",
    "        self.docs = []\n",
    "\n",
    "    # -------------------------------------------------------------------------\n",
    "    def get_idf(self, nb_docs: int) -> float:\n",
    "        \"\"\"\n",
    "        Compute the IDF factor of a token.\n",
    "\n",
    "        :param nb_docs: Total number of documents in the corpus.\n",
    "        :return: IDF factor.\n",
    "        \"\"\"\n",
    "\n",
    "        if len(self.docs) == 0:\n",
    "            return 0.0\n",
    "        return math.log(float(nb_docs) / float(len(self.docs)))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Doc:\n",
    "    \"\"\"\n",
    "    This class represents an instance of a document.\n",
    "\n",
    "    |\n",
    "\n",
    "    The instance attributes are:\n",
    "\n",
    "    url:\n",
    "        URL of the document (if defined).\n",
    "    doc_id:\n",
    "        Identifier of the document.\n",
    "    text:\n",
    "        Text of the document to analyse.\n",
    "    vector:\n",
    "        Vector representing the document.\n",
    "    tokens:\n",
    "        List of tokens i order of appearances. A same token may appear several times.\n",
    "    \"\"\"\n",
    "\n",
    "    # -------------------------------------------------------------------------\n",
    "    url: Optional[str]\n",
    "    doc_id: int\n",
    "    text: str\n",
    "    vector: numpy.ndarray\n",
    "    tokens: List[Token]\n",
    "\n",
    "    # -------------------------------------------------------------------------\n",
    "    def __init__(self, doc_id: int, text: str, url: Optional[str] = None):\n",
    "        \"\"\"\n",
    "        Constructor.\n",
    "\n",
    "        :param doc_id:\n",
    "        :param text: Text of the document (brut).\n",
    "        :param url: URL of the document (if any).\n",
    "        \"\"\"\n",
    "        self.url = url\n",
    "        self.doc_id = doc_id\n",
    "        self.text = text\n",
    "        self.vector = None\n",
    "        self.tokens = None\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class DocCorpus:\n",
    "    \"\"\"\n",
    "    This class represents a corpus of documents and the corresponding dictionary of tokens contained.\n",
    "\n",
    "    |\n",
    "\n",
    "    The instance attributes are:\n",
    "\n",
    "    docs:\n",
    "        List of documents.\n",
    "    tokens:\n",
    "        Dictionary of tokens (strings are the key).\n",
    "    ids:\n",
    "        Dictionary of tokens (identifiers are the key).\n",
    "    method:\n",
    "        String representing the method used for analysing (\"Bag of words\" or \"Doc2Vec\").\n",
    "    nb_dims:\n",
    "        Number of dimensions of the semantic space.\n",
    "    stopwords:\n",
    "        List of stopwords to eliminate from the analysis. By default, it's the classic English list.\n",
    "    \"\"\"\n",
    "\n",
    "    # -------------------------------------------------------------------------\n",
    "    docs = List[Doc]\n",
    "    tokens: Dict[str, Token]\n",
    "    ids: Dict[int, Token]\n",
    "    method: str\n",
    "    nb_dims: int\n",
    "    stopwords: List[str]\n",
    "\n",
    "    # -------------------------------------------------------------------------\n",
    "    def __init__(self):\n",
    "        \"\"\"\n",
    "        Constructor.\n",
    "        \"\"\"\n",
    "        self.docs = []\n",
    "        self.tokens = dict()\n",
    "        self.ids = dict()\n",
    "        self.method = \"Doc2Vec\"\n",
    "        self.nb_dims = 0\n",
    "        self.stopwords = stopwords.words('english')\n",
    "        print(self.stopwords)\n",
    "\n",
    "    # -------------------------------------------------------------------------\n",
    "    def set_method(self, name) -> None:\n",
    "        \"\"\"\n",
    "        Change the parameter.\n",
    "\n",
    "        :param name: Name of the method.\n",
    "        \"\"\"\n",
    "        self.method = name\n",
    "\n",
    "    # -------------------------------------------------------------------------\n",
    "    def add_doc(self, new_doc: str, url: Optional[str] = None) -> None:\n",
    "        \"\"\"\n",
    "        Add a string representing a document to the corpus and provides an\n",
    "        identifier to the document.\n",
    "\n",
    "        :param new_doc: New document.\n",
    "        :param url: URL of the document (if any)\n",
    "        \"\"\"\n",
    "        new_id = len(self.docs)\n",
    "        self.docs.append(Doc(new_id, new_doc, url))\n",
    "\n",
    "    # -------------------------------------------------------------------------\n",
    "    def add_docs(self, docs: List[str]) -> None:\n",
    "        \"\"\"\n",
    "        Add a list of strings representing documents to the corpus. Each document receives an\n",
    "        identifier.\n",
    "\n",
    "        :param docs: List of documents.\n",
    "        \"\"\"\n",
    "        for cur_doc in docs:\n",
    "            self.add_doc(cur_doc)\n",
    "\n",
    "    # -------------------------------------------------------------------------\n",
    "    def build_vectors(self) -> None:\n",
    "        \"\"\"\n",
    "        Build the vectors for the documents of the corpus based on the current method.\n",
    "        \"\"\"\n",
    "\n",
    "        if self.method == \"Doc2Vec\":\n",
    "            self.build_doc2vec()\n",
    "        elif self.method == \"Bag of words\":\n",
    "            self.build_bag_of_words()\n",
    "        else:\n",
    "            raise ValueError(\"'\" + self.method + \"': Invalid building method\")\n",
    "\n",
    "    # -------------------------------------------------------------------------\n",
    "    def get_doc_token_matrix(self) -> numpy.ndarray:\n",
    "        \"\"\"\n",
    "        Build a document-token matrix with the weights as values.\n",
    "\n",
    "        :return: Document-token matrix.\n",
    "        \"\"\"\n",
    "\n",
    "        matrix = numpy.zeros(shape=(len(self.docs),self.nb_dims))\n",
    "        for cur_doc in self.docs:\n",
    "            i = 0\n",
    "            for token in cur_doc.vector:\n",
    "                matrix[cur_doc.doc_id, i] = token\n",
    "                i += 1\n",
    "        return matrix\n",
    "\n",
    "    # -------------------------------------------------------------------------\n",
    "    def extract_tokens(self) -> None:\n",
    "        \"\"\"\n",
    "        Extract the tokens from the text of the documents. In practice, for each document, the methods\n",
    "        do the following steps:\n",
    "\n",
    "        1. The text is transform in lowercase.\n",
    "\n",
    "        2. The text is tokenised.\n",
    "\n",
    "        3. Stopwords are removed.\n",
    "\n",
    "        The method words incrementally. Once a document is treated, it will not be re-treated in successive\n",
    "        calls.\n",
    "        \"\"\"\n",
    "\n",
    "        stem = nltk.stem.SnowballStemmer(\"english\")\n",
    "        for cur_doc in self.docs:\n",
    "            if cur_doc.tokens is not None:\n",
    "                continue\n",
    "            cur_doc.tokens = []\n",
    "            text = cur_doc.text.lower()\n",
    "            for extracted_token in nltk.word_tokenize(text):\n",
    "\n",
    "                # Retains only the stem of non stopwords and punctuation\n",
    "                if extracted_token in string.punctuation: continue\n",
    "                if extracted_token in self.stopwords: continue\n",
    "                token_str=stem.stem(extracted_token)\n",
    "\n",
    "                # Find the identifier of the current token in the dictionary\n",
    "                if token_str not in self.tokens.keys():\n",
    "                    token_id = len(self.tokens)\n",
    "                    token = Token(token_id, token_str)\n",
    "                    self.tokens[token_str] = token\n",
    "                    self.ids[token_id] = token\n",
    "                    self.nb_dims = len(self.tokens)\n",
    "                else:\n",
    "                    token = self.tokens[token_str]\n",
    "\n",
    "                # Add the token\n",
    "                cur_doc.tokens.append(token)\n",
    "\n",
    "                # Add a reference count if necessary\n",
    "                if cur_doc.doc_id not in token.docs:\n",
    "                    token.docs.append(cur_doc.doc_id)\n",
    "\n",
    "    # -------------------------------------------------------------------------\n",
    "    def build_bag_of_words(self) -> None:\n",
    "        \"\"\"\n",
    "        Build the vectors of the corpus using the bag of words approach.\n",
    "        \"\"\"\n",
    "\n",
    "        vectors = []\n",
    "        self.extract_tokens()\n",
    "\n",
    "        # Step 1: For each document, compute the relative frequencies of each token (TF).\n",
    "        for cur_doc in self.docs:\n",
    "\n",
    "            vector = dict()  # Dictionary representing a vector of pairs (token_id,nb_occurrences)\n",
    "            nb_occurrences = 0\n",
    "\n",
    "            for token in cur_doc.tokens:\n",
    "                nb_occurrences += 1\n",
    "\n",
    "                # Add an occurrence of the current token in the vector\n",
    "                if token.token_id not in vector.keys():\n",
    "                    vector[token.token_id] = 1\n",
    "                else:\n",
    "                    vector[token.token_id] += 1\n",
    "\n",
    "            # Compute the relative frequencies\n",
    "            for coord in vector:\n",
    "                coord /= float(nb_occurrences)\n",
    "            vectors.append(vector)\n",
    "\n",
    "        # Step 2: Build the vectors by multiplying the relative frequencies by the IDF factor.\n",
    "        for cur_doc in self.docs:\n",
    "            cur_doc.vector = numpy.zeros(shape=self.nb_dims)\n",
    "            vector = vectors[cur_doc.doc_id]\n",
    "            for token_id in vector:\n",
    "                weight = vector[token_id] * self.ids[token_id].get_idf(len(self.docs))\n",
    "                cur_doc.vector[token_id] = weight\n",
    "\n",
    "    # -------------------------------------------------------------------------\n",
    "    def build_doc2vec(self) -> None:\n",
    "        \"\"\"\n",
    "        Build the vectors using the doc2vec approach.\n",
    "        \"\"\"\n",
    "\n",
    "        self.extract_tokens()\n",
    "        corpus = []\n",
    "        for doc in self.docs:\n",
    "            tokens = []\n",
    "            for token in doc.tokens:\n",
    "                tokens.append(token.token)\n",
    "            corpus.append(tokens)\n",
    "        corpus = [\n",
    "            TaggedDocument(words, ['d{}'.format(idx)])\n",
    "            for idx, words in enumerate(corpus)\n",
    "        ]\n",
    "\n",
    "        self.nb_dims = 5\n",
    "        model = Doc2Vec(corpus, vector_size=self.nb_dims, min_count=1)\n",
    "        for i in range(0, len(self.docs)):\n",
    "            self.docs[i].vector = model.docvecs[i]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class TokenSorter:\n",
    "    \"\"\"\n",
    "    Class to sort a list of tokens by a certain value.\n",
    "    |\n",
    "\n",
    "    The instance attributes are:\n",
    "\n",
    "    tokens:\n",
    "        List of tokens to sort.\n",
    "    reverse:\n",
    "        Must the token be ranked descending (False) or ascending (True)\n",
    "    \"\"\"\n",
    "\n",
    "    # -------------------------------------------------------------------------\n",
    "    class TokenRef:\n",
    "        \"\"\"\n",
    "        Class to represent a reference to a token.\n",
    "        \"\"\"\n",
    "\n",
    "        # ---------------------------------------------------------------------\n",
    "        token: Token\n",
    "        value: float\n",
    "\n",
    "        # ---------------------------------------------------------------------\n",
    "        def __init__(self, token: Token, value: float):\n",
    "            self.token = token\n",
    "            self.value = value\n",
    "\n",
    "    # -------------------------------------------------------------------------\n",
    "    tokens: List[TokenRef]\n",
    "    reverse: bool\n",
    "\n",
    "    # -------------------------------------------------------------------------\n",
    "    def __init__(self):\n",
    "        \"\"\"\n",
    "        Constructor.\n",
    "        \"\"\"\n",
    "\n",
    "        self.tokens = []\n",
    "        self.reverse = False\n",
    "\n",
    "    # -------------------------------------------------------------------------\n",
    "    def build(self, tokens, value, reverse: bool) -> None:\n",
    "        \"\"\"\n",
    "        Build the list of token to sort.\n",
    "\n",
    "        :param tokens: Tokens to sort.\n",
    "        :param value: Lambda function that will be used to build the value associated to each token to sort.\n",
    "        :param reverse: Should the token be sorted in descending (True) of ascending (False) order.\n",
    "        \"\"\"\n",
    "\n",
    "        for token in tokens.values():\n",
    "            self.add_token(token, value(token))\n",
    "        self.reverse = reverse\n",
    "        self.sort()\n",
    "\n",
    "    # -------------------------------------------------------------------------\n",
    "    def add_token(self, token: Token, value: float) -> None:\n",
    "        \"\"\"\n",
    "        Add a token to the list.\n",
    "\n",
    "        :param token: Token to add.\n",
    "        :param value: Value that will be used to sort the tokens.\n",
    "        \"\"\"\n",
    "\n",
    "        self.tokens.append(TokenSorter.TokenRef(token=token, value=float(value)))\n",
    "\n",
    "    # -------------------------------------------------------------------------\n",
    "    def sort(self) -> None:\n",
    "        \"\"\"\n",
    "        Sort the tokens.\n",
    "        \"\"\"\n",
    "\n",
    "        self.tokens.sort(reverse=self.reverse, key=lambda token: token.value)\n",
    "\n",
    "    # -------------------------------------------------------------------------\n",
    "    def get_token(self, pos: int) -> str:\n",
    "        \"\"\"\n",
    "        Get a given token of the list.\n",
    "\n",
    "        :param pos: Position of the token in the list.\n",
    "        :return: String representing the token.\n",
    "        \"\"\"\n",
    "\n",
    "        return self.tokens[pos].token.token\n",
    "\n",
    "    # -------------------------------------------------------------------------\n",
    "    def get_value(self, pos: int) -> str:\n",
    "        \"\"\"\n",
    "        Get a value of a given token in the list.\n",
    "\n",
    "        :param pos: Position of the token in the list.\n",
    "        :return: String representing the value of the token used for the sorting.\n",
    "        \"\"\"\n",
    "\n",
    "        return str(self.tokens[pos].value)\n",
    "\n",
    "    # -------------------------------------------------------------------------\n",
    "    def print(self, title: str, nb : int) -> None:\n",
    "        \"\"\"\n",
    "        Print a given number of top ranked tokens with a title and their values.\n",
    "\n",
    "        :param title: Title to print.\n",
    "        :param nb:  Number of tokens to print.\n",
    "        \"\"\"\n",
    "        print(title)\n",
    "        if nb > len(self.tokens):\n",
    "            nb = len(self.tokens)\n",
    "        for i in range(0,nb):\n",
    "            print(\"  Token: '\" + self.get_token(i) + \"' (\" + self.get_value(i) + \")\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def print_matrix(name:str, matrix: numpy.ndarray) -> None:\n",
    "    \"\"\"\n",
    "    Simple method to print a little matrix nicely.\n",
    "\n",
    "    :param name:  Name of the matrix.\n",
    "    :param matrix:  Matrix to print.\n",
    "    \"\"\"\n",
    "    nb_lines = matrix.shape[0]\n",
    "    nb_cols = matrix.shape[1]\n",
    "    spaces = \" \" * (len(name) + 1)\n",
    "    title_line = nb_lines % 2\n",
    "    for i in range(0, nb_lines):\n",
    "        if i == title_line:\n",
    "            print(name + \"=\", end=\"\")\n",
    "        else:\n",
    "            print(spaces, end=\"\")\n",
    "        print(\"( \", end=\"\")\n",
    "        for j in range(0, nb_cols):\n",
    "            print( \"{:.3f}\".format(matrix[i,j]), end=\" \")\n",
    "        print(\")\",)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a corpus instance\n",
    "the_corpus = DocCorpus()\n",
    "\n",
    "# Look for all the files in a directory\n",
    "files = []\n",
    "dir_to_analyse = \"./docs\"\n",
    "for (_, _, file_names) in walk(dir_to_analyse):\n",
    "    files.extend(file_names)\n",
    "    break\n",
    "\n",
    "# Add the context to the corpus\n",
    "for doc_to_analyse in files:\n",
    "    # Treat only files beginning with \"doc\"\n",
    "    if doc_to_analyse[:3] != \"doc\":\n",
    "        continue\n",
    "\n",
    "    filename = dir_to_analyse + sep + doc_to_analyse\n",
    "    file = open(file=filename, mode=\"r\", encoding=\"utf-8\")\n",
    "    the_corpus.add_doc(file.read(), filename)\n",
    "\n",
    "# Extract the tokens\n",
    "the_corpus.extract_tokens()\n",
    "\n",
    "\n",
    "# ----------------------------------------------------------------------------------------------------------------------\n",
    "# Sort the tokens by the number of documents in which they appear\n",
    "sort_by_docs = TokenSorter()\n",
    "sort_by_docs.build(tokens=the_corpus.tokens, value=lambda token: len(token.docs), reverse=True)\n",
    "sort_by_docs.print(title=\"Most appearing tokens (Nb Documents):\",nb=5)\n",
    "\n",
    "# Sort the tokens by their idf factor\n",
    "sort_by_iDF = TokenSorter()\n",
    "sort_by_iDF.build(tokens=the_corpus.tokens, value=lambda token: token.get_idf(len(the_corpus.docs)), reverse=True)\n",
    "sort_by_iDF.print(title=\"Most discriminant  tokens (idf):\",nb=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def corpus_analysis(corpus: DocCorpus, method: str) -> None:\n",
    "    \"\"\"\n",
    "    MAke a little analysis of a corpus.\n",
    "\n",
    "    :param corpus: Corpus to analyse.\n",
    "    :param method: Method to use for the analysis.\n",
    "    \"\"\"\n",
    "    print(\"\\n---- \" + method + \" ----\")\n",
    "    corpus.set_method(method)\n",
    "    corpus.build_vectors()\n",
    "    matrix = corpus.get_doc_token_matrix()\n",
    "    print_matrix(\"Docs\", matrix)\n",
    "    for i in range(0, len(corpus.docs) - 1):\n",
    "        # Take a vector and build a two dimension matrix needed by cosine_similarity\n",
    "        vec1 = matrix[i].reshape(1, -1)\n",
    "\n",
    "        for j in range(i + 1, len(corpus.docs)):\n",
    "            # Take a vector and build a two dimension matrix needed by cosine_similarity\n",
    "            vec2 = matrix[j].reshape(1, -1)\n",
    "\n",
    "            # Compute and display the similarity\n",
    "            print(\"\\tSim(doc\" + str(i) + \",doc\" + str(j) + \")=\" + \"{:.3f}\".format(cosine_similarity(vec1, vec2)[0, 0]))\n",
    "\n",
    "\n",
    "# ----------------------------------------------------------------------------------------------------------------------\n",
    "corpus_analysis(corpus=the_corpus, method=\"Bag of words\")\n",
    "corpus_analysis(corpus=the_corpus, method=\"Doc2Vec\")\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}