diff --git a/c-text-mining/textanalysis.ipynb b/c-text-mining/textanalysis.ipynb deleted file mode 100644 index ee4d16255c538bce246423c8b17bcee820274b25..0000000000000000000000000000000000000000 --- a/c-text-mining/textanalysis.ipynb +++ /dev/null @@ -1,598 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "from typing import Dict, List, Optional\n", - "import nltk\n", - "import math\n", - "import string\n", - "import numpy\n", - "from nltk.corpus import stopwords\n", - "from gensim.models.doc2vec import TaggedDocument, Doc2Vec\n", - "from sklearn.metrics.pairwise import cosine_similarity\n", - "from os import walk, sep" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Ensure the needed nltk resources have been downloaded and are up to date\n", - "nltk.download('punkt')\n", - "nltk.download('stopwords')" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class Token:\n", - " \"\"\"\n", - " Class representing a given token. It stores the string representing the token, its identifier and the identifiers of the\n", - " documents containing it.\n", - "\n", - " |\n", - "\n", - " The instance attributes are:\n", - "\n", - " token_id:\n", - " Identifier of the token.\n", - " token:\n", - " String representing the token.\n", - " docs:\n", - " Identifiers of documents containing the token.\n", - " \"\"\"\n", - "\n", - " # -------------------------------------------------------------------------\n", - " token_id: int\n", - " token: str\n", - " docs: List[int]\n", - "\n", - " # -------------------------------------------------------------------------\n", - " def __init__(self, token_id: int, token: str):\n", - " \"\"\"\n", - " Constructor.\n", - "\n", - " :param token_id: Identifier of the token.\n", - " :param token: String representing the token.\n", - " \"\"\"\n", - " self.token_id = token_id\n", - " self.token = token\n", - " self.docs = []\n", - "\n", - " # -------------------------------------------------------------------------\n", - " def get_idf(self, nb_docs: int) -> float:\n", - " \"\"\"\n", - " Compute the IDF factor of a token.\n", - "\n", - " :param nb_docs: Total number of documents in the corpus.\n", - " :return: IDF factor.\n", - " \"\"\"\n", - "\n", - " if len(self.docs) == 0:\n", - " return 0.0\n", - " return math.log(float(nb_docs) / float(len(self.docs)))\n" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class Doc:\n", - " \"\"\"\n", - " This class represents an instance of a document.\n", - "\n", - " |\n", - "\n", - " The instance attributes are:\n", - "\n", - " url:\n", - " URL of the document (if defined).\n", - " doc_id:\n", - " Identifier of the document.\n", - " text:\n", - " Text of the document to analyse.\n", - " vector:\n", - " Vector representing the document.\n", - " tokens:\n", - " List of tokens in order of appearance.
The same token may appear several times.\n", - " \"\"\"\n", - "\n", - " # -------------------------------------------------------------------------\n", - " url: Optional[str]\n", - " doc_id: int\n", - " text: str\n", - " vector: numpy.ndarray\n", - " tokens: List[Token]\n", - "\n", - " # -------------------------------------------------------------------------\n", - " def __init__(self, doc_id: int, text: str, url: Optional[str] = None):\n", - " \"\"\"\n", - " Constructor.\n", - "\n", - " :param doc_id: Identifier of the document.\n", - " :param text: Text of the document (raw).\n", - " :param url: URL of the document (if any).\n", - " \"\"\"\n", - " self.url = url\n", - " self.doc_id = doc_id\n", - " self.text = text\n", - " self.vector = None\n", - " self.tokens = None\n", - "\n" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class DocCorpus:\n", - " \"\"\"\n", - " This class represents a corpus of documents and the corresponding dictionary of tokens contained.\n", - "\n", - " |\n", - "\n", - " The instance attributes are:\n", - "\n", - " docs:\n", - " List of documents.\n", - " tokens:\n", - " Dictionary of tokens (strings are the key).\n", - " ids:\n", - " Dictionary of tokens (identifiers are the key).\n", - " method:\n", - " String representing the method used for analysing (\"Bag of words\" or \"Doc2Vec\").\n", - " nb_dims:\n", - " Number of dimensions of the semantic space.\n", - " stopwords:\n", - " List of stopwords to eliminate from the analysis. By default, it's the classic English list.\n", - " \"\"\"\n", - "\n", - " # -------------------------------------------------------------------------\n", - " docs: List[Doc]\n", - " tokens: Dict[str, Token]\n", - " ids: Dict[int, Token]\n", - " method: str\n", - " nb_dims: int\n", - " stopwords: List[str]\n", - "\n", - " # -------------------------------------------------------------------------\n", - " def __init__(self):\n", - " \"\"\"\n", - " Constructor.\n", - " \"\"\"\n", - " self.docs = []\n", - " self.tokens = dict()\n", - " self.ids = dict()\n", - " self.method = \"Doc2Vec\"\n", - " self.nb_dims = 0\n", - " self.stopwords = stopwords.words('english')\n", - " print(self.stopwords)\n", - "\n", - " # -------------------------------------------------------------------------\n", - " def set_method(self, name) -> None:\n", - " \"\"\"\n", - " Change the method used to build the vectors.\n", - "\n", - " :param name: Name of the method.\n", - " \"\"\"\n", - " self.method = name\n", - "\n", - " # -------------------------------------------------------------------------\n", - " def add_doc(self, new_doc: str, url: Optional[str] = None) -> None:\n", - " \"\"\"\n", - " Add a string representing a document to the corpus and provide an\n", - " identifier to the document.\n", - "\n", - " :param new_doc: New document.\n", - " :param url: URL of the document (if any)\n", - " \"\"\"\n", - " new_id = len(self.docs)\n", - " self.docs.append(Doc(new_id, new_doc, url))\n", - "\n", - " # -------------------------------------------------------------------------\n", - " def add_docs(self, docs: List[str]) -> None:\n", - " \"\"\"\n", - " Add a list of strings representing documents to the corpus.
Each document receives an\n", - " identifier.\n", - "\n", - " :param docs: List of documents.\n", - " \"\"\"\n", - " for cur_doc in docs:\n", - " self.add_doc(cur_doc)\n", - "\n", - " # -------------------------------------------------------------------------\n", - " def build_vectors(self) -> None:\n", - " \"\"\"\n", - " Build the vectors for the documents of the corpus based on the current method.\n", - " \"\"\"\n", - "\n", - " if self.method == \"Doc2Vec\":\n", - " self.build_doc2vec()\n", - " elif self.method == \"Bag of words\":\n", - " self.build_bag_of_words()\n", - " else:\n", - " raise ValueError(\"'\" + self.method + \"': Invalid building method\")\n", - "\n", - " # -------------------------------------------------------------------------\n", - " def get_doc_token_matrix(self) -> numpy.ndarray:\n", - " \"\"\"\n", - " Build a document-token matrix with the weights as values.\n", - "\n", - " :return: Document-token matrix.\n", - " \"\"\"\n", - "\n", - " matrix = numpy.zeros(shape=(len(self.docs),self.nb_dims))\n", - " for cur_doc in self.docs:\n", - " i = 0\n", - " for token in cur_doc.vector:\n", - " matrix[cur_doc.doc_id, i] = token\n", - " i += 1\n", - " return matrix\n", - "\n", - " # -------------------------------------------------------------------------\n", - " def extract_tokens(self) -> None:\n", - " \"\"\"\n", - " Extract the tokens from the text of the documents. In practice, for each document, the method\n", - " does the following steps:\n", - "\n", - " 1. The text is transformed to lowercase.\n", - "\n", - " 2. The text is tokenised.\n", - "\n", - " 3. Stopwords are removed.\n", - "\n", - " The method works incrementally. Once a document is treated, it will not be re-treated in successive\n", - " calls.\n", - " \"\"\"\n", - "\n", - " stem = nltk.stem.SnowballStemmer(\"english\")\n", - " for cur_doc in self.docs:\n", - " if cur_doc.tokens is not None:\n", - " continue\n", - " cur_doc.tokens = []\n", - " text = cur_doc.text.lower()\n", - " for extracted_token in nltk.word_tokenize(text):\n", - "\n", - " # Keep only the stems of tokens that are neither stopwords nor punctuation\n", - " if extracted_token in string.punctuation: continue\n", - " if extracted_token in self.stopwords: continue\n", - " token_str = stem.stem(extracted_token)\n", - "\n", - " # Find the identifier of the current token in the dictionary\n", - " if token_str not in self.tokens.keys():\n", - " token_id = len(self.tokens)\n", - " token = Token(token_id, token_str)\n", - " self.tokens[token_str] = token\n", - " self.ids[token_id] = token\n", - " self.nb_dims = len(self.tokens)\n", - " else:\n", - " token = self.tokens[token_str]\n", - "\n", - " # Add the token\n", - " cur_doc.tokens.append(token)\n", - "\n", - " # Add a reference to the document if necessary\n", - " if cur_doc.doc_id not in token.docs:\n", - " token.docs.append(cur_doc.doc_id)\n", - "\n", - " # -------------------------------------------------------------------------\n", - " def build_bag_of_words(self) -> None:\n", - " \"\"\"\n", - " Build the vectors of the corpus using the bag of words approach.\n", - " \"\"\"\n", - "\n", - " vectors = []\n", - " self.extract_tokens()\n", - "\n", - " # Step 1: For each document, compute the relative frequencies of each token (TF).\n", - " for cur_doc in self.docs:\n", - "\n", - " vector = dict() # Dictionary representing a vector of pairs (token_id,nb_occurrences)\n", - " nb_occurrences = 0\n", - "\n", - " for token in cur_doc.tokens:\n", - " nb_occurrences += 1\n", - "\n", - " # Add an occurrence of the current token in the
vector\n", - " if token.token_id not in vector.keys():\n", - " vector[token.token_id] = 1\n", - " else:\n", - " vector[token.token_id] += 1\n", - "\n", - " # Compute the relative frequencies\n", - " for coord in vector:\n", - " vector[coord] /= float(nb_occurrences)\n", - " vectors.append(vector)\n", - "\n", - " # Step 2: Build the vectors by multiplying the relative frequencies by the IDF factor.\n", - " for cur_doc in self.docs:\n", - " cur_doc.vector = numpy.zeros(shape=self.nb_dims)\n", - " vector = vectors[cur_doc.doc_id]\n", - " for token_id in vector:\n", - " weight = vector[token_id] * self.ids[token_id].get_idf(len(self.docs))\n", - " cur_doc.vector[token_id] = weight\n", - "\n", - " # -------------------------------------------------------------------------\n", - " def build_doc2vec(self) -> None:\n", - " \"\"\"\n", - " Build the vectors using the doc2vec approach.\n", - " \"\"\"\n", - "\n", - " self.extract_tokens()\n", - " corpus = []\n", - " for doc in self.docs:\n", - " tokens = []\n", - " for token in doc.tokens:\n", - " tokens.append(token.token)\n", - " corpus.append(tokens)\n", - " corpus = [\n", - " TaggedDocument(words, ['d{}'.format(idx)])\n", - " for idx, words in enumerate(corpus)\n", - " ]\n", - "\n", - " self.nb_dims = 5\n", - " model = Doc2Vec(corpus, vector_size=self.nb_dims, min_count=1)\n", - " for i in range(0, len(self.docs)):\n", - " self.docs[i].vector = model.docvecs[i]" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class TokenSorter:\n", - " \"\"\"\n", - " Class to sort a list of tokens by a certain value.\n", - " |\n", - "\n", - " The instance attributes are:\n", - "\n", - " tokens:\n", - " List of tokens to sort.\n", - " reverse:\n", - " Should the tokens be ranked in descending (True) or ascending (False) order.\n", - " \"\"\"\n", - "\n", - " # -------------------------------------------------------------------------\n", - " class TokenRef:\n", - " \"\"\"\n", - " Class to represent a reference to a token.\n", - " \"\"\"\n", - "\n", - " # ---------------------------------------------------------------------\n", - " token: Token\n", - " value: float\n", - "\n", - " # ---------------------------------------------------------------------\n", - " def __init__(self, token: Token, value: float):\n", - " self.token = token\n", - " self.value = value\n", - "\n", - " # -------------------------------------------------------------------------\n", - " tokens: List[TokenRef]\n", - " reverse: bool\n", - "\n", - " # -------------------------------------------------------------------------\n", - " def __init__(self):\n", - " \"\"\"\n", - " Constructor.\n", - " \"\"\"\n", - "\n", - " self.tokens = []\n", - " self.reverse = False\n", - "\n", - " # -------------------------------------------------------------------------\n", - " def build(self, tokens, value, reverse: bool) -> None:\n", - " \"\"\"\n", - " Build the list of tokens to sort.\n", - "\n", - " :param tokens: Tokens to sort.\n", - " :param value: Lambda function that will be used to build the value associated with each token to sort.\n", - " :param reverse: Should the tokens be sorted in descending (True) or ascending (False) order.\n", - " \"\"\"\n", - "\n", - " for token in tokens.values():\n", - " self.add_token(token, value(token))\n", - " self.reverse = reverse\n", - " self.sort()\n", - "\n", - " # -------------------------------------------------------------------------\n", - " def add_token(self, token: Token, value: float) -> None:\n", - " \"\"\"\n", - " Add a
token to the list.\n", - "\n", - " :param token: Token to add.\n", - " :param value: Value that will be used to sort the tokens.\n", - " \"\"\"\n", - "\n", - " self.tokens.append(TokenSorter.TokenRef(token=token, value=float(value)))\n", - "\n", - " # -------------------------------------------------------------------------\n", - " def sort(self) -> None:\n", - " \"\"\"\n", - " Sort the tokens.\n", - " \"\"\"\n", - "\n", - " self.tokens.sort(reverse=self.reverse, key=lambda token: token.value)\n", - "\n", - " # -------------------------------------------------------------------------\n", - " def get_token(self, pos: int) -> str:\n", - " \"\"\"\n", - " Get a given token of the list.\n", - "\n", - " :param pos: Position of the token in the list.\n", - " :return: String representing the token.\n", - " \"\"\"\n", - "\n", - " return self.tokens[pos].token.token\n", - "\n", - " # -------------------------------------------------------------------------\n", - " def get_value(self, pos: int) -> str:\n", - " \"\"\"\n", - " Get the value of a given token in the list.\n", - "\n", - " :param pos: Position of the token in the list.\n", - " :return: String representing the value of the token used for the sorting.\n", - " \"\"\"\n", - "\n", - " return str(self.tokens[pos].value)\n", - "\n", - " # -------------------------------------------------------------------------\n", - " def print(self, title: str, nb: int) -> None:\n", - " \"\"\"\n", - " Print a given number of top ranked tokens with a title and their values.\n", - "\n", - " :param title: Title to print.\n", - " :param nb: Number of tokens to print.\n", - " \"\"\"\n", - " print(title)\n", - " if nb > len(self.tokens):\n", - " nb = len(self.tokens)\n", - " for i in range(0,nb):\n", - " print(\" Token: '\" + self.get_token(i) + \"' (\" + self.get_value(i) + \")\")\n" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def print_matrix(name:str, matrix: numpy.ndarray) -> None:\n", - " \"\"\"\n", - " Simple function to print a small matrix nicely.\n", - "\n", - " :param name: Name of the matrix.\n", - " :param matrix: Matrix to print.\n", - " \"\"\"\n", - " nb_lines = matrix.shape[0]\n", - " nb_cols = matrix.shape[1]\n", - " spaces = \" \" * (len(name) + 1)\n", - " title_line = nb_lines % 2\n", - " for i in range(0, nb_lines):\n", - " if i == title_line:\n", - " print(name + \"=\", end=\"\")\n", - " else:\n", - " print(spaces, end=\"\")\n", - " print(\"( \", end=\"\")\n", - " for j in range(0, nb_cols):\n", - " print( \"{:.3f}\".format(matrix[i,j]), end=\" \")\n", - " print(\")\",)\n", - "\n" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Create a corpus instance\n", - "the_corpus = DocCorpus()\n", - "\n", - "# Look for all the files in a directory\n", - "files = []\n", - "dir_to_analyse = \"./docs\"\n", - "for (_, _, file_names) in walk(dir_to_analyse):\n", - " files.extend(file_names)\n", - " break\n", - "\n", - "# Add the content to the corpus\n", - "for doc_to_analyse in files:\n", - " # Treat only files beginning with \"doc\"\n", - " if doc_to_analyse[:3] != \"doc\":\n", - " continue\n", - "\n", - " filename = dir_to_analyse + sep + doc_to_analyse\n", - " with open(file=filename, mode=\"r\", encoding=\"utf-8\") as file:\n", - " the_corpus.add_doc(file.read(), filename)\n", - "\n", - "# Extract the tokens\n", - "the_corpus.extract_tokens()\n", - "\n", - "\n", - "# 
----------------------------------------------------------------------------------------------------------------------\n", - "# Sort the tokens by the number of documents in which they appear\n", - "sort_by_docs = TokenSorter()\n", - "sort_by_docs.build(tokens=the_corpus.tokens, value=lambda token: len(token.docs), reverse=True)\n", - "sort_by_docs.print(title=\"Most appearing tokens (Nb Documents):\",nb=5)\n", - "\n", - "# Sort the tokens by their idf factor\n", - "sort_by_iDF = TokenSorter()\n", - "sort_by_iDF.build(tokens=the_corpus.tokens, value=lambda token: token.get_idf(len(the_corpus.docs)), reverse=True)\n", - "sort_by_iDF.print(title=\"Most discriminant tokens (idf):\",nb=5)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def corpus_analysis(corpus: DocCorpus, method: str) -> None:\n", - " \"\"\"\n", - " Make a short analysis of a corpus.\n", - "\n", - " :param corpus: Corpus to analyse.\n", - " :param method: Method to use for the analysis.\n", - " \"\"\"\n", - " print(\"\\n---- \" + method + \" ----\")\n", - " corpus.set_method(method)\n", - " corpus.build_vectors()\n", - " matrix = corpus.get_doc_token_matrix()\n", - " print_matrix(\"Docs\", matrix)\n", - " for i in range(0, len(corpus.docs) - 1):\n", - " # Take a vector and build the two-dimensional matrix needed by cosine_similarity\n", - " vec1 = matrix[i].reshape(1, -1)\n", - "\n", - " for j in range(i + 1, len(corpus.docs)):\n", - " # Take a vector and build the two-dimensional matrix needed by cosine_similarity\n", - " vec2 = matrix[j].reshape(1, -1)\n", - "\n", - " # Compute and display the similarity\n", - " print(\"\\tSim(doc\" + str(i) + \",doc\" + str(j) + \")=\" + \"{:.3f}\".format(cosine_similarity(vec1, vec2)[0, 0]))\n", - "\n", - "\n", - "# ----------------------------------------------------------------------------------------------------------------------\n", - "corpus_analysis(corpus=the_corpus, method=\"Bag of words\")\n", - "corpus_analysis(corpus=the_corpus, method=\"Doc2Vec\")\n" ] - } ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -}
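
For reference, the pipeline the deleted notebook implemented (TF-IDF bag-of-words vectors compared with cosine similarity) can also be reproduced with scikit-learn. The sketch below is illustrative only and is not part of the original notebook; the "./docs" directory and the "doc*" file filter are assumptions carried over from the notebook's own corpus-loading cell.

# Standalone sketch: cross-check the notebook's bag-of-words pipeline with scikit-learn.
# Assumes plain-text files whose names start with "doc" under ./docs, as in the notebook.
from pathlib import Path

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Read the documents (same convention as the notebook).
texts = [p.read_text(encoding="utf-8") for p in sorted(Path("./docs").glob("doc*"))]

# TF-IDF weighting with English stopwords removed, analogous to build_bag_of_words().
vectorizer = TfidfVectorizer(stop_words="english")
matrix = vectorizer.fit_transform(texts)

# Pairwise cosine similarities, analogous to corpus_analysis().
sims = cosine_similarity(matrix)
for i in range(len(texts) - 1):
    for j in range(i + 1, len(texts)):
        print("Sim(doc{},doc{})={:.3f}".format(i, j, sims[i, j]))

The weights will not match the notebook's hand-rolled TF-IDF exactly (scikit-learn uses a smoothed IDF, L2 normalisation and no stemming), but the pairwise similarity ranking should be comparable.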