From 03fc9e3c9fde066e1bd26663769e28e33ff20686 Mon Sep 17 00:00:00 2001
From: Adrienucl <adrien.payen@student.uclouvain.be>
Date: Thu, 2 May 2024 10:01:18 +0200
Subject: [PATCH] Merge all branches into main

---
 analytics_small.ipynb | 303 ++---------------
 analytics_test.ipynb  | 449 +++++++++++++++++++++++++
 analytics_tiny.ipynb  | 306 ++---------------
 content_based.ipynb   |  22 +-
 loaders.py            |  51 +++
 models.py             | 181 ++++++++++
 user_based.ipynb      | 762 ++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 1499 insertions(+), 575 deletions(-)
 create mode 100644 analytics_test.ipynb
 create mode 100644 loaders.py
 create mode 100644 models.py
 create mode 100644 user_based.ipynb

diff --git a/analytics_small.ipynb b/analytics_small.ipynb
index b6f7494f..b41000c2 100644
--- a/analytics_small.ipynb
+++ b/analytics_small.ipynb
@@ -6,274 +6,15 @@
    "metadata": {},
    "outputs": [
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Display The Movies : \n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>title</th>\n",
-       "      <th>genres</th>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>movieId</th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>Grumpier Old Men (1995)</td>\n",
-       "      <td>Comedy|Romance</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>15</th>\n",
-       "      <td>Cutthroat Island (1995)</td>\n",
-       "      <td>Action|Adventure|Romance</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>34</th>\n",
-       "      <td>Babe (1995)</td>\n",
-       "      <td>Children|Drama</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>59</th>\n",
-       "      <td>Confessional, The (Confessionnal, Le) (1995)</td>\n",
-       "      <td>Drama|Mystery</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>64</th>\n",
-       "      <td>Two if by Sea (1996)</td>\n",
-       "      <td>Comedy|Romance</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>148652</th>\n",
-       "      <td>The Ridiculous 6 (2015)</td>\n",
-       "      <td>Comedy|Western</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>151307</th>\n",
-       "      <td>The Lovers and the Despot</td>\n",
-       "      <td>(no genres listed)</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>152173</th>\n",
-       "      <td>Michael Jackson's Thriller (1983)</td>\n",
-       "      <td>Horror</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>160440</th>\n",
-       "      <td>The Maid's Room (2014)</td>\n",
-       "      <td>Thriller</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>160656</th>\n",
-       "      <td>Tallulah (2016)</td>\n",
-       "      <td>Drama</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>912 rows × 2 columns</p>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                                                title  \\\n",
-       "movieId                                                 \n",
-       "3                             Grumpier Old Men (1995)   \n",
-       "15                            Cutthroat Island (1995)   \n",
-       "34                                        Babe (1995)   \n",
-       "59       Confessional, The (Confessionnal, Le) (1995)   \n",
-       "64                               Two if by Sea (1996)   \n",
-       "...                                               ...   \n",
-       "148652                        The Ridiculous 6 (2015)   \n",
-       "151307                      The Lovers and the Despot   \n",
-       "152173              Michael Jackson's Thriller (1983)   \n",
-       "160440                         The Maid's Room (2014)   \n",
-       "160656                                Tallulah (2016)   \n",
-       "\n",
-       "                           genres  \n",
-       "movieId                            \n",
-       "3                  Comedy|Romance  \n",
-       "15       Action|Adventure|Romance  \n",
-       "34                 Children|Drama  \n",
-       "59                  Drama|Mystery  \n",
-       "64                 Comedy|Romance  \n",
-       "...                           ...  \n",
-       "148652             Comedy|Western  \n",
-       "151307         (no genres listed)  \n",
-       "152173                     Horror  \n",
-       "160440                   Thriller  \n",
-       "160656                      Drama  \n",
-       "\n",
-       "[912 rows x 2 columns]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Display The Ratings : \n"
+     "ename": "ImportError",
+     "evalue": "cannot import name 'Constant' from 'constants' (/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/constants.py)",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mImportError\u001b[0m                               Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[1], line 12\u001b[0m\n\u001b[1;32m      9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mscipy\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01msparse\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m csr_matrix\n\u001b[1;32m     11\u001b[0m \u001b[38;5;66;03m# Constants and functions\u001b[39;00m\n\u001b[0;32m---> 12\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mconstants\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Constant \u001b[38;5;28;01mas\u001b[39;00m C\n\u001b[1;32m     13\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mloaders\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m load_ratings\n\u001b[1;32m     14\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mloaders\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m load_items\n",
+      "\u001b[0;31mImportError\u001b[0m: cannot import name 'Constant' from 'constants' (/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/constants.py)"
      ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>userId</th>\n",
-       "      <th>movieId</th>\n",
-       "      <th>rating</th>\n",
-       "      <th>timestamp</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>15</td>\n",
-       "      <td>34</td>\n",
-       "      <td>3.0</td>\n",
-       "      <td>997938310</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>15</td>\n",
-       "      <td>95</td>\n",
-       "      <td>1.5</td>\n",
-       "      <td>1093028331</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>15</td>\n",
-       "      <td>101</td>\n",
-       "      <td>4.0</td>\n",
-       "      <td>1134522072</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>15</td>\n",
-       "      <td>123</td>\n",
-       "      <td>4.0</td>\n",
-       "      <td>997938358</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>15</td>\n",
-       "      <td>125</td>\n",
-       "      <td>3.5</td>\n",
-       "      <td>1245362506</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5291</th>\n",
-       "      <td>665</td>\n",
-       "      <td>3908</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>1046967201</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5292</th>\n",
-       "      <td>665</td>\n",
-       "      <td>4052</td>\n",
-       "      <td>4.0</td>\n",
-       "      <td>992838277</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5293</th>\n",
-       "      <td>665</td>\n",
-       "      <td>4351</td>\n",
-       "      <td>4.0</td>\n",
-       "      <td>992837743</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5294</th>\n",
-       "      <td>665</td>\n",
-       "      <td>4643</td>\n",
-       "      <td>4.0</td>\n",
-       "      <td>997239207</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5295</th>\n",
-       "      <td>665</td>\n",
-       "      <td>5502</td>\n",
-       "      <td>4.0</td>\n",
-       "      <td>1046967596</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>5296 rows × 4 columns</p>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "      userId  movieId  rating   timestamp\n",
-       "0         15       34     3.0   997938310\n",
-       "1         15       95     1.5  1093028331\n",
-       "2         15      101     4.0  1134522072\n",
-       "3         15      123     4.0   997938358\n",
-       "4         15      125     3.5  1245362506\n",
-       "...      ...      ...     ...         ...\n",
-       "5291     665     3908     1.0  1046967201\n",
-       "5292     665     4052     4.0   992838277\n",
-       "5293     665     4351     4.0   992837743\n",
-       "5294     665     4643     4.0   997239207\n",
-       "5295     665     5502     4.0  1046967596\n",
-       "\n",
-       "[5296 rows x 4 columns]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
     }
    ],
    "source": [
@@ -311,7 +52,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -330,7 +71,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -355,7 +96,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -407,7 +148,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -426,7 +167,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -445,7 +186,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -465,7 +206,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -488,7 +229,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -511,7 +252,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -545,7 +286,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -575,7 +316,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -608,7 +349,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -625,7 +366,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -688,7 +429,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
diff --git a/analytics_test.ipynb b/analytics_test.ipynb
new file mode 100644
index 00000000..86d849a9
--- /dev/null
+++ b/analytics_test.ipynb
@@ -0,0 +1,449 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "ImportError",
+     "evalue": "cannot import name 'Constant' from 'constants' (/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/constants.py)",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mImportError\u001b[0m                               Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[1], line 13\u001b[0m\n\u001b[1;32m     10\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mlinear_model\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m LinearRegression\n\u001b[1;32m     12\u001b[0m \u001b[38;5;66;03m# Constants and functions\u001b[39;00m\n\u001b[0;32m---> 13\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mconstants\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Constant \u001b[38;5;28;01mas\u001b[39;00m C\n\u001b[1;32m     15\u001b[0m \u001b[38;5;66;03m# We use a pd.read_csv() so importing the loaders is not necessary\u001b[39;00m\n\u001b[1;32m     16\u001b[0m \u001b[38;5;66;03m# from loaders import load_ratings \u001b[39;00m\n\u001b[1;32m     17\u001b[0m \u001b[38;5;66;03m# from loaders import load_items\u001b[39;00m\n\u001b[1;32m     19\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtabulate\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m tabulate\n",
+      "\u001b[0;31mImportError\u001b[0m: cannot import name 'Constant' from 'constants' (/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/constants.py)"
+     ]
+    }
+   ],
+   "source": [
+    "# Automatically reload modules before executing code\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2\n",
+    "\n",
+    "# Third-party imports\n",
+    "import numpy as np \n",
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "from scipy.sparse import csr_matrix\n",
+    "from sklearn.linear_model import LinearRegression\n",
+    "\n",
+    "# Constants and functions\n",
+    "from constants import Constant as C\n",
+    "\n",
+    "# We use a pd.read_csv() so importing the loaders is not necessary\n",
+    "# from loaders import load_ratings \n",
+    "# from loaders import load_items\n",
+    "\n",
+    "from tabulate import tabulate\n",
+    "\n",
+    "# Read the movies CSV into the DataFrame df_movies\n",
+    "df_movies = pd.read_csv(\"../data/test/content/movies.csv\")\n",
+    "\n",
+    "# Display the DataFrame\n",
+    "print(\"Display The Movies : \")\n",
+    "display(df_movies)\n",
+    "\n",
+    "# Read the ratings CSV into the DataFrame df_ratings\n",
+    "df_ratings = pd.read_csv(\"../data/test/evidence/ratings.csv\")\n",
+    "\n",
+    "# Display the DataFrame\n",
+    "print(\"Display The Ratings : \")\n",
+    "display(df_ratings)\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Number of movies: 10\n"
+     ]
+    }
+   ],
+   "source": [
+    "# NUMBER OF MOVIES\n",
+    "n_movies = df_movies['title'].nunique()\n",
+    "print(f\"Number of movies: {n_movies}\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Minimum range: 1973\n",
+      "Maximum range: 2002\n"
+     ]
+    }
+   ],
+   "source": [
+    "# THE YEAR RANGE\n",
+    "df_movies['annee'] = df_movies['title'].str.extract(r'\\((.{4})\\)')\n",
+    "df_movies['annee'] = pd.to_numeric(df_movies['annee'], errors='coerce')\n",
+    "\n",
+    "min_range = int(df_movies['annee'].min())\n",
+    "max_range = int(df_movies['annee'].max())\n",
+    "print(\"Minimum range:\", min_range)\n",
+    "print(\"Maximum range:\", max_range)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "List of all genres:\n",
+      "Action     |\n",
+      "Adventure  |\n",
+      "Animation  |\n",
+      "Children   |\n",
+      "Comedy     |\n",
+      "Drama      |\n",
+      "Fantasy    |\n",
+      "Horror     |\n",
+      "IMAX       |\n",
+      "Musical    |\n",
+      "Mystery    |\n",
+      "Romance    |\n",
+      "Sci-Fi     |\n",
+      "War        |\n"
+     ]
+    }
+   ],
+   "source": [
+    "# LIST OF MOVIE GENRES\n",
+    "def tabulate_genres(df_movies):\n",
+    "    \"\"\"Tabulate list of movie genres.\"\"\"\n",
+    "    # Split genres and explode\n",
+    "    df_movies['genres'] = df_movies['genres'].str.split('|')\n",
+    "    df_movies = df_movies.explode('genres')\n",
+    "    unique_genres = sorted(df_movies['genres'].unique())\n",
+    "\n",
+    "    # Tabulate\n",
+    "    print(\"\\nList of all genres:\")\n",
+    "    genres_table = [[genre, \"|\"] for genre in unique_genres]\n",
+    "    print(tabulate(genres_table, tablefmt=\"plain\", numalign=\"left\"))\n",
+    "\n",
+    "# Call the tabulate_genres function\n",
+    "tabulate_genres(df_movies)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Number of ratings: 30\n"
+     ]
+    }
+   ],
+   "source": [
+    "# THE TOTAL NUMBER OF RATINGS\n",
+    "n_ratings = df_ratings['rating'].count()\n",
+    "print(f\"Number of ratings: {n_ratings}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Number of users: 6\n"
+     ]
+    }
+   ],
+   "source": [
+    "# THE NUMBER OF UNIQUE USERS\n",
+    "n_users = df_ratings['userId'].nunique()\n",
+    "print(f\"Number of users: {n_users}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Number of unique movies : 10\n"
+     ]
+    }
+   ],
+   "source": [
+    "# THE NUMBER OF UNIQUE MOVIES (IN THE RATING MATRIX)\n",
+    "unique_movies = df_ratings[\"movieId\"].unique()\n",
+    "num_unique_movies = len(unique_movies)\n",
+    "print(f\"Number of unique movies : {num_unique_movies}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Number of ratings of the most rated movie(s): 4\n"
+     ]
+    }
+   ],
+   "source": [
+    "# THE NUMBER OF RATINGS OF THE MOST RATED MOVIES\n",
+    "def most_rated_movies_ratings_count(df_ratings):\n",
+    "    movie_ratings_count = df_ratings.groupby('movieId')['rating'].count()\n",
+    "    most_rated_movies = movie_ratings_count[movie_ratings_count == movie_ratings_count.max()]\n",
+    "    print(f\"Number of ratings of the most rated movie(s): {most_rated_movies.max()}\")\n",
+    "\n",
+    "most_rated_movies_ratings_count(df_ratings)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Number of ratings of the least rated movie(s): 2\n"
+     ]
+    }
+   ],
+   "source": [
+    "# THE NUMBER OF RATINGS OF THE LEAST RATED MOVIES\n",
+    "def least_rated_movies_ratings_count(df_ratings):\n",
+    "    movie_ratings_count = df_ratings.groupby('movieId')['rating'].count()\n",
+    "    least_rated_movies = movie_ratings_count[movie_ratings_count == movie_ratings_count.min()]\n",
+    "    print(\"Number of ratings of the least rated movie(s):\", least_rated_movies.min())\n",
+    "\n",
+    "least_rated_movies_ratings_count(df_ratings)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "All possible rating values, from smallest to highest:\n",
+      "1.0\n",
+      "1.5\n",
+      "2.0\n",
+      "2.5\n",
+      "3.0\n",
+      "4.0\n",
+      "4.5\n",
+      "5.0\n"
+     ]
+    }
+   ],
+   "source": [
+    "# ALL THE POSSIBLE RATING VALUES, FROM THE SMALLEST VALUE TO THE HIGHEST VALUE\n",
+    "def all_possible_ratings(df_ratings):\n",
+    "    rating_values = sorted(df_ratings['rating'].unique())\n",
+    "    print(\"All possible rating values, from smallest to highest:\")\n",
+    "    for rating in rating_values:\n",
+    "        print(rating)\n",
+    "\n",
+    "all_possible_ratings(df_ratings)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Number of movies that were not rated at all: 10\n"
+     ]
+    }
+   ],
+   "source": [
+    "# THE NUMBER OF MOVIES THAT WERE NOT RATED AT ALL\n",
+    "def unrated_movies_count(df_ratings, df_movies):\n",
+    "    rated_movies = df_ratings['movieId'].unique() if 'movieId' in df_ratings.columns else []\n",
+    "    unrated_movies_count = df_movies[~df_movies.index.isin(rated_movies)].shape[0]\n",
+    "    print(\"Number of movies that were not rated at all:\", unrated_movies_count)\n",
+    "\n",
+    "unrated_movies_count(df_ratings, df_movies)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "LONG-TAIL PROPERTY"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": "",
+      "text/plain": [
+       "<Figure size 2000x600 with 1 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# Rating Frequency Distribution\n",
+    "merged_df = pd.merge(df_ratings,df_movies, on='movieId')\n",
+    "rating_counts = merged_df['movieId'].value_counts()\n",
+    "value_counts = rating_counts.value_counts().sort_index()\n",
+    "\n",
+    "plt.figure(figsize=(20, 6))\n",
+    "plt.plot(value_counts.values, value_counts.index, marker='o', color='skyblue', linestyle='-')  # Swap x and y arguments\n",
+    "plt.title('Rating Frequency Distribution')\n",
+    "plt.xlabel('Number of Movies')  # Update x-label\n",
+    "plt.ylabel('Number of Ratings')  # Update y-label\n",
+    "plt.xticks(rotation=45)\n",
+    "plt.grid(axis='x', linestyle='--', alpha=0.7)  # Change grid to x-axis\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": "",
+      "text/plain": [
+       "<Figure size 800x600 with 1 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "def create_X(df):\n",
+    "    \"\"\"\n",
+    "    Generates a sparse matrix from ratings dataframe.\n",
+    "\n",
+    "    Args:\n",
+    "        df: pandas dataframe containing 3 columns (userId, movieId, rating)\n",
+    "\n",
+    "    Returns:\n",
+    "        X: sparse matrix\n",
+    "        user_mapper: dict that maps user id's to user indices\n",
+    "        movie_mapper: dict that maps movie id's to movie indices\n",
+    "        user_inv_mapper: dict that maps user indices to user id's\n",
+    "        movie_inv_mapper: dict that maps movie indices to movie id's\n",
+    "    \"\"\"\n",
+    "    M = df['userId'].nunique()\n",
+    "    N = df['movieId'].nunique()\n",
+    "\n",
+    "    user_mapper = dict(zip(np.unique(df[\"userId\"]), list(range(M))))\n",
+    "    movie_mapper = dict(zip(np.unique(df[\"movieId\"]), list(range(N))))\n",
+    "\n",
+    "    user_inv_mapper = dict(zip(list(range(M)), np.unique(df[\"userId\"])))\n",
+    "    movie_inv_mapper = dict(zip(list(range(N)), np.unique(df[\"movieId\"])))\n",
+    "\n",
+    "    user_index = [user_mapper[i] for i in df['userId']]\n",
+    "    item_index = [movie_mapper[i] for i in df['movieId']]\n",
+    "\n",
+    "    X = csr_matrix((df[\"rating\"], (user_index,item_index)), shape=(M,N))\n",
+    "\n",
+    "    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper\n",
+    "\n",
+    "# Assuming df_ratings contains your ratings dataframe\n",
+    "\n",
+    "X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_X(df_ratings)\n",
+    "\n",
+    "\n",
+    "# Plot the non-zero values of the sparse matrix\n",
+    "plt.figure(figsize=(8, 6))\n",
+    "plt.spy(X, markersize=1)\n",
+    "plt.title('Non-zero values of a sparse matrix')\n",
+    "plt.xlabel('Movie Index')\n",
+    "plt.ylabel('User Index')\n",
+    "plt.show()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Matrix sparsity: 50.0%\n"
+     ]
+    }
+   ],
+   "source": [
+    "n_total = X.shape[0]*X.shape[1]\n",
+    "n_ratings = X.nnz\n",
+    "sparsity = n_ratings/n_total\n",
+    "print(f\"Matrix sparsity: {round(sparsity*100,2)}%\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/analytics_tiny.ipynb b/analytics_tiny.ipynb
index cbd97046..ccebba37 100644
--- a/analytics_tiny.ipynb
+++ b/analytics_tiny.ipynb
@@ -6,279 +6,15 @@
    "metadata": {},
    "outputs": [
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Display The Movies : \n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>movieId</th>\n",
-       "      <th>title</th>\n",
-       "      <th>genres</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>3</td>\n",
-       "      <td>Grumpier Old Men (1995)</td>\n",
-       "      <td>Comedy|Romance</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>15</td>\n",
-       "      <td>Cutthroat Island (1995)</td>\n",
-       "      <td>Action|Adventure|Romance</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>34</td>\n",
-       "      <td>Babe (1995)</td>\n",
-       "      <td>Children|Drama</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>59</td>\n",
-       "      <td>Confessional, The (Confessionnal, Le) (1995)</td>\n",
-       "      <td>Drama|Mystery</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>64</td>\n",
-       "      <td>Two if by Sea (1996)</td>\n",
-       "      <td>Comedy|Romance</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>907</th>\n",
-       "      <td>148652</td>\n",
-       "      <td>The Ridiculous 6 (2015)</td>\n",
-       "      <td>Comedy|Western</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>908</th>\n",
-       "      <td>151307</td>\n",
-       "      <td>The Lovers and the Despot</td>\n",
-       "      <td>(no genres listed)</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>909</th>\n",
-       "      <td>152173</td>\n",
-       "      <td>Michael Jackson's Thriller (1983)</td>\n",
-       "      <td>Horror</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>910</th>\n",
-       "      <td>160440</td>\n",
-       "      <td>The Maid's Room (2014)</td>\n",
-       "      <td>Thriller</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>911</th>\n",
-       "      <td>160656</td>\n",
-       "      <td>Tallulah (2016)</td>\n",
-       "      <td>Drama</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>912 rows × 3 columns</p>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "     movieId                                         title  \\\n",
-       "0          3                       Grumpier Old Men (1995)   \n",
-       "1         15                       Cutthroat Island (1995)   \n",
-       "2         34                                   Babe (1995)   \n",
-       "3         59  Confessional, The (Confessionnal, Le) (1995)   \n",
-       "4         64                          Two if by Sea (1996)   \n",
-       "..       ...                                           ...   \n",
-       "907   148652                       The Ridiculous 6 (2015)   \n",
-       "908   151307                     The Lovers and the Despot   \n",
-       "909   152173             Michael Jackson's Thriller (1983)   \n",
-       "910   160440                        The Maid's Room (2014)   \n",
-       "911   160656                               Tallulah (2016)   \n",
-       "\n",
-       "                       genres  \n",
-       "0              Comedy|Romance  \n",
-       "1    Action|Adventure|Romance  \n",
-       "2              Children|Drama  \n",
-       "3               Drama|Mystery  \n",
-       "4              Comedy|Romance  \n",
-       "..                        ...  \n",
-       "907            Comedy|Western  \n",
-       "908        (no genres listed)  \n",
-       "909                    Horror  \n",
-       "910                  Thriller  \n",
-       "911                     Drama  \n",
-       "\n",
-       "[912 rows x 3 columns]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Display The Ratings : \n"
+     "ename": "ImportError",
+     "evalue": "cannot import name 'Constant' from 'constants' (/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/constants.py)",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mImportError\u001b[0m                               Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[1], line 13\u001b[0m\n\u001b[1;32m     10\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mlinear_model\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m LinearRegression\n\u001b[1;32m     12\u001b[0m \u001b[38;5;66;03m# Constants and functions\u001b[39;00m\n\u001b[0;32m---> 13\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mconstants\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Constant \u001b[38;5;28;01mas\u001b[39;00m C\n\u001b[1;32m     15\u001b[0m \u001b[38;5;66;03m# We use a pd.read_csv() so importing the loaders is not necessary\u001b[39;00m\n\u001b[1;32m     16\u001b[0m \u001b[38;5;66;03m# from loaders import load_ratings \u001b[39;00m\n\u001b[1;32m     17\u001b[0m \u001b[38;5;66;03m# from loaders import load_items\u001b[39;00m\n\u001b[1;32m     19\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtabulate\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m tabulate\n",
+      "\u001b[0;31mImportError\u001b[0m: cannot import name 'Constant' from 'constants' (/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/constants.py)"
      ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>userId</th>\n",
-       "      <th>movieId</th>\n",
-       "      <th>rating</th>\n",
-       "      <th>timestamp</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>15</td>\n",
-       "      <td>34</td>\n",
-       "      <td>3.0</td>\n",
-       "      <td>997938310</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>15</td>\n",
-       "      <td>95</td>\n",
-       "      <td>1.5</td>\n",
-       "      <td>1093028331</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>15</td>\n",
-       "      <td>101</td>\n",
-       "      <td>4.0</td>\n",
-       "      <td>1134522072</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>15</td>\n",
-       "      <td>123</td>\n",
-       "      <td>4.0</td>\n",
-       "      <td>997938358</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>15</td>\n",
-       "      <td>125</td>\n",
-       "      <td>3.5</td>\n",
-       "      <td>1245362506</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5291</th>\n",
-       "      <td>665</td>\n",
-       "      <td>3908</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>1046967201</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5292</th>\n",
-       "      <td>665</td>\n",
-       "      <td>4052</td>\n",
-       "      <td>4.0</td>\n",
-       "      <td>992838277</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5293</th>\n",
-       "      <td>665</td>\n",
-       "      <td>4351</td>\n",
-       "      <td>4.0</td>\n",
-       "      <td>992837743</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5294</th>\n",
-       "      <td>665</td>\n",
-       "      <td>4643</td>\n",
-       "      <td>4.0</td>\n",
-       "      <td>997239207</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5295</th>\n",
-       "      <td>665</td>\n",
-       "      <td>5502</td>\n",
-       "      <td>4.0</td>\n",
-       "      <td>1046967596</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>5296 rows × 4 columns</p>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "      userId  movieId  rating   timestamp\n",
-       "0         15       34     3.0   997938310\n",
-       "1         15       95     1.5  1093028331\n",
-       "2         15      101     4.0  1134522072\n",
-       "3         15      123     4.0   997938358\n",
-       "4         15      125     3.5  1245362506\n",
-       "...      ...      ...     ...         ...\n",
-       "5291     665     3908     1.0  1046967201\n",
-       "5292     665     4052     4.0   992838277\n",
-       "5293     665     4351     4.0   992837743\n",
-       "5294     665     4643     4.0   997239207\n",
-       "5295     665     5502     4.0  1046967596\n",
-       "\n",
-       "[5296 rows x 4 columns]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
     }
    ],
    "source": [
@@ -320,7 +56,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -339,7 +75,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -364,7 +100,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -416,7 +152,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -435,7 +171,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -454,7 +190,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -474,7 +210,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -497,7 +233,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -520,7 +256,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -554,7 +290,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -584,7 +320,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -617,7 +353,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -680,7 +416,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
diff --git a/content_based.ipynb b/content_based.ipynb
index da98e312..df2d2bef 100644
--- a/content_based.ipynb
+++ b/content_based.ipynb
@@ -10,16 +10,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 1,
    "id": "277473a3",
    "metadata": {},
    "outputs": [
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "The autoreload extension is already loaded. To reload it, use:\n",
-      "  %reload_ext autoreload\n"
+     "ename": "ImportError",
+     "evalue": "cannot import name 'Constant' from 'constants' (/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/constants.py)",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mImportError\u001b[0m                               Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[1], line 10\u001b[0m\n\u001b[1;32m      7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msurprise\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m AlgoBase\n\u001b[1;32m      8\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msurprise\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mprediction_algorithms\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpredictions\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m PredictionImpossible\n\u001b[0;32m---> 10\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mloaders\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m load_ratings\n\u001b[1;32m     11\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mloaders\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m load_items\n\u001b[1;32m     12\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mconstants\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Constant \u001b[38;5;28;01mas\u001b[39;00m C\n",
+      "File \u001b[0;32m~/vscodeworkspace/recomsys/loaders.py:7\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mos\u001b[39;00m\n\u001b[1;32m      6\u001b[0m \u001b[38;5;66;03m# Local imports\u001b[39;00m\n\u001b[0;32m----> 7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mconstants\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Constant \u001b[38;5;28;01mas\u001b[39;00m C\n\u001b[1;32m      8\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msurprise\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Reader, Dataset\n\u001b[1;32m     10\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mload_ratings\u001b[39m(surprise_format\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m):\n",
+      "\u001b[0;31mImportError\u001b[0m: cannot import name 'Constant' from 'constants' (/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/constants.py)"
      ]
     }
    ],
@@ -50,7 +54,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": null,
    "id": "e8378976",
    "metadata": {},
    "outputs": [
@@ -143,7 +147,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": null,
    "id": "16b0a602",
    "metadata": {},
    "outputs": [],
@@ -245,7 +249,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": null,
    "id": "69d12f7d",
    "metadata": {},
    "outputs": [
diff --git a/loaders.py b/loaders.py
new file mode 100644
index 00000000..d4cc224f
--- /dev/null
+++ b/loaders.py
@@ -0,0 +1,51 @@
+# Third-party imports
+import pandas as pd
+import os
+
+
+# Local imports
+from constants import Constant as C
+from surprise import Reader, Dataset
+
+def load_ratings(surprise_format=False):
+    """Read the ratings CSV located at C.EVIDENCE_PATH / C.RATINGS_FILENAME.
+
+    Parameters:
+        surprise_format (bool): If True, wrap the ratings in a Surprise Dataset.
+
+    Returns:
+        DataFrame or Surprise Dataset: The ratings, in the requested format.
+    """
+    ratings = pd.read_csv(C.EVIDENCE_PATH / C.RATINGS_FILENAME)
+    if not surprise_format:
+        return ratings
+    # Rating bounds come from C.RATINGS_SCALE (original note: 0.5 is the lowest rating).
+    scale_reader = Reader(rating_scale=C.RATINGS_SCALE)
+    triplets = ratings[['userId', 'movieId', 'rating']]
+    return Dataset.load_from_df(triplets, scale_reader)
+
+
+def load_items():
+    """Read the items CSV and index it by the item-id column.
+
+    Returns:
+        DataFrame: Content of the items file, indexed by C.ITEM_ID_COL.
+    """
+    items_path = C.CONTENT_PATH / C.ITEMS_FILENAME
+    # Index by item id (the movie id) so rows can be looked up by that id.
+    return pd.read_csv(items_path).set_index(C.ITEM_ID_COL)
+
+def export_evaluation_report(report):
+    """Write the evaluation report to a dated CSV under data/tiny/evaluations.
+
+    Parameters:
+        report (DataFrame): Evaluation report; written without its index.
+    Returns:
+        The same report object that was passed in, unchanged.
+    """
+    report_name = f"evaluation_report_{pd.Timestamp.now().strftime('%Y-%m-%d')}.csv"
+    export_dir = os.path.join("data", "tiny", "evaluations")
+    os.makedirs(export_dir, exist_ok=True)  # to_csv does not create missing folders
+    report.to_csv(os.path.join(export_dir, report_name), index=False)
+    print("The data has been exported to the evaluation report")
+    return report
\ No newline at end of file
diff --git a/models.py b/models.py
new file mode 100644
index 00000000..c288a5b8
--- /dev/null
+++ b/models.py
@@ -0,0 +1,181 @@
+# standard library imports
+from collections import defaultdict
+
+# third parties imports
+import pandas as pd
+import numpy as np
+import random as rd
+from surprise import AlgoBase, SVD, KNNWithMeans
+from surprise import PredictionImpossible
+
+# import local
+from loaders import load_items, load_ratings
+from constants import Constant as C
+from sklearn.linear_model import LinearRegression
+
+
+def get_top_n(predictions, n=10):
+    """Return the top-N recommendation for each user from a set of predictions.
+    Source: inspired by https://github.com/NicolasHug/Surprise/blob/master/examples/top_n_recommendations.py
+    and modified by cvandekerckh for random tie breaking
+
+    Args:
+        predictions(list of Prediction objects): The list of predictions, as
+            returned by the test method of an algorithm.
+        n(int): The number of recommendation to output for each user. Default
+            is 10.
+    Returns:
+    A dict where keys are user (raw) ids and values are lists of tuples:
+        [(raw item id, rating estimation), ...] of at most n elements.
+    """
+
+    # Fixed seed: the tie-breaking shuffle below is reproducible across calls.
+    rd.seed(0)
+    # First map the predictions to each user.
+    top_n = defaultdict(list)
+    for uid, iid, _true_r, est, _ in predictions:
+        top_n[uid].append((iid, est))
+
+    # Shuffle, then stable-sort by estimate so that equal estimates end up in
+    # random order; keep only the n highest ones.
+    for uid, user_ratings in top_n.items():
+        rd.shuffle(user_ratings)
+        user_ratings.sort(key=lambda x: x[1], reverse=True)
+        top_n[uid] = user_ratings[:n]
+    return top_n
+
+
+class ModelBaseline1(AlgoBase):
+    """First algorithm: predicts the constant rating 2 for every (user, item) pair."""
+    def __init__(self):
+        super().__init__()
+
+    def estimate(self, u, i):
+        return 2
+
+
+class ModelBaseline2(AlgoBase):  # Second algorithm: uniform random prediction
+    def __init__(self):
+        AlgoBase.__init__(self)
+
+    def fit(self, trainset):
+        AlgoBase.fit(self, trainset)
+        rd.seed(0)  # reseed at each fit so the random estimates are reproducible
+        return self  # same contract as AlgoBase.fit; allows algo.fit(...).test(...)
+
+    def estimate(self, u, i):  # draws uniformly within the trainset's rating scale
+        return rd.uniform(self.trainset.rating_scale[0], self.trainset.rating_scale[1])
+
+
+class ModelBaseline3(AlgoBase):
+    """Third algorithm: predicts the mean of all training ratings for every pair."""
+    def __init__(self):
+        super().__init__()
+
+    def fit(self, trainset):
+        super().fit(trainset)
+        all_values = [rating for _, _, rating in self.trainset.all_ratings()]
+        self.the_mean = np.mean(all_values)
+        return self
+
+    def estimate(self, u, i):
+        return self.the_mean
+
+
+class ModelBaseline4(SVD):
+    """Fourth model: Surprise SVD configured with 100 latent factors."""
+    def __init__(self):
+        super().__init__(n_factors=100)
+
+
+class ContentBased(AlgoBase):
+    """Content-based recommender: item features plus one learned profile per user.
+
+    features_method selects how item features are built (None or "title_length");
+    regressor_method selects the profile ('random_score', 'random_sample' or
+    'linear_regression'). Any other regressor makes estimate return None.
+    """
+
+    def __init__(self, features_method, regressor_method):
+        AlgoBase.__init__(self)
+        self.regressor_method = regressor_method
+        self.content_features = self.create_content_features(features_method)
+
+    def create_content_features(self, features_method):
+        """Content Analyzer: return the item-feature DataFrame (None if no method)."""
+        df_items = load_items()
+        if features_method is None:
+            df_features = None
+        elif features_method == "title_length": # a naive method that creates only 1 feature based on title length
+            df_features = df_items[C.LABEL_COL].apply(len).to_frame('n_character_title')
+        else: # (implement other feature creations here)
+            raise NotImplementedError(f'Feature method {features_method} not yet implemented')
+        return df_features
+
+    def fit(self, trainset):
+        """Profile Learner: build self.user_profile[u] for every inner user id.
+
+        Returns self so calls can be chained: algo.fit(trainset).test(testset).
+        """
+        AlgoBase.fit(self, trainset)
+
+        # Preallocate user profiles
+        self.user_profile = {u: None for u in trainset.all_users()}
+
+        if self.regressor_method == 'random_score':
+            for u in self.user_profile:
+                self.user_profile[u] = rd.uniform(0.5, 5)
+
+        elif self.regressor_method == 'random_sample':
+            # Profile = the user's own training ratings; estimate samples from it.
+            for u in self.user_profile:
+                self.user_profile[u] = [rating for _, rating in self.trainset.ur[u]]
+
+        elif self.regressor_method == 'linear_regression':
+            feature_cols = list(self.content_features.columns)
+            for u in self.user_profile:
+                item_ids = [iid for iid, _ in trainset.ur[u]]
+                user_ratings = [rating for _, rating in trainset.ur[u]]
+                df_user = pd.DataFrame({'item_id': item_ids, 'user_ratings': user_ratings})
+                # Features are indexed by raw item id, so convert inner ids first.
+                df_user["item_id"] = df_user["item_id"].map(trainset.to_raw_iid)
+                df_user = df_user.merge(self.content_features, left_on="item_id", right_index=True, how='left')
+
+                # Use every feature column, matching what estimate feeds to predict.
+                X = df_user[feature_cols].values
+                y = df_user['user_ratings'].values
+
+                linear_regressor = LinearRegression(fit_intercept=False)
+                linear_regressor.fit(X, y)
+
+                # Store the computed user profile
+                self.user_profile[u] = linear_regressor
+        # (implement here the fitting of other regressors)
+
+        return self
+
+    def estimate(self, u, i):
+        """Scoring component used for item filtering"""
+        # First, handle cases for unknown users and items
+        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
+            raise PredictionImpossible('User and/or item is unknown.')
+
+        if self.regressor_method == 'random_score':
+            rd.seed()
+            score = rd.uniform(0.5, 5)
+
+        elif self.regressor_method == 'random_sample':
+            rd.seed()
+            score = rd.choice(self.user_profile[u])
+
+        elif self.regressor_method == 'linear_regression':
+            raw_item_id = self.trainset.to_raw_iid(i)
+            # Slicing by label (rather than .loc[id]) keeps a 2-D array for predict.
+            item_features = self.content_features.loc[raw_item_id:raw_item_id, :].values
+            score = self.user_profile[u].predict(item_features)[0]
+        else:
+            score = None  # (implement here the regressor prediction)
+
+        return score
+
+
diff --git a/user_based.ipynb b/user_based.ipynb
new file mode 100644
index 00000000..28f2623a
--- /dev/null
+++ b/user_based.ipynb
@@ -0,0 +1,762 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "f4a8f664",
+   "metadata": {},
+   "source": [
+    "# Custom User-based Model\n",
+    "This notebook aims at creating a UserBased class that inherits from the AlgoBase class (surprise package) and that can be customized with various similarity metrics, peer groups and score aggregation functions. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "00d1b249",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "ImportError",
+     "evalue": "cannot import name 'Constant' from 'constants' (/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/constants.py)",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mImportError\u001b[0m                               Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[1], line 14\u001b[0m\n\u001b[1;32m     10\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpd\u001b[39;00m\n\u001b[1;32m     11\u001b[0m \u001b[38;5;66;03m# -- add new imports here --\u001b[39;00m\n\u001b[1;32m     12\u001b[0m \n\u001b[1;32m     13\u001b[0m \u001b[38;5;66;03m# local imports\u001b[39;00m\n\u001b[0;32m---> 14\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mconstants\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Constant \u001b[38;5;28;01mas\u001b[39;00m C\n\u001b[1;32m     15\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mloaders\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m load_ratings,load_items \n\u001b[1;32m     16\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msurprise\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m KNNWithMeans, accuracy, AlgoBase, PredictionImpossible\n",
+      "\u001b[0;31mImportError\u001b[0m: cannot import name 'Constant' from 'constants' (/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/constants.py)"
+     ]
+    }
+   ],
+   "source": [
+    "# reloads modules automatically before entering the execution of code\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2\n",
+    "\n",
+    "# standard library imports\n",
+    "# -- add new imports here --\n",
+    "\n",
+    "# third parties imports\n",
+    "import numpy as np \n",
+    "import pandas as pd\n",
+    "# -- add new imports here --\n",
+    "\n",
+    "# local imports\n",
+    "from constants import Constant as C\n",
+    "from loaders import load_ratings,load_items \n",
+    "from surprise import KNNWithMeans, accuracy, AlgoBase, PredictionImpossible\n",
+    "\n",
+    "import heapq"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "22716aa3",
+   "metadata": {},
+   "source": [
+    "# 1. Loading Data\n",
+    "Prepare a dataset to help implement a user-based recommender system"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "aafd1712",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Computing the msd similarity matrix...\n",
+      "Done computing similarity matrix.\n",
+      "user: 11         item: 364        r_ui = 4.00   est = 3.42   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}\n"
+     ]
+    }
+   ],
+   "source": [
+    "\n",
+    "# Create Surprise Dataset from the pandas DataFrame and Reader\n",
+    "surprise_data = load_ratings(surprise_format=True)\n",
+    "\n",
+    "trainset = surprise_data.build_full_trainset()\n",
+    "\n",
+    "\n",
+    "testset = trainset.build_anti_testset()\n",
+    "\n",
+    "\n",
+    "sim_options = {\n",
+    "    'name': 'msd',  # Mean Squared Difference (Mean Square Error)\n",
+    "    'user_based': True,  # User-based collaborative filtering\n",
+    "    'min_support': 3  # Minimum number of common ratings required\n",
+    "}\n",
+    "\n",
+    "\n",
+    "# Build an algorithm, and train it.\n",
+    "algo = KNNWithMeans(sim_options=sim_options, k=3, min_k=2)\n",
+    "algo.fit(trainset)\n",
+    "algo.test(testset)\n",
+    "\n",
+    "\n",
+    "uid = str(11)  # raw user id (as in the ratings file). They are **strings**!\n",
+    "iid = str(364) \n",
+    "\n",
+    "pred = algo.predict(uid, iid, r_ui=4, verbose=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cf3ccdc0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# -- load data, build trainset and anti testset --\n",
+    "# it depends on the tiny dataset\n",
+    "surprise_data = load_ratings(surprise_format=True)\n",
+    "df_movies = load_items()\n",
+    "\n",
+    "# surprise_data is already a Surprise Dataset (load_ratings was called with surprise_format=True)\n",
+    "\n",
+    "# Build train set with all available ratings\n",
+    "trainset = surprise_data.build_full_trainset()\n",
+    "\n",
+    "# Build anti-test set\n",
+    "testset = trainset.build_anti_testset()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "94adf3a6",
+   "metadata": {},
+   "source": [
+    "# 2. Explore Surprise's user-based algorithm\n",
+    "Display user-based predictions and the similarity matrix on the test dataset using the KNNWithMeans class"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e6fb78b7",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Computing the msd similarity matrix...\n",
+      "Done computing similarity matrix.\n",
+      "3.4190898791540785\n"
+     ]
+    }
+   ],
+   "source": [
+    "# -- using surprise's user-based algorithm, explore the impact of different parameters and displays predictions --\n",
+    "\n",
+    "# Define the similarity options\n",
+    "sim_options = {\n",
+    "    'name': 'msd',  # Mean Squared Difference (Mean Square Error)\n",
+    "    'user_based': True,  # User-based collaborative filtering\n",
+    "    'min_support': 3  # Minimum number of common ratings required\n",
+    "}\n",
+    "\n",
+    "# Create an instance of KNNWithMeans with the specified options\n",
+    "knn_model = KNNWithMeans(k=3, min_k=2, sim_options=sim_options)\n",
+    "\n",
+    "# Train the algorithm on the trainset\n",
+    "knn_model.fit(trainset).test(testset)\n",
+    "\n",
+    "# Make an estimation for user 11 and item 364\n",
+    "prediction = knn_model.predict('11', '364')\n",
+    "print(prediction.est)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ffe89c56",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Computing the msd similarity matrix...\n",
+      "Done computing similarity matrix.\n",
+      "Predictions with min_k = 1:\n",
+      "User: 15, Item: 942, Rating: 3.7769516356699464\n",
+      "User: 15, Item: 2117, Rating: 2.9340004894942537\n",
+      "User: 15, Item: 2672, Rating: 2.371008709611413\n",
+      "User: 15, Item: 5054, Rating: 3.010328638497653\n",
+      "User: 15, Item: 6322, Rating: 1.711175832857413\n",
+      "User: 15, Item: 6323, Rating: 1.7645762379992287\n",
+      "User: 15, Item: 6757, Rating: 3.010328638497653\n",
+      "User: 15, Item: 7700, Rating: 3.561484741491386\n",
+      "User: 15, Item: 7981, Rating: 3.386000174210522\n",
+      "User: 15, Item: 8600, Rating: 3.320743223639117\n",
+      "User: 15, Item: 8620, Rating: 2.7538763809343654\n",
+      "User: 15, Item: 31952, Rating: 3.7409900837647396\n",
+      "User: 15, Item: 3, Rating: 2.222062601579949\n",
+      "User: 15, Item: 64, Rating: 0.9224387353614938\n",
+      "User: 15, Item: 206, Rating: 2.35668733389394\n",
+      "User: 15, Item: 249, Rating: 3.1290259851652826\n",
+      "User: 15, Item: 276, Rating: 2.1800017354806753\n",
+      "User: 15, Item: 369, Rating: 2.3082373858282694\n",
+      "User: 15, Item: 504, Rating: 2.2600496220227573\n",
+      "User: 15, Item: 515, Rating: 3.6575674086958188\n",
+      "User: 15, Item: 522, Rating: 2.4562020809509626\n",
+      "User: 15, Item: 580, Rating: 1.9073310817298395\n",
+      "User: 15, Item: 599, Rating: 2.780847470837928\n",
+      "User: 15, Item: 915, Rating: 2.761094249104645\n",
+      "User: 15, Item: 966, Rating: 3.0894953051643195\n",
+      "User: 15, Item: 1274, Rating: 2.9873500196382845\n",
+      "User: 15, Item: 1299, Rating: 3.0779327239728005\n",
+      "User: 15, Item: 1345, Rating: 2.2037629856623138\n",
+      "User: 15, Item: 1354, Rating: 2.001877412379849\n",
+      "User: 15, Item: 532, Rating: 2.7123071345260277\n",
+      "Computing the msd similarity matrix...\n",
+      "Done computing similarity matrix.\n",
+      "Predictions with min_k = 2:\n",
+      "User: 15, Item: 942, Rating: 3.7769516356699464\n",
+      "User: 15, Item: 2117, Rating: 2.9340004894942537\n",
+      "User: 15, Item: 2672, Rating: 2.371008709611413\n",
+      "User: 15, Item: 5054, Rating: 2.693661971830986\n",
+      "User: 15, Item: 6322, Rating: 1.711175832857413\n",
+      "User: 15, Item: 6323, Rating: 1.7645762379992287\n",
+      "User: 15, Item: 6757, Rating: 2.693661971830986\n",
+      "User: 15, Item: 7700, Rating: 3.561484741491386\n",
+      "User: 15, Item: 7981, Rating: 3.386000174210522\n",
+      "User: 15, Item: 8600, Rating: 3.320743223639117\n",
+      "User: 15, Item: 8620, Rating: 2.7538763809343654\n",
+      "User: 15, Item: 31952, Rating: 3.7409900837647396\n",
+      "User: 15, Item: 3, Rating: 2.222062601579949\n",
+      "User: 15, Item: 64, Rating: 0.9224387353614938\n",
+      "User: 15, Item: 206, Rating: 2.35668733389394\n",
+      "User: 15, Item: 249, Rating: 3.1290259851652826\n",
+      "User: 15, Item: 276, Rating: 2.1800017354806753\n",
+      "User: 15, Item: 369, Rating: 2.3082373858282694\n",
+      "User: 15, Item: 504, Rating: 2.2600496220227573\n",
+      "User: 15, Item: 515, Rating: 3.6575674086958188\n",
+      "User: 15, Item: 522, Rating: 2.4562020809509626\n",
+      "User: 15, Item: 580, Rating: 1.9073310817298395\n",
+      "User: 15, Item: 599, Rating: 2.780847470837928\n",
+      "User: 15, Item: 915, Rating: 2.761094249104645\n",
+      "User: 15, Item: 966, Rating: 2.693661971830986\n",
+      "User: 15, Item: 1274, Rating: 2.9873500196382845\n",
+      "User: 15, Item: 1299, Rating: 3.0779327239728005\n",
+      "User: 15, Item: 1345, Rating: 2.2037629856623138\n",
+      "User: 15, Item: 1354, Rating: 2.001877412379849\n",
+      "User: 15, Item: 532, Rating: 2.7123071345260277\n",
+      "Computing the msd similarity matrix...\n",
+      "Done computing similarity matrix.\n",
+      "Predictions with min_k = 3:\n",
+      "User: 15, Item: 942, Rating: 3.7769516356699464\n",
+      "User: 15, Item: 2117, Rating: 2.9340004894942537\n",
+      "User: 15, Item: 2672, Rating: 2.371008709611413\n",
+      "User: 15, Item: 5054, Rating: 2.693661971830986\n",
+      "User: 15, Item: 6322, Rating: 2.693661971830986\n",
+      "User: 15, Item: 6323, Rating: 1.7645762379992287\n",
+      "User: 15, Item: 6757, Rating: 2.693661971830986\n",
+      "User: 15, Item: 7700, Rating: 2.693661971830986\n",
+      "User: 15, Item: 7981, Rating: 3.386000174210522\n",
+      "User: 15, Item: 8600, Rating: 2.693661971830986\n",
+      "User: 15, Item: 8620, Rating: 2.7538763809343654\n",
+      "User: 15, Item: 31952, Rating: 2.693661971830986\n",
+      "User: 15, Item: 3, Rating: 2.222062601579949\n",
+      "User: 15, Item: 64, Rating: 0.9224387353614938\n",
+      "User: 15, Item: 206, Rating: 2.35668733389394\n",
+      "User: 15, Item: 249, Rating: 3.1290259851652826\n",
+      "User: 15, Item: 276, Rating: 2.1800017354806753\n",
+      "User: 15, Item: 369, Rating: 2.3082373858282694\n",
+      "User: 15, Item: 504, Rating: 2.2600496220227573\n",
+      "User: 15, Item: 515, Rating: 3.6575674086958188\n",
+      "User: 15, Item: 522, Rating: 2.4562020809509626\n",
+      "User: 15, Item: 580, Rating: 1.9073310817298395\n",
+      "User: 15, Item: 599, Rating: 2.780847470837928\n",
+      "User: 15, Item: 915, Rating: 2.761094249104645\n",
+      "User: 15, Item: 966, Rating: 2.693661971830986\n",
+      "User: 15, Item: 1274, Rating: 2.9873500196382845\n",
+      "User: 15, Item: 1299, Rating: 3.0779327239728005\n",
+      "User: 15, Item: 1345, Rating: 2.2037629856623138\n",
+      "User: 15, Item: 1354, Rating: 2.001877412379849\n",
+      "User: 15, Item: 532, Rating: 2.7123071345260277\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Playing with KNN\n",
+    "\n",
+    "# Define the similarity options\n",
+    "sim_options = {\n",
+    "    'name': 'msd',  # Mean Squared Difference (Mean Square Error)\n",
+    "    'user_based': True,  # User-based collaborative filtering\n",
+    "    'min_support': 3  # Minimum number of common ratings required\n",
+    "}\n",
+    "\n",
+    "# For each min_k value, fit a KNNWithMeans model and display its first 30 predictions\n",
+    "def predict_ratings(trainset, testset, min_k_values):\n",
+    "    for min_k in min_k_values:\n",
+    "        knn_model = KNNWithMeans(sim_options=sim_options, k=3, min_k=min_k)\n",
+    "        # Train the algorithm on the trainset\n",
+    "        knn_model.fit(trainset)\n",
+    "\n",
+    "        # Make predictions for all ratings in the anti testset\n",
+    "        predictions = knn_model.test(testset)\n",
+    "\n",
+    "        # Display 30 predictions\n",
+    "        print(f\"Predictions with min_k = {min_k}:\")\n",
+    "        for prediction in predictions[:30]:\n",
+    "            print(f\"User: {prediction.uid}, Item: {prediction.iid}, Rating: {prediction.est}\")\n",
+    "\n",
+    "# Assuming trainset and testset are already defined\n",
+    "predict_ratings(trainset, testset, min_k_values=[1, 2, 3])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c5209097",
+   "metadata": {},
+   "source": [
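+    "Quelle que soit la valeur de min_k (1, 2 ou 3), la plupart des ratings prédits ne changent pas ; seuls quelques items (par exemple 5054, 6757 et 966) voient leur prédiction changer."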
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c8890e11",
+   "metadata": {},
+   "source": [
+    "1). Predictions with min_k = 1: In this case, the model makes a prediction as soon as a single neighbor is available. Each prediction is made solely based on the similarity between the target user and other users who have rated the same items. Consequently, we observe varying prediction values for different items. For instance, for user 15 and item 942, the predicted rating is 3.777, while for item 64, the predicted rating is only 0.922. This indicates that a prediction may rely on the rating of a single neighbor, leading to potentially erratic predictions.\n",
+    "\n",
+    "2). Predictions with min_k = 2: Here, a minimum of 2 neighbors are required to make a prediction. This introduces a bit of regularization, ensuring that predictions are made based on a slightly broader consensus. We notice that the predictions are somewhat similar to those with min_k = 1, but there are slight changes in some ratings. For example, the rating for item 5054 changes from 3.010 to 2.694. This suggests that the model is slightly more conservative in its predictions due to the requirement of at least two neighbors.\n",
+    "\n",
+    "3). Predictions with min_k = 3: With a minimum of 3 neighbors, the model becomes even more conservative. It requires a stronger consensus among users before making predictions. As a result, we see more uniformity in the predicted ratings compared to the previous cases. For example, for item 6322, the prediction stays at 1.711 for min_k = 1 and min_k = 2, and changes to 2.694 for min_k = 3, a value shared by every item that has too few neighbors. This indicates that the model is increasingly cautious as it demands more agreement among neighbors before making predictions."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cc806424",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Prédictions avec min_support = 1:\n",
+      "User: 15, Item: 942, Actual_k: 3\n",
+      "User: 15, Item: 2117, Actual_k: 3\n",
+      "User: 15, Item: 2672, Actual_k: 3\n",
+      "User: 15, Item: 5054, Actual_k: 1\n",
+      "User: 15, Item: 6322, Actual_k: 2\n",
+      "User: 15, Item: 6323, Actual_k: 3\n",
+      "User: 15, Item: 6757, Actual_k: 1\n",
+      "User: 15, Item: 7700, Actual_k: 2\n",
+      "User: 15, Item: 7981, Actual_k: 3\n",
+      "User: 15, Item: 8600, Actual_k: 2\n",
+      "User: 15, Item: 8620, Actual_k: 3\n",
+      "User: 15, Item: 31952, Actual_k: 2\n",
+      "User: 15, Item: 3, Actual_k: 3\n",
+      "User: 15, Item: 64, Actual_k: 3\n",
+      "User: 15, Item: 206, Actual_k: 3\n",
+      "User: 15, Item: 249, Actual_k: 3\n",
+      "User: 15, Item: 276, Actual_k: 3\n",
+      "User: 15, Item: 369, Actual_k: 3\n",
+      "User: 15, Item: 504, Actual_k: 3\n",
+      "User: 15, Item: 515, Actual_k: 3\n",
+      "User: 15, Item: 522, Actual_k: 3\n",
+      "User: 15, Item: 580, Actual_k: 3\n",
+      "User: 15, Item: 599, Actual_k: 3\n",
+      "User: 15, Item: 915, Actual_k: 3\n",
+      "User: 15, Item: 966, Actual_k: 1\n",
+      "User: 15, Item: 1274, Actual_k: 3\n",
+      "User: 15, Item: 1299, Actual_k: 3\n",
+      "User: 15, Item: 1345, Actual_k: 3\n",
+      "User: 15, Item: 1354, Actual_k: 3\n",
+      "User: 15, Item: 532, Actual_k: 3\n",
+      "\n",
+      "Prédictions avec min_support = 2:\n",
+      "User: 15, Item: 942, Actual_k: 3\n",
+      "User: 15, Item: 2117, Actual_k: 3\n",
+      "User: 15, Item: 2672, Actual_k: 3\n",
+      "User: 15, Item: 5054, Actual_k: 1\n",
+      "User: 15, Item: 6322, Actual_k: 2\n",
+      "User: 15, Item: 6323, Actual_k: 3\n",
+      "User: 15, Item: 6757, Actual_k: 1\n",
+      "User: 15, Item: 7700, Actual_k: 2\n",
+      "User: 15, Item: 7981, Actual_k: 3\n",
+      "User: 15, Item: 8600, Actual_k: 2\n",
+      "User: 15, Item: 8620, Actual_k: 3\n",
+      "User: 15, Item: 31952, Actual_k: 2\n",
+      "User: 15, Item: 3, Actual_k: 3\n",
+      "User: 15, Item: 64, Actual_k: 3\n",
+      "User: 15, Item: 206, Actual_k: 3\n",
+      "User: 15, Item: 249, Actual_k: 3\n",
+      "User: 15, Item: 276, Actual_k: 3\n",
+      "User: 15, Item: 369, Actual_k: 3\n",
+      "User: 15, Item: 504, Actual_k: 3\n",
+      "User: 15, Item: 515, Actual_k: 3\n",
+      "User: 15, Item: 522, Actual_k: 3\n",
+      "User: 15, Item: 580, Actual_k: 3\n",
+      "User: 15, Item: 599, Actual_k: 3\n",
+      "User: 15, Item: 915, Actual_k: 3\n",
+      "User: 15, Item: 966, Actual_k: 1\n",
+      "User: 15, Item: 1274, Actual_k: 3\n",
+      "User: 15, Item: 1299, Actual_k: 3\n",
+      "User: 15, Item: 1345, Actual_k: 3\n",
+      "User: 15, Item: 1354, Actual_k: 3\n",
+      "User: 15, Item: 532, Actual_k: 3\n",
+      "\n",
+      "Prédictions avec min_support = 3:\n",
+      "User: 15, Item: 942, Actual_k: 3\n",
+      "User: 15, Item: 2117, Actual_k: 3\n",
+      "User: 15, Item: 2672, Actual_k: 3\n",
+      "User: 15, Item: 5054, Actual_k: 1\n",
+      "User: 15, Item: 6322, Actual_k: 2\n",
+      "User: 15, Item: 6323, Actual_k: 3\n",
+      "User: 15, Item: 6757, Actual_k: 1\n",
+      "User: 15, Item: 7700, Actual_k: 2\n",
+      "User: 15, Item: 7981, Actual_k: 3\n",
+      "User: 15, Item: 8600, Actual_k: 2\n",
+      "User: 15, Item: 8620, Actual_k: 3\n",
+      "User: 15, Item: 31952, Actual_k: 2\n",
+      "User: 15, Item: 3, Actual_k: 3\n",
+      "User: 15, Item: 64, Actual_k: 3\n",
+      "User: 15, Item: 206, Actual_k: 3\n",
+      "User: 15, Item: 249, Actual_k: 3\n",
+      "User: 15, Item: 276, Actual_k: 3\n",
+      "User: 15, Item: 369, Actual_k: 3\n",
+      "User: 15, Item: 504, Actual_k: 3\n",
+      "User: 15, Item: 515, Actual_k: 3\n",
+      "User: 15, Item: 522, Actual_k: 3\n",
+      "User: 15, Item: 580, Actual_k: 3\n",
+      "User: 15, Item: 599, Actual_k: 3\n",
+      "User: 15, Item: 915, Actual_k: 3\n",
+      "User: 15, Item: 966, Actual_k: 1\n",
+      "User: 15, Item: 1274, Actual_k: 3\n",
+      "User: 15, Item: 1299, Actual_k: 3\n",
+      "User: 15, Item: 1345, Actual_k: 3\n",
+      "User: 15, Item: 1354, Actual_k: 3\n",
+      "User: 15, Item: 532, Actual_k: 3\n",
+      "\n",
+      "Matrice de similarité:\n",
+      "[[1.         0.39130435 0.35942029 ... 0.24358974 0.28513238 0.21451104]\n",
+      " [0.39130435 1.         0.32786885 ... 0.30967742 0.42424242 0.21621622]\n",
+      " [0.35942029 0.32786885 1.         ... 0.36666667 0.72727273 0.34375   ]\n",
+      " ...\n",
+      " [0.24358974 0.30967742 0.36666667 ... 1.         0.6779661  0.37569061]\n",
+      " [0.28513238 0.42424242 0.72727273 ... 0.6779661  1.         0.83333333]\n",
+      " [0.21451104 0.21621622 0.34375    ... 0.37569061 0.83333333 1.        ]]\n",
+      "None\n"
+     ]
+    }
+   ],
+   "source": [
+    "def analyse_min_support(knn_model, testset):\n",
+    "    # Rétablir min_k à 2\n",
+    "    knn_model.min_k = 2\n",
+    "\n",
+    "    # Modifier min_support de 1 à 3 et observer actual_k\n",
+    "    for min_support in range(1, 4):\n",
+    "        knn_model.sim_options['min_support'] = min_support\n",
+    "        predictions_min_support = knn_model.test(testset[:30])  # Prendre les 30 premières prédictions pour l'affichage\n",
+    "        print(f\"\\nPrédictions avec min_support = {min_support}:\")\n",
+    "        for prediction in predictions_min_support:\n",
+    "            actual_k = prediction.details['actual_k']\n",
+    "            print(f\"User: {prediction.uid}, Item: {prediction.iid}, Actual_k: {actual_k}\")\n",
+    "\n",
+    "    # Visualiser la matrice de similarité\n",
+    "    similarity_matrix = knn_model.sim  # Algorithme de knn_model\n",
+    "    print(\"\\nMatrice de similarité:\")\n",
+    "    print(similarity_matrix)\n",
+    "\n",
+    "# Appel de la fonction et impression de l'analyse\n",
+    "result = analyse_min_support(knn_model, testset)\n",
+    "print(result)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2dd01f5b",
+   "metadata": {},
+   "source": [
+    "# 3. Implement and explore a customizable user-based algorithm\n",
+    "Create a self-made user-based algorithm allowing to customize the similarity metric, peer group calculation and aggregation function"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d03ed9eb",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[[3.  1.5 4.  ... nan nan nan]\n",
+      " [nan nan nan ... nan nan nan]\n",
+      " [4.  3.  3.  ... nan nan nan]\n",
+      " ...\n",
+      " [4.5 nan nan ... nan nan nan]\n",
+      " [nan nan nan ... nan nan nan]\n",
+      " [2.  nan nan ... nan nan nan]]\n"
+     ]
+    }
+   ],
+   "source": [
+    "class UserBased(AlgoBase):\n",
+    "    def __init__(self, k=3, min_k=1, sim_options={}, **kwargs):\n",
+    "        AlgoBase.__init__(self, sim_options=sim_options, **kwargs)\n",
+    "        self.k = k\n",
+    "        self.min_k = min_k\n",
+    "        self.sim_options = sim_options\n",
+    "\n",
+    "        \n",
+    "    def fit(self, trainset):\n",
+    "        AlgoBase.fit(self, trainset)\n",
+    "        self.compute_rating_matrix()\n",
+    "        self.compute_similarity_matrix()\n",
+    "        self.compute_mean_ratings()\n",
+    "    \n",
+    "    def estimate(self, u, i):\n",
+    "        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):\n",
+    "            raise PredictionImpossible('User and/or item is unknown.')\n",
+    "\n",
+    "        estimate = self.mean_ratings[u]\n",
+    "\n",
+    "        # Step 1: Create the peer group of user u for item i\n",
+    "        peer_group = []\n",
+    "        for j, rating in enumerate(self.trainset.ir[i]):\n",
+    "            if rating is not None:\n",
+    "                similarity = self.sim[u, j]  # Similarity between user u and user j for item i\n",
+    "                peer_group.append((j, similarity, rating))\n",
+    "\n",
+    "        # Step 2: Pick up the top neighbors efficiently\n",
+    "        k_neighbors = heapq.nlargest(self.min_k, peer_group, key=lambda x: x[1])  # Keeps the self.min_k most similar peers (NOTE: min_k is used here, not k)\n",
+    "\n",
+    "        # Step 3: Compute the weighted average\n",
+    "        actual_k = len(k_neighbors)\n",
+    "        if actual_k >= self.min_k:\n",
+    "            weighted_sum = 0\n",
+    "            total_similarity = 0\n",
+    "            for j, similarity, rating_list in k_neighbors:\n",
+    "                # NOTE(review): surprise's trainset.ir[i] holds (inner_user_id, rating) tuples, so index 0 below may be the user id rather than the rating - verify\n",
+    "                rating = rating_list[0]  # Access the first element of the rating list\n",
+    "                weighted_sum += similarity * rating\n",
+    "                total_similarity += similarity\n",
+    "\n",
+    "            if total_similarity != 0:\n",
+    "                peer_group_average = weighted_sum / total_similarity\n",
+    "                estimate += peer_group_average\n",
+    "\n",
+    "        return estimate\n",
+    "\n",
+    "                    \n",
+    "    def compute_rating_matrix(self):\n",
+    "        # Get the number of users and items\n",
+    "        n_users = self.trainset.n_users\n",
+    "        n_items = self.trainset.n_items\n",
+    "    \n",
+    "        ratings_matrix = np.empty((n_users, n_items))\n",
+    "        ratings_matrix[:] = np.nan\n",
+    "\n",
+    "        # Fill in the ratings matrix with available ratings\n",
+    "        for user_id, user_ratings in self.trainset.ur.items():\n",
+    "            if user_ratings:  # Check if user has ratings\n",
+    "                for item_id, rating in user_ratings:\n",
+    "                    ratings_matrix[user_id, item_id] = rating\n",
+    "    \n",
+    "        # Set the computed ratings matrix to self.ratings_matrix\n",
+    "        self.ratings_matrix = ratings_matrix\n",
+    "    \n",
+    "    def compute_similarity_matrix(self):\n",
+    "        # Get the number of users\n",
+    "        n_users = self.trainset.n_users\n",
+    "        \n",
+    "        # Initialize the similarity matrix with zeros and ones in the diagonal\n",
+    "        similarity_matrix = np.eye(n_users)\n",
+    "        \n",
+    "        # Iterate through pairs of users to compute similarities\n",
+    "        for i in range(n_users):\n",
+    "            for j in range(i + 1, n_users):\n",
+    "                # Compute support\n",
+    "                support = np.sum(~np.isnan(self.ratings_matrix[i]) & ~np.isnan(self.ratings_matrix[j]))\n",
+    "                \n",
+    "                # Check if support is greater than or equal to min_k\n",
+    "                if support >= self.min_k:\n",
+    "                    # Compute similarity using Jaccard similarity\n",
+    "                    intersection = np.sum(~np.isnan(self.ratings_matrix[i]) & ~np.isnan(self.ratings_matrix[j]))\n",
+    "                    union = np.sum(~np.isnan(self.ratings_matrix[i]) | ~np.isnan(self.ratings_matrix[j]))\n",
+    "                    similarity = intersection / union\n",
+    "                    similarity_matrix[i, j] = similarity\n",
+    "                    similarity_matrix[j, i] = similarity  # Similarity matrix is symmetric\n",
+    "        \n",
+    "        # Set the computed similarity matrix to self.sim\n",
+    "        self.sim = similarity_matrix\n",
+    "    \n",
+    "    def compute_mean_ratings(self):\n",
+    "        # Compute the mean rating of every user\n",
+    "        mean_ratings = []\n",
+    "        for user_id, ratings in self.trainset.ur.items():\n",
+    "            if ratings:  # Check if user has ratings\n",
+    "                mean_rating = np.mean([rating[1] for rating in ratings])\n",
+    "                mean_ratings.append(mean_rating)\n",
+    "            else:\n",
+    "                mean_ratings.append(0)  # If no ratings available, set mean to 0\n",
+    "        \n",
+    "        # Set the computed mean ratings\n",
+    "        self.mean_ratings = mean_ratings\n",
+    "\n",
+    "    \n",
+    "user_based_instance = UserBased(trainset=trainset)\n",
+    "\n",
+    "# Appel de la méthode fit pour calculer les matrices des évaluations, de similarité et les moyennes des évaluations\n",
+    "user_based_instance.fit(trainset)\n",
+    "\n",
+    "# Affichage de la matrice des évaluations\n",
+    "print(user_based_instance.ratings_matrix)\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "dfdc9cfe",
+   "metadata": {},
+   "source": [
+    "# 4. Compare KNNWithMeans with UserBased\n",
+    "Try to replicate KNNWithMeans with your self-made UserBased and check that outcomes are identical"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "be53ae27",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "UserBased MAE: 1.5398252671298895\n",
+      "UserBased RMSE: 1.5553141029705104\n",
+      "KNNWithMeans MAE: 0.5419110316300769\n",
+      "KNNWithMeans RMSE: 0.7019543155680094\n"
+     ]
+    }
+   ],
+   "source": [
+    "# 1. Obtain Predictions\n",
+    "# Using UserBased algorithm\n",
+    "user_based_predictions = []\n",
+    "for uid, iid, true_r in testset:\n",
+    "    user_based_pred = user_based_instance.predict(uid, iid)\n",
+    "    user_based_predictions.append((uid, iid, true_r, user_based_pred.est, {}))\n",
+    "\n",
+    "# Using KNNWithMeans algorithm\n",
+    "knn_predictions = []\n",
+    "for uid, iid, true_r in testset:\n",
+    "    knn_pred = knn_model.predict(uid, iid)\n",
+    "    knn_predictions.append((uid, iid, true_r, knn_pred.est, knn_pred.details))\n",
+    "\n",
+    "# 2. Calculate Metrics\n",
+    "# Calculate MAE and RMSE for UserBased algorithm\n",
+    "user_based_mae = accuracy.mae(user_based_predictions, verbose=False)\n",
+    "user_based_rmse = accuracy.rmse(user_based_predictions, verbose=False)\n",
+    "\n",
+    "# Calculate MAE and RMSE for KNNWithMeans algorithm\n",
+    "knn_mae = accuracy.mae(knn_predictions, verbose=False)\n",
+    "knn_rmse = accuracy.rmse(knn_predictions, verbose=False)\n",
+    "\n",
+    "# 3. Compare Results\n",
+    "print(\"UserBased MAE:\", user_based_mae)\n",
+    "print(\"UserBased RMSE:\", user_based_rmse)\n",
+    "print(\"KNNWithMeans MAE:\", knn_mae)\n",
+    "print(\"KNNWithMeans RMSE:\", knn_rmse)\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cced76d9",
+   "metadata": {},
+   "source": [
+    "# 5. Compare MSD and Jaccard\n",
+    "Compare predictions made with MSD similarity and Jaccard similarity\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c20d8e19",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Computing the msd similarity matrix...\n",
+      "Done computing similarity matrix.\n",
+      "Computing the cosine similarity matrix...\n",
+      "Done computing similarity matrix.\n",
+      "RMSE: 0.9799\n",
+      "RMSE: 0.9871\n",
+      "RMSE with MSD similarity: 0.9798533097556152\n",
+      "RMSE with Jaccard similarity: 0.9870653791755158\n"
+     ]
+    }
+   ],
+   "source": [
+    "from surprise import accuracy\n",
+    "from surprise.model_selection import train_test_split\n",
+    "from surprise import Dataset, Reader\n",
+    "from surprise import KNNBasic\n",
+    "\n",
+    "\n",
+    "# Split the dataset into training and testing sets\n",
+    "trainset, testset = train_test_split(surprise_data, test_size=0.2)\n",
+    "\n",
+    "# Initialize the model with MSD similarity\n",
+    "sim_options_msd = {'name': 'msd'}\n",
+    "user_based_msd = KNNBasic(sim_options=sim_options_msd)\n",
+    "user_based_msd.fit(trainset)\n",
+    "\n",
+    "# Initialize the second model (NOTE: despite the 'jaccard' naming, it is configured with cosine similarity)\n",
+    "sim_options_jaccard = {'name': 'cosine'}\n",
+    "user_based_jaccard = KNNBasic(sim_options=sim_options_jaccard)\n",
+    "user_based_jaccard.fit(trainset)\n",
+    "\n",
+    "# Make predictions with each model on the test set\n",
+    "predictions_msd = user_based_msd.test(testset)\n",
+    "predictions_jaccard = user_based_jaccard.test(testset)\n",
+    "\n",
+    "# Calculate and display the performances of the two models\n",
+    "rmse_msd = accuracy.rmse(predictions_msd)\n",
+    "rmse_jaccard = accuracy.rmse(predictions_jaccard)\n",
+    "\n",
+    "print(\"RMSE with MSD similarity:\", rmse_msd)\n",
+    "print(\"RMSE with Jaccard similarity:\", rmse_jaccard)\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "mon_environnement",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
-- 
GitLab