%% Cell type:markdown id:f4a8f664 tags:
# Custom User-based Model
This notebook aims at creating a UserBased class that inherits from the AlgoBase class (Surprise package) and that can be customized with various similarity metrics, peer groups and score aggregation functions.
%% Cell type:code id:00d1b249 tags:
``` python
# reloads modules automatically before entering the execution of code
%load_ext autoreload
%autoreload 2
# standard library imports
# -- add new imports here --
# third parties imports
import numpy as np
import pandas as pd
from surprise import AlgoBase, Dataset, KNNWithMeans, PredictionImpossible, Reader, accuracy
from surprise.model_selection import train_test_split
# -- add new imports here --

# local imports
from constants import Constant as C
from loaders import load_items, load_ratings
# -- add new imports here --
```
%% Output
The autoreload extension is already loaded. To reload it, use:
%reload_ext autoreload
%% Cell type:markdown id:22716aa3 tags:
# 1. Loading Data
Prepare a dataset to help implement a user-based recommender system
%% Cell type:code id:cf3ccdc0 tags:
``` python
# -- load data, build trainset and anti testset --
df_ratings = load_ratings()
df_movies = load_items()
# df_ratings is a pandas DataFrame with columns ['userId', 'movieId', 'rating']
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df_ratings[['userId', 'movieId', 'rating']], reader)
# Build train set with all available ratings
trainset = data.build_full_trainset()
# Build anti-test set
testset = trainset.build_anti_testset()
```
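%% Cell type:markdown tags:
As a quick sanity check, the sketch below only prints sizes from the `trainset` and `testset` objects built above; it assumes nothing beyond those two names.
%% Cell type:code tags:
``` python
# Sanity check: dimensions of the full train set and of the anti-test set
print("Users in train set:", trainset.n_users)
print("Items in train set:", trainset.n_items)
print("Ratings in train set:", trainset.n_ratings)
print("User-item pairs in anti-test set:", len(testset))
```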
%% Cell type:markdown id:94adf3a6 tags:
# 2. Explore Surprise's user-based algorithm
Display user-based predictions and the similarity matrix on the test dataset using the KNNWithMeans class
%% Cell type:code id:e6fb78b7 tags:
``` python
# -- using surprise's user-based algorithm, explore the impact of different parameters and display predictions --

# Create an instance of the KNNWithMeans algorithm with specified parameters
sim_options = {
    'name': 'msd',        # Mean Squared Difference similarity
    'user_based': True,   # User-based collaborative filtering
    'min_support': 3      # Minimum 3 common ratings between two users
}
knn_model = KNNWithMeans(sim_options=sim_options, k=3, min_k=2)

# Train the algorithm on the train set
knn_model.fit(trainset)

# Make a prediction for user 11 and item 364
# (raw ids are integers in df_ratings; passing strings would make the user/item unknown)
user_id = 11
item_id = 364
prediction = knn_model.predict(user_id, item_id)

# Print the estimated rating
print("Estimated rating for user", user_id, "and item", item_id, ":", prediction.est)

# Compare against the expected value
expected_rating = 2.49
print("Expected rating:", expected_rating)
```
%% Output
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimated rating for user 11 and item 364 : 3.543608255669773
Expected rating: 2.49
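%% Cell type:markdown tags:
When the estimate differs from the expected value, the `details` field of the returned `Prediction` can help diagnose why: `was_impossible` indicates whether Surprise fell back to a default because the user or item was unknown, and `actual_k` (for KNN algorithms) is the number of neighbours actually used. This is a small diagnostic sketch using only the `prediction` object from the cell above.
%% Cell type:code tags:
``` python
# Inspect how the prediction was computed
print("was_impossible:", prediction.details.get('was_impossible'))
print("actual_k:", prediction.details.get('actual_k'))
```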
%% Cell type:code id:ae0c6389 tags:
``` python
# Make predictions for all ratings in the anti-test set
predictions = knn_model.test(testset)

# Display the first 30 predictions
print("Predictions with min_k = 2:")
for i, pred in enumerate(predictions[:30]):
    print("Prediction", i+1, "- User:", pred.uid, "Item:", pred.iid, "Rating:", pred.est)

# Increase min_k from 2 to 3
knn_model_min_k_3 = KNNWithMeans(sim_options=sim_options, k=3, min_k=3)
knn_model_min_k_3.fit(trainset)
predictions_min_k_3 = knn_model_min_k_3.test(testset)

print("\nPredictions with min_k = 3:")
for i, pred in enumerate(predictions_min_k_3[:30]):
    print("Prediction", i+1, "- User:", pred.uid, "Item:", pred.iid, "Rating:", pred.est)
```
%% Output
Predictions with min_k = 1:
Prediction 1 - User: 1 Item: 10 Rating: 2.6247761735269064
Prediction 2 - User: 1 Item: 17 Rating: 3.1901280347708134
Prediction 3 - User: 1 Item: 39 Rating: 2.5294831050488193
Prediction 4 - User: 1 Item: 47 Rating: 2.979633267599226
Prediction 5 - User: 1 Item: 50 Rating: 4.043171414883629
Prediction 6 - User: 1 Item: 52 Rating: 2.127120896970167
Prediction 7 - User: 1 Item: 62 Rating: 2.7740341044326526
Prediction 8 - User: 1 Item: 110 Rating: 2.69539764245346
Prediction 9 - User: 1 Item: 144 Rating: 2.130512917560504
Prediction 10 - User: 1 Item: 150 Rating: 1.8002080597099979
Prediction 11 - User: 1 Item: 153 Rating: 2.130425891041031
Prediction 12 - User: 1 Item: 161 Rating: 2.673303568569785
Prediction 13 - User: 1 Item: 165 Rating: 3.0645368977287424
Prediction 14 - User: 1 Item: 168 Rating: 2.8552205170570994
Prediction 15 - User: 1 Item: 185 Rating: 2.4116724902671347
Prediction 16 - User: 1 Item: 186 Rating: 1.538610868951305
Prediction 17 - User: 1 Item: 208 Rating: 1.2282040633065185
Prediction 18 - User: 1 Item: 222 Rating: 2.6068547977317538
Prediction 19 - User: 1 Item: 223 Rating: 3.442259793730382
Prediction 20 - User: 1 Item: 225 Rating: 1.9068246621179163
Prediction 21 - User: 1 Item: 235 Rating: 2.232307946145723
Prediction 22 - User: 1 Item: 248 Rating: 1.777854639156695
Prediction 23 - User: 1 Item: 253 Rating: 2.7663461725246226
Prediction 24 - User: 1 Item: 261 Rating: 2.4070539597835885
Prediction 25 - User: 1 Item: 265 Rating: 2.7223969514581756
Prediction 26 - User: 1 Item: 266 Rating: 2.9954744480855413
Prediction 27 - User: 1 Item: 272 Rating: 3.386995604204407
Prediction 28 - User: 1 Item: 273 Rating: 2.200898157416013
Prediction 29 - User: 1 Item: 292 Rating: 1.9188885454059783
Prediction 30 - User: 1 Item: 296 Rating: 3.645542484320921
Computing the msd similarity matrix...
Done computing similarity matrix.
---------------------------------------------------------------------------
KeyboardInterrupt Traceback (most recent call last)
Cell In[8], line 12
10 knn_model_min_k_3 = KNNWithMeans(sim_options=sim_options, k=3, min_k=3)
11 knn_model_min_k_3.fit(trainset)
---> 12 predictions_min_k_3 = knn_model_min_k_3.test(testset)
14 print("\nPredictions with min_k = 3:")
15 for i, pred in enumerate(predictions_min_k_3[:30]):
File /opt/anaconda3/lib/python3.11/site-packages/surprise/prediction_algorithms/algo_base.py:160, in AlgoBase.test(self, testset, verbose)
142 """Test the algorithm on given testset, i.e. estimate all the ratings
143 in the given testset.
144
(...)
156 that contains all the estimated ratings.
157 """
159 # The ratings are translated back to their original scale.
--> 160 predictions = [
161 self.predict(uid, iid, r_ui_trans, verbose=verbose)
162 for (uid, iid, r_ui_trans) in testset
163 ]
164 return predictions
File /opt/anaconda3/lib/python3.11/site-packages/surprise/prediction_algorithms/algo_base.py:161, in <listcomp>(.0)
142 """Test the algorithm on given testset, i.e. estimate all the ratings
143 in the given testset.
144
(...)
156 that contains all the estimated ratings.
157 """
159 # The ratings are translated back to their original scale.
160 predictions = [
--> 161 self.predict(uid, iid, r_ui_trans, verbose=verbose)
162 for (uid, iid, r_ui_trans) in testset
163 ]
164 return predictions
File /opt/anaconda3/lib/python3.11/site-packages/surprise/prediction_algorithms/algo_base.py:102, in AlgoBase.predict(self, uid, iid, r_ui, clip, verbose)
100 details = {}
101 try:
--> 102 est = self.estimate(iuid, iiid)
104 # If the details dict was also returned
105 if isinstance(est, tuple):
File /opt/anaconda3/lib/python3.11/site-packages/surprise/prediction_algorithms/knns.py:191, in KNNWithMeans.estimate(self, u, i)
187 raise PredictionImpossible("User and/or item is unknown.")
189 x, y = self.switch(u, i)
--> 191 neighbors = [(x2, self.sim[x, x2], r) for (x2, r) in self.yr[y]]
192 k_neighbors = heapq.nlargest(self.k, neighbors, key=lambda t: t[1])
194 est = self.means[x]
File /opt/anaconda3/lib/python3.11/site-packages/surprise/prediction_algorithms/knns.py:191, in <listcomp>(.0)
187 raise PredictionImpossible("User and/or item is unknown.")
189 x, y = self.switch(u, i)
--> 191 neighbors = [(x2, self.sim[x, x2], r) for (x2, r) in self.yr[y]]
192 k_neighbors = heapq.nlargest(self.k, neighbors, key=lambda t: t[1])
194 est = self.means[x]
KeyboardInterrupt:
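%% Cell type:markdown tags:
The run above was interrupted because scoring the full anti-test set with this configuration is slow. A practical workaround, sketched below, is to evaluate `min_k = 3` on a sample of the anti-test set only; the sample size of 1000 is an arbitrary illustrative choice.
%% Cell type:code tags:
``` python
# Evaluate min_k = 3 on a sample of the anti-test set to keep the run short
testset_sample = testset[:1000]
predictions_min_k_3_sample = knn_model_min_k_3.test(testset_sample)

print("Predictions with min_k = 3 (sample):")
for i, pred in enumerate(predictions_min_k_3_sample[:30]):
    print("Prediction", i+1, "- User:", pred.uid, "Item:", pred.iid, "Rating:", pred.est)
```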
%% Cell type:code id:224570f9 tags:
``` python
# Set min_support to 3
sim_options_min_support_3 = {
    'name': 'msd',        # Mean Squared Difference similarity
    'user_based': True,   # User-based collaborative filtering
    'min_support': 3      # Minimum 3 common ratings between two users
}
knn_model_min_support_3 = KNNWithMeans(sim_options=sim_options_min_support_3, k=3, min_k=2)
knn_model_min_support_3.fit(trainset)
predictions_min_support_3 = knn_model_min_support_3.test(testset)

# Look at the value of actual_k (stored in the details dict of each Prediction, not as an attribute)
actual_k_values = [pred.details.get('actual_k', 0) for pred in predictions_min_support_3]
print("Actual_k values:", actual_k_values[:30])
```
%% Output
Computing the msd similarity matrix...
Done computing similarity matrix.
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
Cell In[9], line 13
10 predictions_min_support_3 = knn_model_min_support_3.test(testset)
12 # Look at the value of actual_k
---> 13 actual_k_values = [len(pred.actual_k) for pred in predictions_min_support_3]
15 print("Actual_k values:", actual_k_values[:30])
Cell In[9], line 13, in <listcomp>(.0)
10 predictions_min_support_3 = knn_model_min_support_3.test(testset)
12 # Look at the value of actual_k
---> 13 actual_k_values = [len(pred.actual_k) for pred in predictions_min_support_3]
15 print("Actual_k values:", actual_k_values[:30])
AttributeError: 'Prediction' object has no attribute 'actual_k'
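%% Cell type:markdown tags:
Once `actual_k` is read from `pred.details`, its distribution shows how often fewer than `k` neighbours satisfied the `min_support` constraint. A short sketch, assuming `predictions_min_support_3` from the cell above:
%% Cell type:code tags:
``` python
from collections import Counter

# Count how many predictions used 0, 1, 2 or 3 neighbours
actual_k_counts = Counter(pred.details.get('actual_k', 0) for pred in predictions_min_support_3)
print("Distribution of actual_k:", dict(sorted(actual_k_counts.items())))
```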
%% Cell type:markdown id:2dd01f5b tags:
# 3. Implement and explore a customizable user-based algorithm
Create a self-made user-based algorithm that allows customizing the similarity metric, the peer group calculation and the aggregation function
%% Cell type:code id:d03ed9eb tags:
``` python
class UserBased(AlgoBase):
    def __init__(self, k=3, min_k=1, sim_options={}, **kwargs):
        AlgoBase.__init__(self, sim_options=sim_options, **kwargs)
        self.k = k
        self.min_k = min_k

    def fit(self, trainset):
        AlgoBase.fit(self, trainset)
        # -- implement here the fit function --
        # (expected to build the rating matrix, the similarity matrix and self.mean_ratings)
        return self

    def estimate(self, u, i):
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unknown.')
        estimate = self.mean_ratings[u]
        # -- implement here the estimate function --
        return estimate

    def compute_rating_matrix(self):
        pass
        # -- implement here the compute_rating_matrix function --

    def compute_similarity_matrix(self):
        pass
        # -- implement here the compute_similarity_matrix function --
```
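%% Cell type:markdown tags:
One possible way to fill in this skeleton, so that it mirrors KNNWithMeans with MSD similarity, a top-k peer group and a mean-centred weighted average, is sketched below. The class name `UserBasedSketch` and the attribute names `ratings_matrix` and `sim` are illustrative choices rather than requirements; the method bodies can be adapted into the `UserBased` methods above and then fitted on `trainset` for the comparison of section 4.
%% Cell type:code tags:
``` python
import heapq

import numpy as np
from surprise import AlgoBase, PredictionImpossible


class UserBasedSketch(AlgoBase):
    """Sketch of a user-based KNN with MSD similarity and mean-centred aggregation."""

    def __init__(self, k=3, min_k=1, sim_options={}, **kwargs):
        AlgoBase.__init__(self, sim_options=sim_options, **kwargs)
        self.k = k
        self.min_k = min_k

    def fit(self, trainset):
        AlgoBase.fit(self, trainset)
        self.compute_rating_matrix()
        self.compute_similarity_matrix()
        # Mean rating of every user, ignoring missing entries (NaN)
        self.mean_ratings = np.nanmean(self.ratings_matrix, axis=1)
        return self

    def estimate(self, u, i):
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unknown.')
        estimate = self.mean_ratings[u]
        # Peer group: users who rated item i, ranked by similarity to u
        neighbors = [(v, self.sim[u, v], r) for (v, r) in self.trainset.ir[i]]
        k_neighbors = heapq.nlargest(self.k, neighbors, key=lambda t: t[1])
        # Aggregation: weighted average of mean-centred peer ratings
        sum_sim = sum_ratings = actual_k = 0
        for (v, sim_uv, r) in k_neighbors:
            if sim_uv > 0:
                sum_sim += sim_uv
                sum_ratings += sim_uv * (r - self.mean_ratings[v])
                actual_k += 1
        if actual_k >= self.min_k and sum_sim > 0:
            estimate += sum_ratings / sum_sim
        return estimate

    def compute_rating_matrix(self):
        # Dense (n_users x n_items) matrix of ratings, NaN where no rating exists
        n_users, n_items = self.trainset.n_users, self.trainset.n_items
        self.ratings_matrix = np.full((n_users, n_items), np.nan)
        for u, ratings in self.trainset.ur.items():
            for (i, r) in ratings:
                self.ratings_matrix[u, i] = r

    def compute_similarity_matrix(self):
        # MSD similarity: 1 / (msd + 1) over commonly rated items,
        # left at 0 when the support is below sim_options['min_support']
        min_support = self.sim_options.get('min_support', 1)
        n_users = self.trainset.n_users
        self.sim = np.eye(n_users)
        for u in range(n_users):
            for v in range(u + 1, n_users):
                both = ~np.isnan(self.ratings_matrix[u]) & ~np.isnan(self.ratings_matrix[v])
                if both.sum() >= min_support:
                    diff = self.ratings_matrix[u, both] - self.ratings_matrix[v, both]
                    self.sim[u, v] = self.sim[v, u] = 1.0 / ((diff ** 2).mean() + 1.0)
```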
%% Cell type:markdown id:dfdc9cfe tags:
# 4. Compare KNNWithMeans with UserBased
Try to replicate KNNWithMeans with your self-made UserBased and check that outcomes are identical
%% Cell type:code id:be53ae27 tags:
``` python
# -- assert that predictions are the same with different sim_options --
```
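%% Cell type:markdown tags:
A possible way to carry out the comparison, assuming `UserBased` has been completed so that it mirrors KNNWithMeans (for example along the lines of the sketch in section 3). Because the full anti-test set is slow to score, the assertion below runs on an arbitrary sample of 100 pairs; any remaining difference would point at a divergence in the similarity, peer-group or aggregation step.
%% Cell type:code tags:
``` python
# Hypothetical comparison sketch: assumes UserBased has been completed (see section 3)
sim_options_cmp = {'name': 'msd', 'user_based': True, 'min_support': 3}

knn = KNNWithMeans(sim_options=sim_options_cmp, k=3, min_k=2)
custom = UserBased(k=3, min_k=2, sim_options=sim_options_cmp)
knn.fit(trainset)
custom.fit(trainset)

# Compare estimates on a small sample of the anti-test set to keep the run short
for (uid, iid, _) in testset[:100]:
    est_knn = knn.predict(uid, iid).est
    est_custom = custom.predict(uid, iid).est
    assert np.isclose(est_knn, est_custom), (uid, iid, est_knn, est_custom)
print("Predictions match on the sample.")
```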
%% Cell type:markdown id:cced76d9 tags:
# 5. Compare MSD and Jaccard
Compare predictions made with MSD similarity and Jaccard similarity
%% Cell type:code id:c20d8e19 tags:
``` python
# -- compare predictions made with MSD similarity and Jaccard similarity --
```
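%% Cell type:markdown tags:
Jaccard similarity is not one of Surprise's built-in `sim_options`, so it has to be implemented inside the custom class. The sketch below computes a user-user Jaccard similarity matrix from the binary "has rated" pattern (rating values are ignored); the function name and the idea of selecting it through a `sim_options` name such as `'jaccard'` are illustrative assumptions, not part of the Surprise API.
%% Cell type:code tags:
``` python
import numpy as np


# Hypothetical helper: Jaccard similarity between users based on which items they rated
def jaccard_similarity_matrix(ratings_matrix):
    rated = ~np.isnan(ratings_matrix)           # boolean (n_users x n_items) "has rated" mask
    rated_int = rated.astype(int)
    intersection = rated_int @ rated_int.T      # number of items rated by both users
    counts = rated_int.sum(axis=1)
    union = counts[:, None] + counts[None, :] - intersection
    with np.errstate(divide='ignore', invalid='ignore'):
        sim = np.where(union > 0, intersection / union, 0.0)
    return sim

# Example of how it could be plugged into compute_similarity_matrix:
#   if self.sim_options.get('name') == 'jaccard':
#       self.sim = jaccard_similarity_matrix(self.ratings_matrix)
```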