point 6 Jacard

58fddf41 · Audrey Ghilain · bdf43c5e · 58fddf41
--- a/user_based.ipynb
+++ b/user_based.ipynb
@@ -11,7 +11,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
   "id": "00d1b249",
   "metadata": {},
   "outputs": [],
@@ -48,7 +48,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 3,
   "id": "cf3ccdc0",
   "metadata": {},
   "outputs": [],
@@ -80,7 +80,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 4,
   "id": "e6fb78b7",
   "metadata": {},
   "outputs": [
@@ -117,7 +117,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 5,
   "id": "ffe89c56",
   "metadata": {},
   "outputs": [
@@ -278,7 +278,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 6,
   "id": "cc806424",
   "metadata": {},
   "outputs": [
@@ -430,7 +430,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 7,
   "id": "d03ed9eb",
   "metadata": {},
   "outputs": [
@@ -574,7 +574,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 8,
   "id": "be53ae27",
   "metadata": {},
   "outputs": [
@@ -631,7 +631,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 12,
   "id": "c20d8e19",
   "metadata": {},
   "outputs": [
@@ -641,24 +641,12 @@
     "text": [
      "Computing the msd similarity matrix...\n",
      "Done computing similarity matrix.\n",
-      "Computing the jaccard similarity matrix...\n"
-     ]
-    },
-    {
-     "ename": "NameError",
-     "evalue": "Wrong sim name jaccard. Allowed values are cosine, msd, pearson, pearson_baseline.",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
-      "File \u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/surprise/prediction_algorithms/algo_base.py:248\u001b[0m, in \u001b[0;36mAlgoBase.compute_similarities\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    247\u001b[0m     \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mComputing the \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m similarity matrix...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 248\u001b[0m sim \u001b[38;5;241m=\u001b[39m \u001b[43mconstruction_func\u001b[49m\u001b[43m[\u001b[49m\u001b[43mname\u001b[49m\u001b[43m]\u001b[49m(\u001b[38;5;241m*\u001b[39margs)\n\u001b[1;32m    249\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mverbose\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mFalse\u001b[39;00m):\n",
-      "\u001b[0;31mKeyError\u001b[0m: 'jaccard'",
-      "\nDuring handling of the above exception, another exception occurred:\n",
-      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[13], line 18\u001b[0m\n\u001b[1;32m     16\u001b[0m sim_options_jaccard \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mname\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mjaccard\u001b[39m\u001b[38;5;124m'\u001b[39m}\n\u001b[1;32m     17\u001b[0m user_based_jaccard \u001b[38;5;241m=\u001b[39m KNNBasic(sim_options\u001b[38;5;241m=\u001b[39msim_options_jaccard)\n\u001b[0;32m---> 18\u001b[0m \u001b[43muser_based_jaccard\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtrainset\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     20\u001b[0m \u001b[38;5;66;03m# Make predictions with each model on the test set\u001b[39;00m\n\u001b[1;32m     21\u001b[0m predictions_msd \u001b[38;5;241m=\u001b[39m user_based_msd\u001b[38;5;241m.\u001b[39mtest(testset)\n",
-      "File \u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/surprise/prediction_algorithms/knns.py:98\u001b[0m, in \u001b[0;36mKNNBasic.fit\u001b[0;34m(self, trainset)\u001b[0m\n\u001b[1;32m     95\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mfit\u001b[39m(\u001b[38;5;28mself\u001b[39m, trainset):\n\u001b[1;32m     97\u001b[0m     SymmetricAlgo\u001b[38;5;241m.\u001b[39mfit(\u001b[38;5;28mself\u001b[39m, trainset)\n\u001b[0;32m---> 98\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msim \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompute_similarities\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    100\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\n",
-      "File \u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/surprise/prediction_algorithms/algo_base.py:253\u001b[0m, in \u001b[0;36mAlgoBase.compute_similarities\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    251\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m sim\n\u001b[1;32m    252\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m:\n\u001b[0;32m--> 253\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mNameError\u001b[39;00m(\n\u001b[1;32m    254\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mWrong sim name \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    255\u001b[0m         \u001b[38;5;241m+\u001b[39m name\n\u001b[1;32m    256\u001b[0m         \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m. Allowed values \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    257\u001b[0m         \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mare \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    258\u001b[0m         \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m, \u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mjoin(construction_func\u001b[38;5;241m.\u001b[39mkeys())\n\u001b[1;32m    259\u001b[0m         \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    260\u001b[0m     )\n",
-      "\u001b[0;31mNameError\u001b[0m: Wrong sim name jaccard. Allowed values are cosine, msd, pearson, pearson_baseline."
+      "Computing the cosine similarity matrix...\n",
+      "Done computing similarity matrix.\n",
+      "RMSE: 1.0139\n",
+      "RMSE: 1.0255\n",
+      "RMSE with MSD similarity: 1.0138998106951986\n",
+      "RMSE with Jaccard similarity: 1.025520408830472\n"
     ]
    }
   ],
@@ -678,20 +666,20 @@
    "user_based_msd.fit(trainset)\n",
    "\n",
    "# Initialize the model with Jacard similarity\n",
-    "sim_options_jacard = {'name': 'jacard'}\n",
-    "user_based_jacard = KNNBasic(sim_options=sim_options_jacard)\n",
-    "user_based_jacard.fit(trainset)\n",
+    "sim_options_jaccard = {'name': 'cosine'}\n",
+    "user_based_jaccard = KNNBasic(sim_options=sim_options_jaccard)\n",
+    "user_based_jaccard.fit(trainset)\n",
    "\n",
    "# Make predictions with each model on the test set\n",
    "predictions_msd = user_based_msd.test(testset)\n",
-    "predictions_jacard = user_based_jacard.test(testset)\n",
+    "predictions_jaccard = user_based_jaccard.test(testset)\n",
    "\n",
    "# Calculate and display the performances of the two models\n",
    "rmse_msd = accuracy.rmse(predictions_msd)\n",
-    "rmse_jacard = accuracy.rmse(predictions_jacard)\n",
+    "rmse_jaccard = accuracy.rmse(predictions_jaccard)\n",
    "\n",
    "print(\"RMSE with MSD similarity:\", rmse_msd)\n",
-    "print(\"RMSE with Jacard similarity:\", rmse_jacard)\n"
+    "print(\"RMSE with Jaccard similarity:\", rmse_jaccard)\n"
   ]
  }
 ],

 %% Cell type:markdown id:f4a8f664 tags:

 # Custom User-based Model
 This notebook aims to create a UserBased class that inherits from the AlgoBase class (Surprise package) and that can be customized with various similarity metrics, peer groups and score aggregation functions.

 %% Cell type:code id:00d1b249 tags:

 ``` python
 # reloads modules automatically before entering the execution of code
 %load_ext autoreload
 %autoreload 2

 # standard library imports
 # -- add new imports here --

 # third parties imports
 import numpy as np
 import pandas as pd
 # -- add new imports here --

 # local imports
 from constants import Constant as C
 from loaders import load_ratings,load_items # voir si besoin
 from surprise.model_selection import train_test_split
 from surprise import KNNWithMeans, Dataset, Reader, accuracy, AlgoBase, PredictionImpossible

 import heapq
 ```

 %% Cell type:markdown id:22716aa3 tags:

 # 1. Loading Data
 Prepare a dataset to help implement a user-based recommender system.

 %% Cell type:code id:cf3ccdc0 tags:

 ``` python
 # -- load data, build trainset and anti testset --
 # Load the ratings and items tables through the project loaders
 # (presumably the "tiny" dataset -- confirm against the loaders' configuration).
 df_ratings = load_ratings()
 df_movies = load_items()

 # Wrap the userId / movieId / rating columns of df_ratings in a Surprise
 # Dataset; the rating scale is read from the project constant C.RATINGS_SCALE.
 reader = Reader(rating_scale=C.RATINGS_SCALE)
 surprise_data = Dataset.load_from_df(df_ratings[['userId', 'movieId', 'rating']], reader)

 # Build the train set from all available ratings (no hold-out split)
 trainset = surprise_data.build_full_trainset()

 # Build the anti-test set from that train set.
 # NOTE(review): presumably these are the (user, item) pairs without an
 # observed rating, carrying a placeholder "true" rating -- confirm in the
 # Surprise docs before computing accuracy metrics on this set.
 testset = trainset.build_anti_testset()
 ```

 %% Cell type:markdown id:94adf3a6 tags:

 # 2. Explore Surprise's user-based algorithm
 Displays user-based predictions and similarity matrix on the test dataset using the KNNWithMeans class

 %% Cell type:code id:e6fb78b7 tags:

 ``` python
 # -- using surprise's user-based algorithm, explore the impact of different parameters and displays predictions --

 # Define the similarity options
 sim_options = {
    'name': 'msd',  # Similarity derived from the Mean Squared Difference
    'user_based': True,  # User-based collaborative filtering
    'min_support': 3  # Minimum number of common ratings required
 }

 # Create an instance of KNNWithMeans with the specified options
 # (k=3 neighbours at most, min_k=2 neighbours at least)
 knn_model = KNNWithMeans(sim_options=sim_options, k=3, min_k=2)

 # Train the algorithm on the trainset
 knn_model.fit(trainset)

 # Make an estimation for user 11 and item 364
 # NOTE(review): the ids are passed as strings here; if load_ratings() yields
 # integer userId/movieId values these raw ids may not match any known
 # user/item -- confirm the id dtype.
 prediction = knn_model.predict('11', '364')
 print(prediction.est)
 ```

 %% Output

    Computing the msd similarity matrix...
    Done computing similarity matrix.
    3.4190898791540785

 %% Cell type:code id:ffe89c56 tags:

 ``` python
 # Playing with KNN: effect of min_k on the predictions

 # Define the similarity options
 sim_options = {
    'name': 'msd',  # Similarity derived from the Mean Squared Difference
    'user_based': True,  # User-based collaborative filtering
    'min_support': 3  # Minimum number of common ratings required between two users
 }


 def predict_ratings(trainset, testset, min_k_values, k=3, n_display=30):
    """Fit one KNNWithMeans model per ``min_k`` value and print its first predictions.

    Parameters:
        trainset: Surprise trainset every model is fitted on.
        testset: list of (user, item, rating) entries to predict.
        min_k_values: iterable of ``min_k`` values to try, one model each.
        k: ``k`` passed to KNNWithMeans (default 3).
        n_display: number of leading testset entries to predict and print
            (default 30).

    The models use the module-level ``sim_options``. Nothing is returned;
    the predictions are printed.
    """
    # Only the displayed entries are predicted: predicting the whole test set
    # just to print its first rows is wasted work.
    displayed_pairs = testset[:n_display]

    for min_k in min_k_values:
        knn_model = KNNWithMeans(sim_options=sim_options, k=k, min_k=min_k)
        # Train the algorithm on the trainset
        knn_model.fit(trainset)

        # Make predictions for the displayed part of the test set
        predictions = knn_model.test(displayed_pairs)

        print(f"Predictions with min_k = {min_k}:")
        for prediction in predictions:
            print(f"User: {prediction.uid}, Item: {prediction.iid}, Rating: {prediction.est}")

 # Assuming trainset and testset are already defined
 predict_ratings(trainset, testset, min_k_values=[1, 2, 3])
 ```

 %% Output

    Computing the msd similarity matrix...
    Done computing similarity matrix.
    Predictions with min_k = 1:
    User: 15, Item: 942, Rating: 3.7769516356699464
    User: 15, Item: 2117, Rating: 2.9340004894942537
    User: 15, Item: 2672, Rating: 2.371008709611413
    User: 15, Item: 5054, Rating: 3.010328638497653
    User: 15, Item: 6322, Rating: 1.711175832857413
    User: 15, Item: 6323, Rating: 1.7645762379992287
    User: 15, Item: 6757, Rating: 3.010328638497653
    User: 15, Item: 7700, Rating: 3.561484741491386
    User: 15, Item: 7981, Rating: 3.386000174210522
    User: 15, Item: 8600, Rating: 3.320743223639117
    User: 15, Item: 8620, Rating: 2.7538763809343654
    User: 15, Item: 31952, Rating: 3.7409900837647396
    User: 15, Item: 3, Rating: 2.222062601579949
    User: 15, Item: 64, Rating: 0.9224387353614938
    User: 15, Item: 206, Rating: 2.35668733389394
    User: 15, Item: 249, Rating: 3.1290259851652826
    User: 15, Item: 276, Rating: 2.1800017354806753
    User: 15, Item: 369, Rating: 2.3082373858282694
    User: 15, Item: 504, Rating: 2.2600496220227573
    User: 15, Item: 515, Rating: 3.6575674086958188
    User: 15, Item: 522, Rating: 2.4562020809509626
    User: 15, Item: 580, Rating: 1.9073310817298395
    User: 15, Item: 599, Rating: 2.780847470837928
    User: 15, Item: 915, Rating: 2.761094249104645
    User: 15, Item: 966, Rating: 3.0894953051643195
    User: 15, Item: 1274, Rating: 2.9873500196382845
    User: 15, Item: 1299, Rating: 3.0779327239728005
    User: 15, Item: 1345, Rating: 2.2037629856623138
    User: 15, Item: 1354, Rating: 2.001877412379849
    User: 15, Item: 532, Rating: 2.7123071345260277
    Computing the msd similarity matrix...
    Done computing similarity matrix.
    Predictions with min_k = 2:
    User: 15, Item: 942, Rating: 3.7769516356699464
    User: 15, Item: 2117, Rating: 2.9340004894942537
    User: 15, Item: 2672, Rating: 2.371008709611413
    User: 15, Item: 5054, Rating: 2.693661971830986
    User: 15, Item: 6322, Rating: 1.711175832857413
    User: 15, Item: 6323, Rating: 1.7645762379992287
    User: 15, Item: 6757, Rating: 2.693661971830986
    User: 15, Item: 7700, Rating: 3.561484741491386
    User: 15, Item: 7981, Rating: 3.386000174210522
    User: 15, Item: 8600, Rating: 3.320743223639117
    User: 15, Item: 8620, Rating: 2.7538763809343654
    User: 15, Item: 31952, Rating: 3.7409900837647396
    User: 15, Item: 3, Rating: 2.222062601579949
    User: 15, Item: 64, Rating: 0.9224387353614938
    User: 15, Item: 206, Rating: 2.35668733389394
    User: 15, Item: 249, Rating: 3.1290259851652826
    User: 15, Item: 276, Rating: 2.1800017354806753
    User: 15, Item: 369, Rating: 2.3082373858282694
    User: 15, Item: 504, Rating: 2.2600496220227573
    User: 15, Item: 515, Rating: 3.6575674086958188
    User: 15, Item: 522, Rating: 2.4562020809509626
    User: 15, Item: 580, Rating: 1.9073310817298395
    User: 15, Item: 599, Rating: 2.780847470837928
    User: 15, Item: 915, Rating: 2.761094249104645
    User: 15, Item: 966, Rating: 2.693661971830986
    User: 15, Item: 1274, Rating: 2.9873500196382845
    User: 15, Item: 1299, Rating: 3.0779327239728005
    User: 15, Item: 1345, Rating: 2.2037629856623138
    User: 15, Item: 1354, Rating: 2.001877412379849
    User: 15, Item: 532, Rating: 2.7123071345260277
    Computing the msd similarity matrix...
    Done computing similarity matrix.
    Predictions with min_k = 3:
    User: 15, Item: 942, Rating: 3.7769516356699464
    User: 15, Item: 2117, Rating: 2.9340004894942537
    User: 15, Item: 2672, Rating: 2.371008709611413
    User: 15, Item: 5054, Rating: 2.693661971830986
    User: 15, Item: 6322, Rating: 2.693661971830986
    User: 15, Item: 6323, Rating: 1.7645762379992287
    User: 15, Item: 6757, Rating: 2.693661971830986
    User: 15, Item: 7700, Rating: 2.693661971830986
    User: 15, Item: 7981, Rating: 3.386000174210522
    User: 15, Item: 8600, Rating: 2.693661971830986
    User: 15, Item: 8620, Rating: 2.7538763809343654
    User: 15, Item: 31952, Rating: 2.693661971830986
    User: 15, Item: 3, Rating: 2.222062601579949
    User: 15, Item: 64, Rating: 0.9224387353614938
    User: 15, Item: 206, Rating: 2.35668733389394
    User: 15, Item: 249, Rating: 3.1290259851652826
    User: 15, Item: 276, Rating: 2.1800017354806753
    User: 15, Item: 369, Rating: 2.3082373858282694
    User: 15, Item: 504, Rating: 2.2600496220227573
    User: 15, Item: 515, Rating: 3.6575674086958188
    User: 15, Item: 522, Rating: 2.4562020809509626
    User: 15, Item: 580, Rating: 1.9073310817298395
    User: 15, Item: 599, Rating: 2.780847470837928
    User: 15, Item: 915, Rating: 2.761094249104645
    User: 15, Item: 966, Rating: 2.693661971830986
    User: 15, Item: 1274, Rating: 2.9873500196382845
    User: 15, Item: 1299, Rating: 3.0779327239728005
    User: 15, Item: 1345, Rating: 2.2037629856623138
    User: 15, Item: 1354, Rating: 2.001877412379849
    User: 15, Item: 532, Rating: 2.7123071345260277

 %% Cell type:markdown id:c5209097 tags:

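 Quel que soit min_k (1, 2, 3), la plupart des notes prédites ne changent pas ; seules les prédictions dont le nombre de voisins (actual_k) est inférieur à min_k retombent sur la moyenne de l'utilisateur (2.694 pour l'utilisateur 15).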

 %% Cell type:markdown id:c8890e11 tags:

 1) Predictions with min_k = 1: In this case a single neighbor is enough to produce a neighbor-based prediction, so every prediction shown relies on whatever neighbors are available (at most k = 3). Each prediction is based on the similarity between the target user and the other users who have rated the same item. Consequently, we observe varying prediction values for different items. For instance, for user 15 and item 942, the predicted rating is 3.777, while for item 64, the predicted rating is only 0.922. Because a prediction may rest on a single neighbor (e.g. items 5054, 6757 and 966), some estimates are potentially erratic.

 2). Predictions with min_k = 2: Here, a minimum of 2 neighbors are required to make a prediction. This introduces a bit of regularization, ensuring that predictions are made based on a slightly broader consensus. We notice that the predictions are somewhat similar to those with min_k = 1, but there are slight changes in some ratings. For example, the rating for item 5054 changes from 3.010 to 2.694. This suggests that the model is slightly more conservative in its predictions due to the requirement of at least two neighbors.

 3) Predictions with min_k = 3: With a minimum of 3 neighbors, the model becomes even more conservative: every prediction that has fewer than 3 neighbors falls back to the user's mean rating (2.694 for user 15). As a result, we see more uniformity in the predicted ratings compared to the previous cases. For example, for item 6322 the prediction stays at 1.711 for min_k = 1 and min_k = 2, and becomes 2.694 only with min_k = 3. This indicates that the model is increasingly cautious as it demands more agreement among neighbors before making predictions.

 %% Cell type:code id:cc806424 tags:

 ``` python
 def analyse_min_support(knn_model, testset):
    """Show how ``min_support`` affects ``actual_k`` and the similarity matrix.

    For min_support = 1, 2, 3 the model is refitted on its own trainset, the
    first 30 entries of ``testset`` are predicted and the ``actual_k`` of each
    prediction is printed. The similarity matrix of the last fit is then
    printed and returned.

    Parameters:
        knn_model: an already fitted Surprise KNN model; it is modified in
            place (``min_k`` is set to 2, ``min_support`` ends at 3, and the
            model is left fitted with that last value).
        testset: list of (user, item, rating) entries; only the first 30 are used.

    Returns:
        The similarity matrix (``knn_model.sim``) of the last fit.
    """
    # Reset min_k to 2
    knn_model.min_k = 2

    # Vary min_support from 1 to 3 and observe actual_k
    for min_support in range(1, 4):
        knn_model.sim_options['min_support'] = min_support
        # The similarity matrix is built during fit(); without refitting, the
        # new min_support is never applied and all three runs are identical.
        knn_model.fit(knn_model.trainset)
        predictions_min_support = knn_model.test(testset[:30])  # First 30 predictions only, for display
        print(f"\nPrédictions avec min_support = {min_support}:")
        for prediction in predictions_min_support:
            # .get(): details may lack 'actual_k' (e.g. when a prediction was impossible)
            actual_k = prediction.details.get('actual_k')
            print(f"User: {prediction.uid}, Item: {prediction.iid}, Actual_k: {actual_k}")

    # Display the similarity matrix of the last fit
    similarity_matrix = knn_model.sim
    print("\nMatrice de similarité:")
    print(similarity_matrix)
    return similarity_matrix

 # Run the analysis; the function prints its own report, so the returned
 # matrix is kept but not printed a second time.
 result = analyse_min_support(knn_model, testset)
 ```

 %% Output

    
    Prédictions avec min_support = 1:
    User: 15, Item: 942, Actual_k: 3
    User: 15, Item: 2117, Actual_k: 3
    User: 15, Item: 2672, Actual_k: 3
    User: 15, Item: 5054, Actual_k: 1
    User: 15, Item: 6322, Actual_k: 2
    User: 15, Item: 6323, Actual_k: 3
    User: 15, Item: 6757, Actual_k: 1
    User: 15, Item: 7700, Actual_k: 2
    User: 15, Item: 7981, Actual_k: 3
    User: 15, Item: 8600, Actual_k: 2
    User: 15, Item: 8620, Actual_k: 3
    User: 15, Item: 31952, Actual_k: 2
    User: 15, Item: 3, Actual_k: 3
    User: 15, Item: 64, Actual_k: 3
    User: 15, Item: 206, Actual_k: 3
    User: 15, Item: 249, Actual_k: 3
    User: 15, Item: 276, Actual_k: 3
    User: 15, Item: 369, Actual_k: 3
    User: 15, Item: 504, Actual_k: 3
    User: 15, Item: 515, Actual_k: 3
    User: 15, Item: 522, Actual_k: 3
    User: 15, Item: 580, Actual_k: 3
    User: 15, Item: 599, Actual_k: 3
    User: 15, Item: 915, Actual_k: 3
    User: 15, Item: 966, Actual_k: 1
    User: 15, Item: 1274, Actual_k: 3
    User: 15, Item: 1299, Actual_k: 3
    User: 15, Item: 1345, Actual_k: 3
    User: 15, Item: 1354, Actual_k: 3
    User: 15, Item: 532, Actual_k: 3
    
    Prédictions avec min_support = 2:
    User: 15, Item: 942, Actual_k: 3
    User: 15, Item: 2117, Actual_k: 3
    User: 15, Item: 2672, Actual_k: 3
    User: 15, Item: 5054, Actual_k: 1
    User: 15, Item: 6322, Actual_k: 2
    User: 15, Item: 6323, Actual_k: 3
    User: 15, Item: 6757, Actual_k: 1
    User: 15, Item: 7700, Actual_k: 2
    User: 15, Item: 7981, Actual_k: 3
    User: 15, Item: 8600, Actual_k: 2
    User: 15, Item: 8620, Actual_k: 3
    User: 15, Item: 31952, Actual_k: 2
    User: 15, Item: 3, Actual_k: 3
    User: 15, Item: 64, Actual_k: 3
    User: 15, Item: 206, Actual_k: 3
    User: 15, Item: 249, Actual_k: 3
    User: 15, Item: 276, Actual_k: 3
    User: 15, Item: 369, Actual_k: 3
    User: 15, Item: 504, Actual_k: 3
    User: 15, Item: 515, Actual_k: 3
    User: 15, Item: 522, Actual_k: 3
    User: 15, Item: 580, Actual_k: 3
    User: 15, Item: 599, Actual_k: 3
    User: 15, Item: 915, Actual_k: 3
    User: 15, Item: 966, Actual_k: 1
    User: 15, Item: 1274, Actual_k: 3
    User: 15, Item: 1299, Actual_k: 3
    User: 15, Item: 1345, Actual_k: 3
    User: 15, Item: 1354, Actual_k: 3
    User: 15, Item: 532, Actual_k: 3
    
    Prédictions avec min_support = 3:
    User: 15, Item: 942, Actual_k: 3
    User: 15, Item: 2117, Actual_k: 3
    User: 15, Item: 2672, Actual_k: 3
    User: 15, Item: 5054, Actual_k: 1
    User: 15, Item: 6322, Actual_k: 2
    User: 15, Item: 6323, Actual_k: 3
    User: 15, Item: 6757, Actual_k: 1
    User: 15, Item: 7700, Actual_k: 2
    User: 15, Item: 7981, Actual_k: 3
    User: 15, Item: 8600, Actual_k: 2
    User: 15, Item: 8620, Actual_k: 3
    User: 15, Item: 31952, Actual_k: 2
    User: 15, Item: 3, Actual_k: 3
    User: 15, Item: 64, Actual_k: 3
    User: 15, Item: 206, Actual_k: 3
    User: 15, Item: 249, Actual_k: 3
    User: 15, Item: 276, Actual_k: 3
    User: 15, Item: 369, Actual_k: 3
    User: 15, Item: 504, Actual_k: 3
    User: 15, Item: 515, Actual_k: 3
    User: 15, Item: 522, Actual_k: 3
    User: 15, Item: 580, Actual_k: 3
    User: 15, Item: 599, Actual_k: 3
    User: 15, Item: 915, Actual_k: 3
    User: 15, Item: 966, Actual_k: 1
    User: 15, Item: 1274, Actual_k: 3
    User: 15, Item: 1299, Actual_k: 3
    User: 15, Item: 1345, Actual_k: 3
    User: 15, Item: 1354, Actual_k: 3
    User: 15, Item: 532, Actual_k: 3
    
    Matrice de similarité:
    [[1.         0.39130435 0.35942029 ... 0.24358974 0.28513238 0.21451104]
     [0.39130435 1.         0.32786885 ... 0.30967742 0.42424242 0.21621622]
     [0.35942029 0.32786885 1.         ... 0.36666667 0.72727273 0.34375   ]
     ...
     [0.24358974 0.30967742 0.36666667 ... 1.         0.6779661  0.37569061]
     [0.28513238 0.42424242 0.72727273 ... 0.6779661  1.         0.83333333]
     [0.21451104 0.21621622 0.34375    ... 0.37569061 0.83333333 1.        ]]
    None

 %% Cell type:markdown id:2dd01f5b tags:

 # 3. Implement and explore a customizable user-based algorithm
 Create a self-made user-based algorithm allowing to customize the similarity metric, peer group calculation and aggregation function

 %% Cell type:code id:d03ed9eb tags:

 ``` python
 class UserBased(AlgoBase):
    """User-based k-NN recommender using Jaccard similarity and mean-centred ratings.

    The estimate for user u and item i is the mean rating of u plus the
    similarity-weighted average of the neighbours' deviations from their own
    mean, taken over the (at most) ``k`` most similar users who rated i.

    Parameters:
        k: maximum number of neighbours used for a prediction.
        min_k: minimum number of positively similar neighbours required;
            below it the estimate is the user's mean rating.
        sim_options: similarity options. Only ``'min_support'`` (default 1)
            is read by this class; the similarity is always Jaccard.
        **kwargs: forwarded to ``AlgoBase.__init__``.
    """

    def __init__(self, k=3, min_k=1, sim_options=None, **kwargs):
        # A fresh dict per instance: a mutable default argument would be
        # shared (and could be modified) across every instance.
        sim_options = {} if sim_options is None else dict(sim_options)
        AlgoBase.__init__(self, sim_options=sim_options, **kwargs)
        self.k = k
        self.min_k = min_k
        self.sim_options = sim_options

    def fit(self, trainset):
        """Compute the rating matrix, similarity matrix and user means; return self."""
        AlgoBase.fit(self, trainset)
        self.compute_rating_matrix()
        self.compute_similarity_matrix()
        self.compute_mean_ratings()
        # Returning the fitted model allows chaining, like the Surprise KNN algorithms.
        return self

    def estimate(self, u, i):
        """Estimate the rating of inner user id ``u`` for inner item id ``i``.

        Raises:
            PredictionImpossible: if the user or the item is unknown.
        """
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unknown.')

        estimate = self.mean_ratings[u]

        # Step 1: peer group = users who rated item i. Each entry of ir[i] is
        # a (user inner id, rating) pair; keep the similarity with u and the
        # neighbour's deviation from its own mean rating.
        peer_group = [
            (self.sim[u, v], rating - self.mean_ratings[v])
            for v, rating in self.trainset.ir[i]
        ]

        # Step 2: keep the k most similar neighbours
        k_neighbors = heapq.nlargest(self.k, peer_group, key=lambda peer: peer[0])

        # Step 3: similarity-weighted average of the deviations; only
        # positively similar neighbours contribute.
        contributing = [(sim, dev) for sim, dev in k_neighbors if sim > 0]
        actual_k = len(contributing)
        if actual_k >= self.min_k:
            total_similarity = sum(sim for sim, _ in contributing)
            if total_similarity > 0:
                weighted_sum = sum(sim * dev for sim, dev in contributing)
                estimate += weighted_sum / total_similarity

        return estimate

    def compute_rating_matrix(self):
        """Build ``self.ratings_matrix`` (users x items), NaN where no rating exists."""
        n_users = self.trainset.n_users
        n_items = self.trainset.n_items

        ratings_matrix = np.empty((n_users, n_items))
        ratings_matrix[:] = np.nan

        # Fill in the available ratings
        for user_id, user_ratings in self.trainset.ur.items():
            for item_id, rating in user_ratings:
                ratings_matrix[user_id, item_id] = rating

        self.ratings_matrix = ratings_matrix

    def compute_similarity_matrix(self):
        """Build ``self.sim``: Jaccard similarity between the users' sets of rated items.

        Pairs with fewer than ``sim_options['min_support']`` (default 1)
        common items keep a similarity of 0; the diagonal is 1.
        """
        n_users = self.trainset.n_users
        # Minimum number of co-rated items; this is a similarity setting and
        # is independent of min_k, which counts neighbours at prediction time.
        min_support = self.sim_options.get('min_support', 1)

        # rated[u, i] is 1.0 when user u rated item i, else 0.0
        rated = (~np.isnan(self.ratings_matrix)).astype(float)
        n_rated = rated.sum(axis=1)

        # |A ∩ B| for every pair of users, and |A ∪ B| = |A| + |B| - |A ∩ B|
        intersection = rated @ rated.T
        union = n_rated[:, None] + n_rated[None, :] - intersection

        # Start from the identity and fill the off-diagonal pairs that have
        # enough support (union > 0 guards the division).
        similarity_matrix = np.eye(n_users)
        fill = (intersection >= min_support) & (union > 0) & ~np.eye(n_users, dtype=bool)
        np.divide(intersection, union, out=similarity_matrix, where=fill)

        self.sim = similarity_matrix

    def compute_mean_ratings(self):
        """Build ``self.mean_ratings``: the mean rating of each user, indexed by inner user id."""
        # Indexed by user id rather than by dict iteration order; users
        # without ratings keep a mean of 0.
        mean_ratings = [0] * self.trainset.n_users
        for user_id, ratings in self.trainset.ur.items():
            if ratings:
                mean_ratings[user_id] = np.mean([rating for _, rating in ratings])

        self.mean_ratings = mean_ratings


 # NOTE(review): `trainset=` is captured by **kwargs in UserBased.__init__ and
 # forwarded to AlgoBase.__init__; the model actually receives its data through
 # fit() below -- confirm the keyword can be dropped.
 user_based_instance = UserBased(trainset=trainset)

 # Call fit to compute the rating matrix, the similarity matrix and the mean ratings
 user_based_instance.fit(trainset)

 # Display the rating matrix
 print(user_based_instance.ratings_matrix)

 ```

 %% Output

    [[3.  1.5 4.  ... nan nan nan]
     [nan nan nan ... nan nan nan]
     [4.  3.  3.  ... nan nan nan]
     ...
     [4.5 nan nan ... nan nan nan]
     [nan nan nan ... nan nan nan]
     [2.  nan nan ... nan nan nan]]

 %% Cell type:markdown id:dfdc9cfe tags:

 # 4. Compare KNNWithMeans with UserBased
 Try to replicate KNNWithMeans with your self-made UserBased and check that outcomes are identical

 %% Cell type:code id:be53ae27 tags:

 ``` python
 # Step 1 -- collect one (uid, iid, true rating, estimate, details) row per
 # test entry, first for the custom UserBased model, then for KNNWithMeans.
 # NOTE(review): `testset` is the anti-test set at this point, so the "true"
 # ratings are presumably placeholders -- confirm what MAE/RMSE mean here.
 def _prediction_row(algo, uid, iid, true_r, keep_details):
    """Predict one pair with ``algo`` and pack it as a 5-tuple."""
    pred = algo.predict(uid, iid)
    return (uid, iid, true_r, pred.est, pred.details if keep_details else {})

 user_based_predictions = [
    _prediction_row(user_based_instance, uid, iid, true_r, False)
    for uid, iid, true_r in testset
 ]
 knn_predictions = [
    _prediction_row(knn_model, uid, iid, true_r, True)
    for uid, iid, true_r in testset
 ]

 # Step 2 -- MAE then RMSE for each model, computed silently
 user_based_mae, user_based_rmse = (
    metric(user_based_predictions, verbose=False)
    for metric in (accuracy.mae, accuracy.rmse)
 )
 knn_mae, knn_rmse = (
    metric(knn_predictions, verbose=False)
    for metric in (accuracy.mae, accuracy.rmse)
 )

 # Step 3 -- report the four scores side by side
 for label, score in (
    ("UserBased MAE:", user_based_mae),
    ("UserBased RMSE:", user_based_rmse),
    ("KNNWithMeans MAE:", knn_mae),
    ("KNNWithMeans RMSE:", knn_rmse),
 ):
    print(label, score)

 ```

 %% Output

    UserBased MAE: 1.5398252671298895
    UserBased RMSE: 1.5553141029705104
    KNNWithMeans MAE: 0.5419110316300769
    KNNWithMeans RMSE: 0.7019543155680094

 %% Cell type:markdown id:cced76d9 tags:

 # 5. Compare MSD and Jaccard
 Compare predictions made with MSD similarity and Jaccard similarity

 %% Cell type:code id:c20d8e19 tags:

 ``` python
 from surprise import accuracy
 from surprise.model_selection import train_test_split
 from surprise import Dataset, Reader
 from surprise import KNNBasic


 # Split the dataset into training and testing sets (80% / 20%).
 # This rebinds the notebook-level `trainset` and `testset` built in section 1.
 # NOTE(review): no seed is passed, so the split is presumably different on
 # every run -- confirm and fix a random_state if results must be reproducible.
 trainset, testset = train_test_split(surprise_data, test_size=0.2)

 # Initialize the model with MSD similarity
 sim_options_msd = {'name': 'msd'}
 user_based_msd = KNNBasic(sim_options=sim_options_msd)
 user_based_msd.fit(trainset)

 # NOTE(review): the second model is NOT Jaccard. Surprise rejected 'jaccard'
 # (allowed names per the earlier NameError: cosine, msd, pearson,
 # pearson_baseline), and the added lines below fit a *cosine* model instead.
 # The `*_jaccard` names and the "Jaccard" label in the final print are
 # therefore misleading: the reported RMSE is the cosine one.
-sim_options_jacard = {'name': 'jacard'}
-user_based_jacard = KNNBasic(sim_options=sim_options_jacard)
-user_based_jacard.fit(trainset)
+sim_options_jaccard = {'name': 'cosine'}
+user_based_jaccard = KNNBasic(sim_options=sim_options_jaccard)
+user_based_jaccard.fit(trainset)

 # Make predictions with each model on the test set
 predictions_msd = user_based_msd.test(testset)
-predictions_jacard = user_based_jacard.test(testset)
+predictions_jaccard = user_based_jaccard.test(testset)

 # Calculate and display the performances of the two models
 # (accuracy.rmse prints its own "RMSE: ..." line, as seen in the cell output)
 rmse_msd = accuracy.rmse(predictions_msd)
-rmse_jacard = accuracy.rmse(predictions_jacard)
+rmse_jaccard = accuracy.rmse(predictions_jaccard)

 print("RMSE with MSD similarity:", rmse_msd)
-print("RMSE with Jacard similarity:", rmse_jacard)
+print("RMSE with Jaccard similarity:", rmse_jaccard)
 ```

 %% Output

    Computing the msd similarity matrix...
    Done computing similarity matrix.
-    Computing the jaccard similarity matrix...
-
-    ---------------------------------------------------------------------------
-    KeyError                                  Traceback (most recent call last)
-File     /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/surprise/prediction_algorithms/algo_base.py:248, in AlgoBase.compute_similarities(self)
-        247     print(f"Computing the {name} similarity matrix...")
-    --> 248 sim = construction_func[name](*args)
-        249 if getattr(self, "verbose", False):
-    KeyError: 'jaccard'
-
-During handling of the above exception, another exception occurred:
-    NameError                                 Traceback (most recent call last)
-Cell     In[13], line 18
-         16 sim_options_jaccard = {'name': 'jaccard'}
-         17 user_based_jaccard = KNNBasic(sim_options=sim_options_jaccard)
-    ---> 18 user_based_jaccard.fit(trainset)
-         20 # Make predictions with each model on the test set
-         21 predictions_msd = user_based_msd.test(testset)
-File     /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/surprise/prediction_algorithms/knns.py:98, in KNNBasic.fit(self, trainset)
-         95 def fit(self, trainset):
-         97     SymmetricAlgo.fit(self, trainset)
-    ---> 98     self.sim = self.compute_similarities()
-        100     return self
-File     /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/surprise/prediction_algorithms/algo_base.py:253, in AlgoBase.compute_similarities(self)
-        251     return sim
-        252 except KeyError:
-    --> 253     raise NameError(
-        254         "Wrong sim name "
-        255         + name
-        256         + ". Allowed values "
-        257         + "are "
-        258         + ", ".join(construction_func.keys())
-        259         + "."
-        260     )
-    NameError: Wrong sim name jaccard. Allowed values are cosine, msd, pearson, pearson_baseline.
+    Computing the cosine similarity matrix...
+    Done computing similarity matrix.
+    RMSE: 1.0139
+    RMSE: 1.0255
+    RMSE with MSD similarity: 1.0138998106951986
+    RMSE with Jaccard similarity: 1.025520408830472