From a442a7bf14742a5f59fccfbe89de8252704a3368 Mon Sep 17 00:00:00 2001 From: Adrienucl <adrien.payen@student.uclouvain.be> Date: Tue, 30 Apr 2024 11:34:44 +0200 Subject: [PATCH] update Analytics --- analytics_small.ipynb | 32 +- analytics_tiny.ipynb | 30 +- configs.py | 32 -- evaluator.ipynb | 393 ----------------------- models.py | 83 ----- user_based.ipynb | 704 ------------------------------------------ 6 files changed, 29 insertions(+), 1245 deletions(-) delete mode 100644 configs.py delete mode 100644 evaluator.ipynb delete mode 100644 models.py delete mode 100644 user_based.ipynb diff --git a/analytics_small.ipynb b/analytics_small.ipynb index 305d69b4..b6f7494f 100644 --- a/analytics_small.ipynb +++ b/analytics_small.ipynb @@ -2,15 +2,13 @@ "cells": [ { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "The autoreload extension is already loaded. To reload it, use:\n", - " %reload_ext autoreload\n", "Display The Movies : \n" ] }, @@ -313,7 +311,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -332,7 +330,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -357,7 +355,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -409,7 +407,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -428,7 +426,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -447,7 +445,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -467,7 +465,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -490,7 +488,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -513,7 +511,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -547,7 +545,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -577,7 +575,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -610,7 +608,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -627,7 +625,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -690,7 +688,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 15, "metadata": {}, "outputs": [ { diff --git a/analytics_tiny.ipynb b/analytics_tiny.ipynb index 00a7b117..cbd97046 100644 --- a/analytics_tiny.ipynb +++ b/analytics_tiny.ipynb @@ -2,15 +2,13 @@ "cells": [ { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "The autoreload extension is already loaded. 
To reload it, use:\n", - " %reload_ext autoreload\n", "Display The Movies : \n" ] }, @@ -322,7 +320,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -341,7 +339,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -366,7 +364,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -418,7 +416,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -437,7 +435,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -456,7 +454,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -476,7 +474,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -499,7 +497,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -522,7 +520,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -556,7 +554,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -586,7 +584,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -619,7 +617,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -682,7 +680,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 14, "metadata": {}, "outputs": [ { diff --git a/configs.py b/configs.py deleted file mode 100644 index c0a0002c..00000000 --- a/configs.py +++ /dev/null @@ -1,32 +0,0 @@ -# local imports -from models import * - - -class EvalConfig: - - """Configuration settings for evaluation.""" - - # List of models to evaluate, each tuple containing model_name, model class, and model parameters (dict) - - models = [ - ("baseline_1", ModelBaseline1, {}), - ("baseline_2", ModelBaseline2, {}), - ("baseline_3", ModelBaseline3, {}), - ("baseline_4", ModelBaseline4, {}) - # model_name, model class, model parameters (dict) - ] - - # Metrics to compute for split evaluation - split_metrics = ["mae", "rmse"] - - # Metrics to compute for Leave-One-Out (LOO) evaluation - loo_metrics = ["hit_rate"] - - # Metrics to compute for full dataset evaluation - full_metrics = ["novelty"] - - # Split parameters - test_size = 0.25 # -- configure the test_size (from 0 to 1) -- - - # Loo parameters - top_n_value = 10 # -- configure the numer of recommendations (> 1) -- diff --git a/evaluator.ipynb b/evaluator.ipynb deleted file mode 100644 index 25f74719..00000000 --- a/evaluator.ipynb +++ /dev/null @@ -1,393 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "a665885b", - "metadata": {}, - "source": [ - "# Evaluator Module\n", - "The Evaluator module creates evaluation reports.\n", - "\n", - "Reports contain evaluation metrics depending on models specified in the evaluation config." 
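
For reference, the (model_name, model_class, params) tuples defined in the EvalConfig removed above are consumed by instantiating each model class with its parameter dict. A minimal sketch of that pattern, assuming the configs.py and models.py deleted by this patch are still importable (illustration only):

    # Instantiate each configured model; mirrors the loop in
    # create_evaluation_report below. Assumes the deleted configs.py/models.py.
    from configs import EvalConfig

    for model_name, model_class, params in EvalConfig.models:
        algo = model_class(**params)  # e.g. ModelBaseline4() wrapping surprise's SVD
        print(f"instantiated {model_name}; test_size={EvalConfig.test_size}")
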
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "6aaf9140", - "metadata": {}, - "outputs": [], - "source": [ - "# reloads modules automatically before entering the execution of code\n", - "%load_ext autoreload\n", - "%autoreload 2\n", - "\n", - "# third parties imports\n", - "import numpy as np \n", - "import pandas as pd\n", - "# -- add new imports here --\n", - "\n", - "# local imports\n", - "from configs import EvalConfig\n", - "from constants import Constant as C\n", - "from loaders import export_evaluation_report\n", - "from loaders import load_ratings\n", - "# -- add new imports here --\n", - "from surprise.model_selection import train_test_split\n", - "from surprise import accuracy\n", - "from surprise.model_selection import LeaveOneOut\n", - "from collections import Counter" - ] - }, - { - "cell_type": "markdown", - "id": "d47c24a4", - "metadata": {}, - "source": [ - "# 1. Model validation functions\n", - "Validation functions are a way to perform crossvalidation on recommender system models. " - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "d6d82188", - "metadata": {}, - "outputs": [], - "source": [ - "def generate_split_predictions(algo, ratings_dataset, eval_config):\n", - " \"\"\"Generate predictions on a random test set specified in eval_config\"\"\"\n", - " # -- implement the function generate_split_predictions --\n", - " \n", - " # Spliting the data into train and test sets\n", - " trainset, testset = train_test_split(ratings_dataset, test_size=eval_config.test_size)\n", - " # Training the algorithm on the train data set\n", - " algo.fit(trainset)\n", - " # Predict ratings for the testset\n", - " predictions = algo.test(testset)\n", - " return predictions\n", - "\n", - "\n", - "def generate_loo_top_n(algo, ratings_dataset, eval_config):\n", - " \"\"\"Generate top-n recommendations for each user on a random Leave-one-out split (LOO)\"\"\"\n", - " # -- implement the function generate_loo_top_n --\n", - " # Create a LeaveOneOut split\n", - " loo = LeaveOneOut(n_splits=1)\n", - " \n", - " for trainset, testset in loo.split(ratings_dataset):\n", - " algo.fit(trainset) # Train the algorithm on the training set\n", - " anti_testset = trainset.build_anti_testset() # Build the anti test-set\n", - " predictions = algo.test(anti_testset) # Get predictions on the anti test-set\n", - " top_n = {}\n", - " for uid, iid, _, est, _ in predictions:\n", - " if uid not in top_n:\n", - " top_n[uid] = []\n", - " top_n[uid].append((iid, est))\n", - " for uid, user_ratings in top_n.items():\n", - " user_ratings.sort(key=lambda x: x[1], reverse=True)\n", - " top_n[uid] = user_ratings[:eval_config.top_n_value] # Get top-N recommendations\n", - " anti_testset_top_n = top_n\n", - " return anti_testset_top_n, testset\n", - "\n", - "def generate_full_top_n(algo, ratings_dataset, eval_config):\n", - " \"\"\"Generate top-n recommendations for each user with full training set (LOO)\"\"\"\n", - " full_trainset = ratings_dataset.build_full_trainset() # Build the full training set\n", - " algo.fit(full_trainset) # Train the algorithm on the full training set\n", - " anti_testset = full_trainset.build_anti_testset() # Build the anti test-set\n", - " predictions = algo.test(anti_testset) # Get predictions on the anti test-set\n", - " top_n = {}\n", - " for uid, iid, _, est, _ in predictions:\n", - " if uid not in top_n:\n", - " top_n[uid] = []\n", - " top_n[uid].append((iid, est))\n", - " for uid, user_ratings in top_n.items():\n", - " user_ratings.sort(key=lambda x: 
x[1], reverse=True)\n", - " top_n[uid] = user_ratings[:eval_config.top_n_value] # Get top-N recommendations\n", - " anti_testset_top_n = top_n\n", - " return anti_testset_top_n\n", - "\n", - "def precomputed_information(movie_data):\n", - " \"\"\" Returns a dictionary that precomputes relevant information for evaluating in full mode\n", - " \n", - " Dictionary keys:\n", - " - precomputed_dict[\"item_to_rank\"] : contains a dictionary mapping movie ids to rankings\n", - " - (-- for your project, add other relevant information here -- )\n", - " \"\"\"\n", - " # Initialize an empty dictionary to store item_id to rank mapping\n", - " item_to_rank = {}\n", - " \n", - " # Calculate popularity rank for each movie\n", - " ratings_count = movie_data.groupby('movieId').size().sort_values(ascending=False)\n", - " \n", - " # Assign ranks to movies based on their popularity\n", - " for rank, (movie_id, _) in enumerate(ratings_count.items(), start=1):\n", - " item_to_rank[movie_id] = rank\n", - " \n", - " # Create the precomputed dictionary\n", - " precomputed_dict = {}\n", - " precomputed_dict[\"item_to_rank\"] = item_to_rank\n", - " \n", - " return precomputed_dict\n", - "\n", - "def create_evaluation_report(eval_config, sp_ratings, precomputed_dict, available_metrics):\n", - " \"\"\" Create a DataFrame evaluating various models on metrics specified in an evaluation config. \n", - " \"\"\"\n", - " evaluation_dict = {}\n", - " for model_name, model, arguments in eval_config.models:\n", - " print(f'Handling model {model_name}')\n", - " algo = model(**arguments)\n", - " evaluation_dict[model_name] = {}\n", - " \n", - " # Type 1 : split evaluations\n", - " if len(eval_config.split_metrics) > 0:\n", - " print('Training split predictions')\n", - " predictions = generate_split_predictions(algo, sp_ratings, eval_config)\n", - " for metric in eval_config.split_metrics:\n", - " print(f'- computing metric {metric}')\n", - " assert metric in available_metrics['split']\n", - " evaluation_function, parameters = available_metrics[\"split\"][metric]\n", - " evaluation_dict[model_name][metric] = evaluation_function(predictions, **parameters) \n", - " \n", - " # Type 2 : loo evaluations\n", - " if len(eval_config.loo_metrics) > 0:\n", - " print('Training loo predictions')\n", - " anti_testset_top_n, testset = generate_loo_top_n(algo, sp_ratings, eval_config)\n", - " for metric in eval_config.loo_metrics:\n", - " assert metric in available_metrics['loo']\n", - " evaluation_function, parameters = available_metrics[\"loo\"][metric]\n", - " evaluation_dict[model_name][metric] = evaluation_function(anti_testset_top_n, testset, **parameters)\n", - " \n", - " # Type 3 : full evaluations\n", - " if len(eval_config.full_metrics) > 0:\n", - " print('Training full predictions')\n", - " anti_testset_top_n = generate_full_top_n(algo, sp_ratings, eval_config)\n", - " for metric in eval_config.full_metrics:\n", - " assert metric in available_metrics['full']\n", - " evaluation_function, parameters = available_metrics[\"full\"][metric]\n", - " evaluation_dict[model_name][metric] = evaluation_function(\n", - " anti_testset_top_n,\n", - " **precomputed_dict,\n", - " **parameters\n", - " )\n", - " \n", - " return pd.DataFrame.from_dict(evaluation_dict).T" - ] - }, - { - "cell_type": "markdown", - "id": "f7e83d1d", - "metadata": {}, - "source": [ - "# 2. 
Evaluation metrics\n", - "Implement evaluation metrics for either rating predictions (split metrics) or for top-n recommendations (loo metric, full metric)" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "f1849e55", - "metadata": {}, - "outputs": [], - "source": [ - "def get_hit_rate(anti_testset_top_n, testset):\n", - " \"\"\"Compute the average hit over the users (loo metric)\n", - " \n", - " A hit (1) happens when the movie in the testset has been picked by the top-n recommender\n", - " A fail (0) happens when the movie in the testset has not been picked by the top-n recommender\n", - " \"\"\"\n", - " # -- implement the function get_hit_rate --\n", - "\n", - " hits = 0\n", - " total_users = len(testset)\n", - " for uid, true_iid, _ in testset:\n", - " if uid in anti_testset_top_n and true_iid in {iid for iid, _ in anti_testset_top_n[uid]}:\n", - " hits += 1\n", - " hit_rate = hits / total_users\n", - "\n", - " return hit_rate\n", - "\n", - "def get_novelty(anti_testset_top_n, item_to_rank):\n", - " \"\"\"Compute the average novelty of the top-n recommendation over the users (full metric)\n", - " \n", - " The novelty is defined as the average ranking of the movies recommended\n", - " \"\"\"\n", - " # -- implement the function get_novelty --\n", - " total_rank_sum = 0\n", - " total_recommendations = 0\n", - " for uid, recommendations in anti_testset_top_n.items():\n", - " for iid, _ in recommendations:\n", - " if iid in item_to_rank:\n", - " total_rank_sum += item_to_rank[iid]\n", - " total_recommendations += 1\n", - " if total_recommendations == 0:\n", - " return 0 # Avoid division by zero\n", - " average_rank_sum = total_rank_sum / total_recommendations \n", - " \n", - " return average_rank_sum" - ] - }, - { - "cell_type": "markdown", - "id": "1a9855b3", - "metadata": {}, - "source": [ - "# 3. 
Evaluation workflow\n", - "Load data, evaluate models and save the experimental outcomes" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "704f4d2a", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Handling model baseline_1\n", - "Training split predictions\n", - "- computing metric mae\n", - "- computing metric rmse\n", - "Training loo predictions\n", - "Training full predictions\n", - "Handling model baseline_2\n", - "Training split predictions\n", - "- computing metric mae\n", - "- computing metric rmse\n", - "Training loo predictions\n", - "Training full predictions\n", - "Handling model baseline_3\n", - "Training split predictions\n", - "- computing metric mae\n", - "- computing metric rmse\n", - "Training loo predictions\n", - "Training full predictions\n", - "Handling model baseline_4\n", - "Training split predictions\n", - "- computing metric mae\n", - "- computing metric rmse\n", - "Training loo predictions\n", - "Training full predictions\n", - "The data has been exported to the evaluation report\n" - ] - }, - { - "data": { - "text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - " .dataframe tbody tr th {\n", - " vertical-align: top;\n", - " }\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" class=\"dataframe\">\n", - " <thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>mae</th>\n", - " <th>rmse</th>\n", - " <th>hit_rate</th>\n", - " <th>novelty</th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th>baseline_1</th>\n", - " <td>1.567221</td>\n", - " <td>1.788369</td>\n", - " <td>0.074766</td>\n", - " <td>99.405607</td>\n", - " </tr>\n", - " <tr>\n", - " <th>baseline_2</th>\n", - " <td>1.502872</td>\n", - " <td>1.840696</td>\n", - " <td>0.056075</td>\n", - " <td>429.942991</td>\n", - " </tr>\n", - " <tr>\n", - " <th>baseline_3</th>\n", - " <td>0.873993</td>\n", - " <td>1.076982</td>\n", - " <td>0.065421</td>\n", - " <td>99.405607</td>\n", - " </tr>\n", - " <tr>\n", - " <th>baseline_4</th>\n", - " <td>0.730657</td>\n", - " <td>0.938814</td>\n", - " <td>0.186916</td>\n", - " <td>57.465421</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "</div>" - ], - "text/plain": [ - " mae rmse hit_rate novelty\n", - "baseline_1 1.567221 1.788369 0.074766 99.405607\n", - "baseline_2 1.502872 1.840696 0.056075 429.942991\n", - "baseline_3 0.873993 1.076982 0.065421 99.405607\n", - "baseline_4 0.730657 0.938814 0.186916 57.465421" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "AVAILABLE_METRICS = {\n", - " \"split\": {\n", - " \"mae\": (accuracy.mae, {'verbose': False}),\n", - " \"rmse\": (accuracy.rmse, {'verbose': False})\n", - " # Add new split metrics here if needed\n", - " },\n", - " \"loo\": {\n", - " \"hit_rate\": (get_hit_rate, {}),\n", - " # Add new loo metrics here if needed\n", - " },\n", - " \"full\": {\n", - " \"novelty\": (get_novelty, {}),\n", - " # Add new full metrics here if needed\n", - " }\n", - "}\n", - "\n", - "sp_ratings = load_ratings(surprise_format=True)\n", - "precomputed_dict = precomputed_information(pd.read_csv(\"data/tiny/evidence/ratings.csv\"))\n", - "evaluation_report = create_evaluation_report(EvalConfig, sp_ratings, precomputed_dict, AVAILABLE_METRICS)\n", - "export_evaluation_report(evaluation_report)" - ] - } 
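
To make the two top-n metrics above concrete, here is a tiny self-contained run of the hit-rate and novelty logic on hand-made data (the dictionaries below are invented for illustration and only mirror the shapes used in the notebook):

    top_n = {"15": [("942", 4.8), ("64", 4.5)]}   # uid -> top-n [(iid, est), ...]
    testset = [("15", "942", 5.0)]                # LOO testset: (uid, iid, true_r)
    item_to_rank = {"942": 1, "64": 120}          # popularity rank per movie

    # get_hit_rate: fraction of users whose left-out movie was recommended
    hits = sum(1 for uid, iid, _ in testset
               if iid in {i for i, _ in top_n.get(uid, [])})
    print(hits / len(testset))                    # -> 1.0

    # get_novelty: mean popularity rank of all recommended movies
    ranks = [item_to_rank[i]
             for recs in top_n.values() for i, _ in recs if i in item_to_rank]
    print(sum(ranks) / len(ranks))                # -> (1 + 120) / 2 = 60.5
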
- ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.2" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/models.py b/models.py deleted file mode 100644 index 18a45b25..00000000 --- a/models.py +++ /dev/null @@ -1,83 +0,0 @@ -# standard library imports -from collections import defaultdict - -# third parties imports -import numpy as np -import random as rd -from surprise import AlgoBase -from surprise import KNNWithMeans -from surprise import SVD - - -def get_top_n(predictions, n): - """Return the top-N recommendation for each user from a set of predictions. - Source: inspired by https://github.com/NicolasHug/Surprise/blob/master/examples/top_n_recommendations.py - and modified by cvandekerckh for random tie breaking - - Args: - predictions(list of Prediction objects): The list of predictions, as - returned by the test method of an algorithm. - n(int): The number of recommendation to output for each user. Default - is 10. - Returns: - A dict where keys are user (raw) ids and values are lists of tuples: - [(raw item id, rating estimation), ...] of size n. - """ - - rd.seed(0) - - # First map the predictions to each user. - top_n = defaultdict(list) - for uid, iid, true_r, est, _ in predictions: - top_n[uid].append((iid, est)) - - # Then sort the predictions for each user and retrieve the k highest ones. - for uid, user_ratings in top_n.items(): - rd.shuffle(user_ratings) - user_ratings.sort(key=lambda x: x[1], reverse=True) - top_n[uid] = user_ratings[:n] - - return top_n - - -# First algorithm -class ModelBaseline1(AlgoBase): - def __init__(self): - AlgoBase.__init__(self) - - def estimate(self, u, i): - return 2 - - -# Second algorithm -class ModelBaseline2(AlgoBase): - def __init__(self): - AlgoBase.__init__(self) - - def fit(self, trainset): - AlgoBase.fit(self, trainset) - rd.seed(0) - - def estimate(self, u, i): - return rd.uniform(self.trainset.rating_scale[0], self.trainset.rating_scale[1]) - - -# Third algorithm -class ModelBaseline3(AlgoBase): - def __init__(self): - AlgoBase.__init__(self) - - def fit(self, trainset): - AlgoBase.fit(self, trainset) - self.the_mean = np.mean([r for (_, _, r) in self.trainset.all_ratings()]) - - return self - - def estimate(self, u, i): - return self.the_mean - - -# Fourth Model -class ModelBaseline4(SVD): - def __init__(self): - SVD.__init__(self, n_factors=100) diff --git a/user_based.ipynb b/user_based.ipynb deleted file mode 100644 index a1135883..00000000 --- a/user_based.ipynb +++ /dev/null @@ -1,704 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "f4a8f664", - "metadata": {}, - "source": [ - "# Custom User-based Model\n", - "The present notebooks aims at creating a UserBased class that inherits from the Algobase class (surprise package) and that can be customized with various similarity metrics, peer groups and score aggregation functions. 
" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "00d1b249", - "metadata": {}, - "outputs": [], - "source": [ - "# reloads modules automatically before entering the execution of code\n", - "%load_ext autoreload\n", - "%autoreload 2\n", - "\n", - "# standard library imports\n", - "# -- add new imports here --\n", - "\n", - "# third parties imports\n", - "import numpy as np \n", - "import pandas as pd\n", - "# -- add new imports here --\n", - "\n", - "# local imports\n", - "from constants import Constant as C\n", - "from loaders import load_ratings,load_items \n", - "from surprise import KNNWithMeans, accuracy, AlgoBase, PredictionImpossible\n", - "\n", - "import heapq" - ] - }, - { - "cell_type": "markdown", - "id": "22716aa3", - "metadata": {}, - "source": [ - "# 1. Loading Data\n", - "Prepare a dataset in order to help implementing a user-based recommender system" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "cf3ccdc0", - "metadata": {}, - "outputs": [], - "source": [ - "# -- load data, build trainset and anti testset --\n", - "# it depends on the tiny dataset\n", - "surprise_data = load_ratings(surprise_format=True)\n", - "df_movies = load_items()\n", - "\n", - "# Assuming you have a pandas DataFrame named 'df' with columns ['user_id', 'item_id', 'rating']\n", - "\n", - "# Build train set with all available ratings\n", - "trainset = surprise_data.build_full_trainset()\n", - "\n", - "# Build anti-test set\n", - "testset = trainset.build_anti_testset()" - ] - }, - { - "cell_type": "markdown", - "id": "94adf3a6", - "metadata": {}, - "source": [ - "# 2. Explore Surprise's user-based algorithm\n", - "Displays user-based predictions and similarity matrix on the test dataset using the KNNWithMeans class" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "e6fb78b7", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Computing the msd similarity matrix...\n", - "Done computing similarity matrix.\n", - "3.4190898791540785\n" - ] - } - ], - "source": [ - "# -- using surprise's user-based algorithm, explore the impact of different parameters and displays predictions --\n", - "\n", - "# Define the similarity options\n", - "sim_options = {\n", - " 'name': 'msd', # Mean Squared Difference (Mean Square Error)\n", - " 'user_based': True, # User-based collaborative filtering\n", - " 'min_support': 3 # Minimum number of common ratings required\n", - "}\n", - "\n", - "# Create an instance of KNNWithMeans with the specified options\n", - "knn_model = KNNWithMeans(k=3, min_k=2, sim_options=sim_options)\n", - "\n", - "# Train the algorithm on the trainset\n", - "knn_model.fit(trainset).test(testset)\n", - "\n", - "# Make an estimation for user 11 and item 364\n", - "prediction = knn_model.predict('11', '364')\n", - "print(prediction.est)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "ffe89c56", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Computing the msd similarity matrix...\n", - "Done computing similarity matrix.\n", - "Predictions with min_k = 1:\n", - "User: 15, Item: 942, Rating: 3.7769516356699464\n", - "User: 15, Item: 2117, Rating: 2.9340004894942537\n", - "User: 15, Item: 2672, Rating: 2.371008709611413\n", - "User: 15, Item: 5054, Rating: 3.010328638497653\n", - "User: 15, Item: 6322, Rating: 1.711175832857413\n", - "User: 15, Item: 6323, Rating: 1.7645762379992287\n", - "User: 15, Item: 6757, Rating: 
3.010328638497653\n", - "User: 15, Item: 7700, Rating: 3.561484741491386\n", - "User: 15, Item: 7981, Rating: 3.386000174210522\n", - "User: 15, Item: 8600, Rating: 3.320743223639117\n", - "User: 15, Item: 8620, Rating: 2.7538763809343654\n", - "User: 15, Item: 31952, Rating: 3.7409900837647396\n", - "User: 15, Item: 3, Rating: 2.222062601579949\n", - "User: 15, Item: 64, Rating: 0.9224387353614938\n", - "User: 15, Item: 206, Rating: 2.35668733389394\n", - "User: 15, Item: 249, Rating: 3.1290259851652826\n", - "User: 15, Item: 276, Rating: 2.1800017354806753\n", - "User: 15, Item: 369, Rating: 2.3082373858282694\n", - "User: 15, Item: 504, Rating: 2.2600496220227573\n", - "User: 15, Item: 515, Rating: 3.6575674086958188\n", - "User: 15, Item: 522, Rating: 2.4562020809509626\n", - "User: 15, Item: 580, Rating: 1.9073310817298395\n", - "User: 15, Item: 599, Rating: 2.780847470837928\n", - "User: 15, Item: 915, Rating: 2.761094249104645\n", - "User: 15, Item: 966, Rating: 3.0894953051643195\n", - "User: 15, Item: 1274, Rating: 2.9873500196382845\n", - "User: 15, Item: 1299, Rating: 3.0779327239728005\n", - "User: 15, Item: 1345, Rating: 2.2037629856623138\n", - "User: 15, Item: 1354, Rating: 2.001877412379849\n", - "User: 15, Item: 532, Rating: 2.7123071345260277\n", - "Computing the msd similarity matrix...\n", - "Done computing similarity matrix.\n", - "Predictions with min_k = 2:\n", - "User: 15, Item: 942, Rating: 3.7769516356699464\n", - "User: 15, Item: 2117, Rating: 2.9340004894942537\n", - "User: 15, Item: 2672, Rating: 2.371008709611413\n", - "User: 15, Item: 5054, Rating: 2.693661971830986\n", - "User: 15, Item: 6322, Rating: 1.711175832857413\n", - "User: 15, Item: 6323, Rating: 1.7645762379992287\n", - "User: 15, Item: 6757, Rating: 2.693661971830986\n", - "User: 15, Item: 7700, Rating: 3.561484741491386\n", - "User: 15, Item: 7981, Rating: 3.386000174210522\n", - "User: 15, Item: 8600, Rating: 3.320743223639117\n", - "User: 15, Item: 8620, Rating: 2.7538763809343654\n", - "User: 15, Item: 31952, Rating: 3.7409900837647396\n", - "User: 15, Item: 3, Rating: 2.222062601579949\n", - "User: 15, Item: 64, Rating: 0.9224387353614938\n", - "User: 15, Item: 206, Rating: 2.35668733389394\n", - "User: 15, Item: 249, Rating: 3.1290259851652826\n", - "User: 15, Item: 276, Rating: 2.1800017354806753\n", - "User: 15, Item: 369, Rating: 2.3082373858282694\n", - "User: 15, Item: 504, Rating: 2.2600496220227573\n", - "User: 15, Item: 515, Rating: 3.6575674086958188\n", - "User: 15, Item: 522, Rating: 2.4562020809509626\n", - "User: 15, Item: 580, Rating: 1.9073310817298395\n", - "User: 15, Item: 599, Rating: 2.780847470837928\n", - "User: 15, Item: 915, Rating: 2.761094249104645\n", - "User: 15, Item: 966, Rating: 2.693661971830986\n", - "User: 15, Item: 1274, Rating: 2.9873500196382845\n", - "User: 15, Item: 1299, Rating: 3.0779327239728005\n", - "User: 15, Item: 1345, Rating: 2.2037629856623138\n", - "User: 15, Item: 1354, Rating: 2.001877412379849\n", - "User: 15, Item: 532, Rating: 2.7123071345260277\n", - "Computing the msd similarity matrix...\n", - "Done computing similarity matrix.\n", - "Predictions with min_k = 3:\n", - "User: 15, Item: 942, Rating: 3.7769516356699464\n", - "User: 15, Item: 2117, Rating: 2.9340004894942537\n", - "User: 15, Item: 2672, Rating: 2.371008709611413\n", - "User: 15, Item: 5054, Rating: 2.693661971830986\n", - "User: 15, Item: 6322, Rating: 2.693661971830986\n", - "User: 15, Item: 6323, Rating: 1.7645762379992287\n", - "User: 15, Item: 6757, Rating: 
2.693661971830986\n", - "User: 15, Item: 7700, Rating: 2.693661971830986\n", - "User: 15, Item: 7981, Rating: 3.386000174210522\n", - "User: 15, Item: 8600, Rating: 2.693661971830986\n", - "User: 15, Item: 8620, Rating: 2.7538763809343654\n", - "User: 15, Item: 31952, Rating: 2.693661971830986\n", - "User: 15, Item: 3, Rating: 2.222062601579949\n", - "User: 15, Item: 64, Rating: 0.9224387353614938\n", - "User: 15, Item: 206, Rating: 2.35668733389394\n", - "User: 15, Item: 249, Rating: 3.1290259851652826\n", - "User: 15, Item: 276, Rating: 2.1800017354806753\n", - "User: 15, Item: 369, Rating: 2.3082373858282694\n", - "User: 15, Item: 504, Rating: 2.2600496220227573\n", - "User: 15, Item: 515, Rating: 3.6575674086958188\n", - "User: 15, Item: 522, Rating: 2.4562020809509626\n", - "User: 15, Item: 580, Rating: 1.9073310817298395\n", - "User: 15, Item: 599, Rating: 2.780847470837928\n", - "User: 15, Item: 915, Rating: 2.761094249104645\n", - "User: 15, Item: 966, Rating: 2.693661971830986\n", - "User: 15, Item: 1274, Rating: 2.9873500196382845\n", - "User: 15, Item: 1299, Rating: 3.0779327239728005\n", - "User: 15, Item: 1345, Rating: 2.2037629856623138\n", - "User: 15, Item: 1354, Rating: 2.001877412379849\n", - "User: 15, Item: 532, Rating: 2.7123071345260277\n" - ] - } - ], - "source": [ - "# Playing with KNN\n", - "\n", - "# Define the similarity options\n", - "sim_options = {\n", - " 'name': 'msd', # Mean Squared Difference (Mean Square Error)\n", - " 'user_based': True, # User-based collaborative filtering\n", - " 'min_support': 3 # Minimum number of common ratings required. This data is\n", - "}\n", - "\n", - "# Create an instance of KNNWithMeans with the specified options\n", - "def predict_ratings(trainset, testset, min_k_values):\n", - " for min_k in min_k_values:\n", - " knn_model = KNNWithMeans(sim_options=sim_options, k=3, min_k=min_k)\n", - " # Train the algorithm on the trainset\n", - " knn_model.fit(trainset)\n", - "\n", - " # Make predictions for all ratings in the anti testset\n", - " predictions = knn_model.test(testset)\n", - "\n", - " # Display 30 predictions\n", - " print(f\"Predictions with min_k = {min_k}:\")\n", - " for prediction in predictions[:30]:\n", - " print(f\"User: {prediction.uid}, Item: {prediction.iid}, Rating: {prediction.est}\")\n", - "\n", - "# Assuming trainset and testset are already defined\n", - "predict_ratings(trainset, testset, min_k_values=[1, 2, 3])" - ] - }, - { - "cell_type": "markdown", - "id": "c5209097", - "metadata": {}, - "source": [ - "Quelque soit les neighbours (1,2,3) la valeur du ratings ne change pas " - ] - }, - { - "cell_type": "markdown", - "id": "c8890e11", - "metadata": {}, - "source": [ - "1).Predictions with min_k = 1: In this case, the model makes predictions without considering any minimum number of neighbors. Each prediction is made solely based on the similarity between the target user and other users who have rated the same items. Consequently, we observe varying prediction values for different items. For instance, for user 15 and item 942, the predicted rating is 3.777, while for item 64, the predicted rating is only 0.922. This indicates that the model heavily relies on the ratings from users who may have rated only a single item in common with the target user, leading to potentially erratic predictions.\n", - "\n", - "2). Predictions with min_k = 2: Here, a minimum of 2 neighbors are required to make a prediction. 
This introduces a bit of regularization, ensuring that predictions are made based on a slightly broader consensus. We notice that the predictions are somewhat similar to those with min_k = 1, but there are slight changes in some ratings. For example, the rating for item 5054 changes from 3.010 to 2.694. This suggests that the model is slightly more conservative in its predictions due to the requirement of at least two neighbors.\n", - "\n", - "3). Predictions with min_k = 3: With a minimum of 3 neighbors, the model becomes even more conservative. It requires a stronger consensus among users before making predictions. As a result, we see more uniformity in the predicted ratings compared to the previous cases. For example, for item 6322, the prediction changes from 1.711 (min_k = 1) to 2.694 (min_k = 2) and finally to 2.694 again (min_k = 3). This indicates that the model is increasingly cautious as it demands more agreement among neighbors before making predictions" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "cc806424", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Prédictions avec min_support = 1:\n", - "User: 15, Item: 942, Actual_k: 3\n", - "User: 15, Item: 2117, Actual_k: 3\n", - "User: 15, Item: 2672, Actual_k: 3\n", - "User: 15, Item: 5054, Actual_k: 1\n", - "User: 15, Item: 6322, Actual_k: 2\n", - "User: 15, Item: 6323, Actual_k: 3\n", - "User: 15, Item: 6757, Actual_k: 1\n", - "User: 15, Item: 7700, Actual_k: 2\n", - "User: 15, Item: 7981, Actual_k: 3\n", - "User: 15, Item: 8600, Actual_k: 2\n", - "User: 15, Item: 8620, Actual_k: 3\n", - "User: 15, Item: 31952, Actual_k: 2\n", - "User: 15, Item: 3, Actual_k: 3\n", - "User: 15, Item: 64, Actual_k: 3\n", - "User: 15, Item: 206, Actual_k: 3\n", - "User: 15, Item: 249, Actual_k: 3\n", - "User: 15, Item: 276, Actual_k: 3\n", - "User: 15, Item: 369, Actual_k: 3\n", - "User: 15, Item: 504, Actual_k: 3\n", - "User: 15, Item: 515, Actual_k: 3\n", - "User: 15, Item: 522, Actual_k: 3\n", - "User: 15, Item: 580, Actual_k: 3\n", - "User: 15, Item: 599, Actual_k: 3\n", - "User: 15, Item: 915, Actual_k: 3\n", - "User: 15, Item: 966, Actual_k: 1\n", - "User: 15, Item: 1274, Actual_k: 3\n", - "User: 15, Item: 1299, Actual_k: 3\n", - "User: 15, Item: 1345, Actual_k: 3\n", - "User: 15, Item: 1354, Actual_k: 3\n", - "User: 15, Item: 532, Actual_k: 3\n", - "\n", - "Prédictions avec min_support = 2:\n", - "User: 15, Item: 942, Actual_k: 3\n", - "User: 15, Item: 2117, Actual_k: 3\n", - "User: 15, Item: 2672, Actual_k: 3\n", - "User: 15, Item: 5054, Actual_k: 1\n", - "User: 15, Item: 6322, Actual_k: 2\n", - "User: 15, Item: 6323, Actual_k: 3\n", - "User: 15, Item: 6757, Actual_k: 1\n", - "User: 15, Item: 7700, Actual_k: 2\n", - "User: 15, Item: 7981, Actual_k: 3\n", - "User: 15, Item: 8600, Actual_k: 2\n", - "User: 15, Item: 8620, Actual_k: 3\n", - "User: 15, Item: 31952, Actual_k: 2\n", - "User: 15, Item: 3, Actual_k: 3\n", - "User: 15, Item: 64, Actual_k: 3\n", - "User: 15, Item: 206, Actual_k: 3\n", - "User: 15, Item: 249, Actual_k: 3\n", - "User: 15, Item: 276, Actual_k: 3\n", - "User: 15, Item: 369, Actual_k: 3\n", - "User: 15, Item: 504, Actual_k: 3\n", - "User: 15, Item: 515, Actual_k: 3\n", - "User: 15, Item: 522, Actual_k: 3\n", - "User: 15, Item: 580, Actual_k: 3\n", - "User: 15, Item: 599, Actual_k: 3\n", - "User: 15, Item: 915, Actual_k: 3\n", - "User: 15, Item: 966, Actual_k: 1\n", - "User: 15, Item: 1274, Actual_k: 3\n", - "User: 15, Item: 1299, Actual_k: 
3\n", - "User: 15, Item: 1345, Actual_k: 3\n", - "User: 15, Item: 1354, Actual_k: 3\n", - "User: 15, Item: 532, Actual_k: 3\n", - "\n", - "Prédictions avec min_support = 3:\n", - "User: 15, Item: 942, Actual_k: 3\n", - "User: 15, Item: 2117, Actual_k: 3\n", - "User: 15, Item: 2672, Actual_k: 3\n", - "User: 15, Item: 5054, Actual_k: 1\n", - "User: 15, Item: 6322, Actual_k: 2\n", - "User: 15, Item: 6323, Actual_k: 3\n", - "User: 15, Item: 6757, Actual_k: 1\n", - "User: 15, Item: 7700, Actual_k: 2\n", - "User: 15, Item: 7981, Actual_k: 3\n", - "User: 15, Item: 8600, Actual_k: 2\n", - "User: 15, Item: 8620, Actual_k: 3\n", - "User: 15, Item: 31952, Actual_k: 2\n", - "User: 15, Item: 3, Actual_k: 3\n", - "User: 15, Item: 64, Actual_k: 3\n", - "User: 15, Item: 206, Actual_k: 3\n", - "User: 15, Item: 249, Actual_k: 3\n", - "User: 15, Item: 276, Actual_k: 3\n", - "User: 15, Item: 369, Actual_k: 3\n", - "User: 15, Item: 504, Actual_k: 3\n", - "User: 15, Item: 515, Actual_k: 3\n", - "User: 15, Item: 522, Actual_k: 3\n", - "User: 15, Item: 580, Actual_k: 3\n", - "User: 15, Item: 599, Actual_k: 3\n", - "User: 15, Item: 915, Actual_k: 3\n", - "User: 15, Item: 966, Actual_k: 1\n", - "User: 15, Item: 1274, Actual_k: 3\n", - "User: 15, Item: 1299, Actual_k: 3\n", - "User: 15, Item: 1345, Actual_k: 3\n", - "User: 15, Item: 1354, Actual_k: 3\n", - "User: 15, Item: 532, Actual_k: 3\n", - "\n", - "Matrice de similarité:\n", - "[[1. 0.39130435 0.35942029 ... 0.24358974 0.28513238 0.21451104]\n", - " [0.39130435 1. 0.32786885 ... 0.30967742 0.42424242 0.21621622]\n", - " [0.35942029 0.32786885 1. ... 0.36666667 0.72727273 0.34375 ]\n", - " ...\n", - " [0.24358974 0.30967742 0.36666667 ... 1. 0.6779661 0.37569061]\n", - " [0.28513238 0.42424242 0.72727273 ... 0.6779661 1. 0.83333333]\n", - " [0.21451104 0.21621622 0.34375 ... 0.37569061 0.83333333 1. ]]\n", - "None\n" - ] - } - ], - "source": [ - "def analyse_min_support(knn_model, testset):\n", - " # Rétablir min_k à 2\n", - " knn_model.min_k = 2\n", - "\n", - " # Modifier min_support de 1 à 3 et observer actual_k\n", - " for min_support in range(1, 4):\n", - " knn_model.sim_options['min_support'] = min_support\n", - " predictions_min_support = knn_model.test(testset[:30]) # Prendre les 30 premières prédictions pour l'affichage\n", - " print(f\"\\nPrédictions avec min_support = {min_support}:\")\n", - " for prediction in predictions_min_support:\n", - " actual_k = prediction.details['actual_k']\n", - " print(f\"User: {prediction.uid}, Item: {prediction.iid}, Actual_k: {actual_k}\")\n", - "\n", - " # Visualiser la matrice de similarité\n", - " similarity_matrix = knn_model.sim # Algorithme de knn_model\n", - " print(\"\\nMatrice de similarité:\")\n", - " print(similarity_matrix)\n", - "\n", - "# Appel de la fonction et impression de l'analyse\n", - "result = analyse_min_support(knn_model, testset)\n", - "print(result)" - ] - }, - { - "cell_type": "markdown", - "id": "2dd01f5b", - "metadata": {}, - "source": [ - "# 3. Implement and explore a customizable user-based algorithm\n", - "Create a self-made user-based algorithm allowing to customize the similarity metric, peer group calculation and aggregation function" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "d03ed9eb", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[[3. 1.5 4. ... nan nan nan]\n", - " [nan nan nan ... nan nan nan]\n", - " [4. 3. 3. ... nan nan nan]\n", - " ...\n", - " [4.5 nan nan ... nan nan nan]\n", - " [nan nan nan ... 
nan nan nan]\n", - " [2. nan nan ... nan nan nan]]\n" - ] - } - ], - "source": [ - "class UserBased(AlgoBase):\n", - " def __init__(self, k=3, min_k=1, sim_options={}, **kwargs):\n", - " AlgoBase.__init__(self, sim_options=sim_options, **kwargs)\n", - " self.k = k\n", - " self.min_k = min_k\n", - " self.sim_options = sim_options\n", - "\n", - " \n", - " def fit(self, trainset):\n", - " AlgoBase.fit(self, trainset)\n", - " self.compute_rating_matrix()\n", - " self.compute_similarity_matrix()\n", - " self.compute_mean_ratings()\n", - " \n", - " def estimate(self, u, i):\n", - " if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):\n", - " raise PredictionImpossible('User and/or item is unknown.')\n", - "\n", - " estimate = self.mean_ratings[u]\n", - "\n", - " # Step 1: Create the peer group of user u for item i\n", - " peer_group = []\n", - " for j, rating in enumerate(self.trainset.ir[i]):\n", - " if rating is not None:\n", - " similarity = self.sim[u, j] # Similarity between user u and user j for item i\n", - " peer_group.append((j, similarity, rating))\n", - "\n", - " # Step 2: Pick up the top neighbors efficiently\n", - " k_neighbors = heapq.nlargest(self.min_k, peer_group, key=lambda x: x[1]) # Top k neighbors based on similarity\n", - "\n", - " # Step 3: Compute the weighted average\n", - " actual_k = len(k_neighbors)\n", - " if actual_k >= self.min_k:\n", - " weighted_sum = 0\n", - " total_similarity = 0\n", - " for j, similarity, rating_list in k_neighbors:\n", - " # Assuming rating_list is a list or array containing ratings\n", - " rating = rating_list[0] # Access the first element of the rating list\n", - " weighted_sum += similarity * rating\n", - " total_similarity += similarity\n", - "\n", - " if total_similarity != 0:\n", - " peer_group_average = weighted_sum / total_similarity\n", - " estimate += peer_group_average\n", - "\n", - " return estimate\n", - "\n", - " \n", - " def compute_rating_matrix(self):\n", - " # Get the number of users and items\n", - " n_users = self.trainset.n_users\n", - " n_items = self.trainset.n_items\n", - " \n", - " ratings_matrix = np.empty((n_users, n_items))\n", - " ratings_matrix[:] = np.nan\n", - "\n", - " # Fill in the ratings matrix with available ratings\n", - " for user_id, user_ratings in self.trainset.ur.items():\n", - " if user_ratings: # Check if user has ratings\n", - " for item_id, rating in user_ratings:\n", - " ratings_matrix[user_id, item_id] = rating\n", - " \n", - " # Set the computed ratings matrix to self.ratings_matrix\n", - " self.ratings_matrix = ratings_matrix\n", - " \n", - " def compute_similarity_matrix(self):\n", - " # Get the number of users\n", - " n_users = self.trainset.n_users\n", - " \n", - " # Initialize the similarity matrix with zeros and ones in the diagonal\n", - " similarity_matrix = np.eye(n_users)\n", - " \n", - " # Iterate through pairs of users to compute similarities\n", - " for i in range(n_users):\n", - " for j in range(i + 1, n_users):\n", - " # Compute support\n", - " support = np.sum(~np.isnan(self.ratings_matrix[i]) & ~np.isnan(self.ratings_matrix[j]))\n", - " \n", - " # Check if support is greater than or equal to min_k\n", - " if support >= self.min_k:\n", - " # Compute similarity using Jaccard similarity\n", - " intersection = np.sum(~np.isnan(self.ratings_matrix[i]) & ~np.isnan(self.ratings_matrix[j]))\n", - " union = np.sum(~np.isnan(self.ratings_matrix[i]) | ~np.isnan(self.ratings_matrix[j]))\n", - " similarity = intersection / union\n", - " similarity_matrix[i, j] = 
similarity\n", - " similarity_matrix[j, i] = similarity # Similarity matrix is symmetric\n", - " \n", - " # Set the computed similarity matrix to self.sim\n", - " self.sim = similarity_matrix\n", - " \n", - " def compute_mean_ratings(self):\n", - " # Compute the mean rating of every user\n", - " mean_ratings = []\n", - " for user_id, ratings in self.trainset.ur.items():\n", - " if ratings: # Check if user has ratings\n", - " mean_rating = np.mean([rating[1] for rating in ratings])\n", - " mean_ratings.append(mean_rating)\n", - " else:\n", - " mean_ratings.append(0) # If no ratings available, set mean to 0\n", - " \n", - " # Set the computed mean ratings\n", - " self.mean_ratings = mean_ratings\n", - "\n", - " \n", - "user_based_instance = UserBased(trainset=trainset)\n", - "\n", - "# Appel de la méthode fit pour calculer les matrices des évaluations, de similarité et les moyennes des évaluations\n", - "user_based_instance.fit(trainset)\n", - "\n", - "# Affichage de la matrice des évaluations\n", - "print(user_based_instance.ratings_matrix)\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "id": "dfdc9cfe", - "metadata": {}, - "source": [ - "# 4. Compare KNNWithMeans with UserBased\n", - "Try to replicate KNNWithMeans with your self-made UserBased and check that outcomes are identical" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "be53ae27", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "UserBased MAE: 1.5398252671298895\n", - "UserBased RMSE: 1.5553141029705104\n", - "KNNWithMeans MAE: 0.5419110316300769\n", - "KNNWithMeans RMSE: 0.7019543155680094\n" - ] - } - ], - "source": [ - "# 1. Obtain Predictions\n", - "# Using UserBased algorithm\n", - "user_based_predictions = []\n", - "for uid, iid, true_r in testset:\n", - " user_based_pred = user_based_instance.predict(uid, iid)\n", - " user_based_predictions.append((uid, iid, true_r, user_based_pred.est, {}))\n", - "\n", - "# Using KNNWithMeans algorithm\n", - "knn_predictions = []\n", - "for uid, iid, true_r in testset:\n", - " knn_pred = knn_model.predict(uid, iid)\n", - " knn_predictions.append((uid, iid, true_r, knn_pred.est, knn_pred.details))\n", - "\n", - "# 2. Calculate Metrics\n", - "# Calculate MAE and RMSE for UserBased algorithm\n", - "user_based_mae = accuracy.mae(user_based_predictions, verbose=False)\n", - "user_based_rmse = accuracy.rmse(user_based_predictions, verbose=False)\n", - "\n", - "# Calculate MAE and RMSE for KNNWithMeans algorithm\n", - "knn_mae = accuracy.mae(knn_predictions, verbose=False)\n", - "knn_rmse = accuracy.rmse(knn_predictions, verbose=False)\n", - "\n", - "# 3. Compare Results\n", - "print(\"UserBased MAE:\", user_based_mae)\n", - "print(\"UserBased RMSE:\", user_based_rmse)\n", - "print(\"KNNWithMeans MAE:\", knn_mae)\n", - "print(\"KNNWithMeans RMSE:\", knn_rmse)\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "id": "cced76d9", - "metadata": {}, - "source": [ - "# 5. 
Compare MSD and Jacard\n", - "Compare predictions made with MSD similarity and Jacard similarity\n" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "c20d8e19", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Computing the msd similarity matrix...\n", - "Done computing similarity matrix.\n", - "Computing the cosine similarity matrix...\n", - "Done computing similarity matrix.\n", - "RMSE: 0.9683\n", - "RMSE: 0.9824\n", - "RMSE with MSD similarity: 0.9682664011125741\n", - "RMSE with Jaccard similarity: 0.9824127884570012\n" - ] - } - ], - "source": [ - "from surprise import accuracy\n", - "from surprise.model_selection import train_test_split\n", - "from surprise import Dataset, Reader\n", - "from surprise import KNNBasic\n", - "\n", - "\n", - "# Split the dataset into training and testing sets\n", - "trainset, testset = train_test_split(surprise_data, test_size=0.2)\n", - "\n", - "# Initialize the model with MSD similarity\n", - "sim_options_msd = {'name': 'msd'}\n", - "user_based_msd = KNNBasic(sim_options=sim_options_msd)\n", - "user_based_msd.fit(trainset)\n", - "\n", - "# Initialize the model with Jacard similarity\n", - "sim_options_jaccard = {'name': 'cosine'}\n", - "user_based_jaccard = KNNBasic(sim_options=sim_options_jaccard)\n", - "user_based_jaccard.fit(trainset)\n", - "\n", - "# Make predictions with each model on the test set\n", - "predictions_msd = user_based_msd.test(testset)\n", - "predictions_jaccard = user_based_jaccard.test(testset)\n", - "\n", - "# Calculate and display the performances of the two models\n", - "rmse_msd = accuracy.rmse(predictions_msd)\n", - "rmse_jaccard = accuracy.rmse(predictions_jaccard)\n", - "\n", - "print(\"RMSE with MSD similarity:\", rmse_msd)\n", - "print(\"RMSE with Jaccard similarity:\", rmse_jaccard)\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "mon_environnement", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.2" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} -- GitLab
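
A closing note on the MSD/Jaccard comparison in the deleted user_based.ipynb: surprise only ships msd, cosine, pearson and pearson_baseline similarities, and the cell labelled "Jacard" actually passes {'name': 'cosine'} to KNNBasic, so the two RMSE figures above compare MSD against cosine rather than against Jaccard. A genuine Jaccard user similarity, matching the one hand-rolled in UserBased.compute_similarity_matrix, can be sketched as follows (the toy rating matrix is invented; NaN marks an unrated item):

    import numpy as np

    # Jaccard similarity over the sets of items each user has rated.
    R = np.array([[3.0, 1.5, np.nan],
                  [4.0, np.nan, 2.0],
                  [np.nan, 1.0, 5.0]])

    def jaccard(u, v):
        rated_u, rated_v = ~np.isnan(u), ~np.isnan(v)
        union = np.sum(rated_u | rated_v)
        return np.sum(rated_u & rated_v) / union if union else 0.0

    sim = np.array([[jaccard(R[i], R[j]) for j in range(R.shape[0])]
                    for i in range(R.shape[0])])
    print(sim)  # symmetric, with 1.0 on the diagonal
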