diff --git a/evaluator.ipynb b/evaluator.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..b88bfe44a4e7d2898edb216abecfd1b673f059b6
--- /dev/null
+++ b/evaluator.ipynb
@@ -0,0 +1,460 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "a665885b",
+   "metadata": {},
+   "source": [
+    "# Evaluator Module\n",
+    "The Evaluator module creates evaluation reports.\n",
+    "\n",
+    "Reports contain the evaluation metrics requested for the models specified in the evaluation config."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 109,
+   "id": "6aaf9140",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The autoreload extension is already loaded. To reload it, use:\n",
+      "  %reload_ext autoreload\n"
+     ]
+    }
+   ],
+   "source": [
+    "# reload modules automatically before executing code\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2\n",
+    "\n",
+    "# imports\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "\n",
+    "# local imports\n",
+    "from configs import EvalConfig\n",
+    "from constants import Constant as C\n",
+    "from loaders import export_evaluation_report\n",
+    "from loaders import load_ratings\n",
+    "\n",
+    "# New imports\n",
+    "from surprise.model_selection import train_test_split\n",
+    "from surprise import accuracy\n",
+    "from surprise.model_selection import LeaveOneOut\n",
+    "from collections import Counter"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d47c24a4",
+   "metadata": {},
+   "source": [
+    "# 1. Model validation functions\n",
+    "Validation functions perform cross-validation of recommender system models."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 110,
+   "id": "d6d82188",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# -- implement the function generate_split_predictions --\n",
+    "def generate_split_predictions(algo, ratings_dataset, eval_config):\n",
+    "    \"\"\"Generate predictions on a random test set whose size is specified in eval_config\"\"\"\n",
+    "\n",
+    "    # Split the data into train and test sets\n",
+    "    trainset, testset = train_test_split(ratings_dataset, test_size=eval_config.test_size)\n",
+    "\n",
+    "    # Train the algorithm on the training set\n",
+    "    algo.fit(trainset)\n",
+    "\n",
+    "    # Predict ratings for the test set\n",
+    "    predictions = algo.test(testset)\n",
+    "\n",
+    "    return predictions\n",
+    "\n",
+    "# -- implement the function generate_loo_top_n --\n",
+    "def generate_loo_top_n(algo, ratings_dataset, eval_config):\n",
+    "    \"\"\"Generate top-n recommendations for each user on a random leave-one-out (LOO) split\"\"\"\n",
+    "\n",
+    "    # Create a LeaveOneOut split\n",
+    "    loo = LeaveOneOut(n_splits=1)\n",
+    "\n",
+    "    for trainset, testset in loo.split(ratings_dataset):\n",
+    "        algo.fit(trainset)  # Train the algorithm on the training set\n",
+    "        anti_testset = trainset.build_anti_testset()  # Build the anti-testset\n",
+    "        predictions = algo.test(anti_testset)  # Get predictions on the anti-testset\n",
+    "        top_n = {}\n",
+    "        for uid, iid, _, est, _ in predictions:\n",
+    "            if uid not in top_n:\n",
+    "                top_n[uid] = []\n",
+    "            top_n[uid].append((iid, est))\n",
+    "        for uid, user_ratings in top_n.items():\n",
+    "            user_ratings.sort(key=lambda x: x[1], reverse=True)\n",
+    "            top_n[uid] = user_ratings[:eval_config.top_n_value]  # Keep the top-N recommendations\n",
+    "        anti_testset_top_n = top_n\n",
+    "        return anti_testset_top_n, testset\n",
+    "\n",
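+    "# Optional refactoring sketch (not used by the functions in this cell): generate_loo_top_n\n",
+    "# above and generate_full_top_n below build the per-user top-n lists with the same loop,\n",
+    "# which could be factored out into a helper like this one.\n",
+    "def top_n_from_predictions(predictions, n):\n",
+    "    \"\"\"Group Surprise predictions by user and keep the n items with the highest estimates\"\"\"\n",
+    "    top_n = {}\n",
+    "    for uid, iid, _, est, _ in predictions:\n",
+    "        top_n.setdefault(uid, []).append((iid, est))\n",
+    "    for uid, user_ratings in top_n.items():\n",
+    "        user_ratings.sort(key=lambda x: x[1], reverse=True)\n",
+    "        top_n[uid] = user_ratings[:n]\n",
+    "    return top_n\n",
+    "\n",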
+    "def generate_full_top_n(algo, ratings_dataset, eval_config):\n",
+    "    \"\"\"Generate top-n recommendations for each user using the full training set\"\"\"\n",
+    "\n",
+    "    full_trainset = ratings_dataset.build_full_trainset()  # Build the full training set\n",
+    "    algo.fit(full_trainset)  # Train the algorithm on the full training set\n",
+    "    anti_testset = full_trainset.build_anti_testset()  # Build the anti-testset\n",
+    "    predictions = algo.test(anti_testset)  # Get predictions on the anti-testset\n",
+    "    top_n = {}\n",
+    "    for uid, iid, _, est, _ in predictions:\n",
+    "        if uid not in top_n:\n",
+    "            top_n[uid] = []\n",
+    "        top_n[uid].append((iid, est))\n",
+    "    for uid, user_ratings in top_n.items():\n",
+    "        user_ratings.sort(key=lambda x: x[1], reverse=True)\n",
+    "        top_n[uid] = user_ratings[:eval_config.top_n_value]  # Keep the top-N recommendations\n",
+    "    anti_testset_top_n = top_n\n",
+    "    return anti_testset_top_n\n",
+    "\n",
+    "def precomputed_information(movie_data):\n",
+    "    \"\"\"Return a dictionary of precomputed information needed for evaluating in full mode\n",
+    "\n",
+    "    Dictionary keys:\n",
+    "    - precomputed_dict[\"item_to_rank\"]: maps each movie id to its popularity rank (1 = most rated)\n",
+    "    - (-- for your project, add other relevant information here --)\n",
+    "    \"\"\"\n",
+    "\n",
+    "    # Initialize an empty dictionary to store the item_id to rank mapping\n",
+    "    item_to_rank = {}\n",
+    "\n",
+    "    # Count the ratings of each movie, most rated first\n",
+    "    ratings_count = movie_data.groupby('movieId').size().sort_values(ascending=False)\n",
+    "\n",
+    "    # Assign ranks to movies based on their popularity\n",
+    "    for rank, (movie_id, _) in enumerate(ratings_count.items(), start=1):\n",
+    "        item_to_rank[movie_id] = rank\n",
+    "\n",
+    "    # Create the precomputed dictionary\n",
+    "    precomputed_dict = {}\n",
+    "    precomputed_dict[\"item_to_rank\"] = item_to_rank\n",
+    "\n",
+    "    return precomputed_dict\n",
+    "\n",
+    "def create_evaluation_report(eval_config, sp_ratings, precomputed_dict, available_metrics):\n",
+    "    \"\"\"Create a DataFrame evaluating various models on the metrics specified in an evaluation config.\n",
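+    "\n",
+    "    available_metrics is expected to map each evaluation type ('split', 'loo', 'full') to a\n",
+    "    dictionary of {metric_name: (metric_function, keyword_arguments)}, as in the\n",
+    "    AVAILABLE_METRICS dictionary defined further down in this notebook.\n",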
+    "    \"\"\"\n",
+    "\n",
+    "    evaluation_dict = {}\n",
+    "    for model_name, model, arguments in eval_config.models:\n",
+    "        print(f'Handling model {model_name}')\n",
+    "        algo = model(**arguments)\n",
+    "        evaluation_dict[model_name] = {}\n",
+    "\n",
+    "        # Type 1: split evaluations\n",
+    "        if len(eval_config.split_metrics) > 0:\n",
+    "            print('Training split predictions')\n",
+    "            predictions = generate_split_predictions(algo, sp_ratings, eval_config)\n",
+    "            for metric in eval_config.split_metrics:\n",
+    "                print(f'- computing metric {metric}')\n",
+    "                assert metric in available_metrics['split']\n",
+    "                evaluation_function, parameters = available_metrics[\"split\"][metric]\n",
+    "                evaluation_dict[model_name][metric] = evaluation_function(predictions, **parameters)\n",
+    "\n",
+    "        # Type 2: loo evaluations\n",
+    "        if len(eval_config.loo_metrics) > 0:\n",
+    "            print('Training loo predictions')\n",
+    "            anti_testset_top_n, testset = generate_loo_top_n(algo, sp_ratings, eval_config)\n",
+    "            for metric in eval_config.loo_metrics:\n",
+    "                assert metric in available_metrics['loo']\n",
+    "                evaluation_function, parameters = available_metrics[\"loo\"][metric]\n",
+    "                evaluation_dict[model_name][metric] = evaluation_function(anti_testset_top_n, testset, **parameters)\n",
+    "\n",
+    "        # Type 3: full evaluations\n",
+    "        if len(eval_config.full_metrics) > 0:\n",
+    "            print('Training full predictions')\n",
+    "            anti_testset_top_n = generate_full_top_n(algo, sp_ratings, eval_config)\n",
+    "            for metric in eval_config.full_metrics:\n",
+    "                assert metric in available_metrics['full']\n",
+    "                evaluation_function, parameters = available_metrics[\"full\"][metric]\n",
+    "                evaluation_dict[model_name][metric] = evaluation_function(\n",
+    "                    anti_testset_top_n,\n",
+    "                    **precomputed_dict,\n",
+    "                    **parameters\n",
+    "                )\n",
+    "\n",
+    "    return pd.DataFrame.from_dict(evaluation_dict).T"
+   ]
+  },
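+  {
+   "cell_type": "markdown",
+   "id": "3f2a9c01",
+   "metadata": {},
+   "source": [
+    "The functions above only assume that `EvalConfig` (imported from `configs`) exposes a handful of attributes. The attribute names below are the ones actually used in this notebook; the model class and the values are purely illustrative. A minimal sketch of such a config:\n",
+    "\n",
+    "```python\n",
+    "from surprise import NormalPredictor  # any Surprise-style algorithm class works here\n",
+    "\n",
+    "class EvalConfig:\n",
+    "    models = [(\"baseline_1\", NormalPredictor, {})]  # list of (name, model class, constructor kwargs)\n",
+    "    split_metrics = [\"mae\", \"rmse\"]  # metrics computed on a random train/test split\n",
+    "    loo_metrics = [\"hit_rate\"]       # metrics computed on a leave-one-out split\n",
+    "    full_metrics = [\"novelty\"]       # metrics computed with the full trainset\n",
+    "    test_size = 0.25                 # illustrative: fraction of ratings held out for split metrics\n",
+    "    top_n_value = 10                 # illustrative: length of the top-n recommendation lists\n",
+    "```"
+   ]
+  },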
+  {
+   "cell_type": "markdown",
+   "id": "f7e83d1d",
+   "metadata": {},
+   "source": [
+    "# 2. Evaluation metrics\n",
+    "Implement evaluation metrics either for rating predictions (split metrics) or for top-n recommendations (LOO and full metrics)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 111,
+   "id": "f1849e55",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# -- implement the function get_hit_rate --\n",
+    "def get_hit_rate(anti_testset_top_n, testset):\n",
+    "    \"\"\"Compute the average hit rate over users (LOO metric)\n",
+    "\n",
+    "    A hit (1) occurs when a user's held-out movie appears in that user's top-n recommendations;\n",
+    "    a miss (0) occurs when it does not.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    hits = 0\n",
+    "    total_users = len(testset)\n",
+    "    for uid, true_iid, _ in testset:\n",
+    "        if uid in anti_testset_top_n and true_iid in {iid for iid, _ in anti_testset_top_n[uid]}:\n",
+    "            hits += 1\n",
+    "    hit_rate = hits / total_users\n",
+    "\n",
+    "    return hit_rate\n",
+    "\n",
+    "# -- implement the function get_novelty --\n",
+    "def get_novelty(anti_testset_top_n, item_to_rank):\n",
+    "    \"\"\"Compute the average novelty of the top-n recommendations over users (full metric)\n",
+    "\n",
+    "    Novelty is measured as the average popularity rank of the recommended movies:\n",
+    "    higher values mean less popular, hence more novel, recommendations.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    total_rank_sum = 0\n",
+    "    total_recommendations = 0\n",
+    "    for uid, recommendations in anti_testset_top_n.items():\n",
+    "        for iid, _ in recommendations:\n",
+    "            if iid in item_to_rank:\n",
+    "                total_rank_sum += item_to_rank[iid]\n",
+    "                total_recommendations += 1\n",
+    "    if total_recommendations == 0:\n",
+    "        return 0  # Avoid division by zero\n",
+    "    average_rank = total_rank_sum / total_recommendations\n",
+    "\n",
+    "    return average_rank"
+   ]
+  },
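+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9d4e7a21",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Quick sanity check of the two metrics on tiny hand-built data (illustrative only,\n",
+    "# not part of the evaluation workflow): u1's held-out movie is in its top-n list, u2's is not.\n",
+    "toy_top_n = {'u1': [('i1', 4.5), ('i2', 4.0)], 'u2': [('i3', 3.5)]}\n",
+    "toy_testset = [('u1', 'i2', 5.0), ('u2', 'i9', 4.0)]\n",
+    "print(get_hit_rate(toy_top_n, toy_testset))  # expected: 0.5\n",
+    "print(get_novelty(toy_top_n, {'i1': 1, 'i2': 2, 'i3': 3}))  # expected: 2.0 (average popularity rank)"
+   ]
+  },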
+  {
+   "cell_type": "markdown",
+   "id": "1a9855b3",
+   "metadata": {},
+   "source": [
+    "# 3. Evaluation workflow\n",
+    "Load the data, evaluate the models and save the experimental outcomes."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 112,
+   "id": "704f4d2a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Handling model baseline_1\n",
+      "Training split predictions\n",
+      "- computing metric mae\n",
+      "- computing metric rmse\n",
+      "Training loo predictions\n",
+      "Training full predictions\n",
+      "Handling model baseline_2\n",
+      "Training split predictions\n",
+      "- computing metric mae\n",
+      "- computing metric rmse\n",
+      "Training loo predictions\n",
+      "Training full predictions\n",
+      "Handling model baseline_3\n",
+      "Training split predictions\n",
+      "- computing metric mae\n",
+      "- computing metric rmse\n",
+      "Training loo predictions\n",
+      "Training full predictions\n",
+      "Handling model baseline_4\n",
+      "Training split predictions\n",
+      "- computing metric mae\n",
+      "- computing metric rmse\n",
+      "Training loo predictions\n",
+      "Training full predictions\n",
+      "Handling model ContentBased_sample\n",
+      "Training split predictions\n",
+      "- computing metric mae\n",
+      "- computing metric rmse\n",
+      "Training loo predictions\n",
+      "Training full predictions\n",
+      "Handling model ContentBased_score\n",
+      "Training split predictions\n",
+      "- computing metric mae\n",
+      "- computing metric rmse\n",
+      "Training loo predictions\n",
+      "Training full predictions\n",
+      "Handling model ContentBased_Lr\n",
+      "Training split predictions\n",
+      "- computing metric mae\n",
+      "- computing metric rmse\n",
+      "Training loo predictions\n",
+      "Training full predictions\n",
+      "The data has been exported to the evaluation report\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>mae</th>\n",
+       "      <th>rmse</th>\n",
+       "      <th>hit_rate</th>\n",
+       "      <th>novelty</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>baseline_1</th>\n",
+       "      <td>1.561178</td>\n",
+       "      <td>1.792482</td>\n",
+       "      <td>0.074766</td>\n",
+       "      <td>99.405607</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>baseline_2</th>\n",
+       "      <td>1.471412</td>\n",
+       "      <td>1.819364</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>429.942991</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>baseline_3</th>\n",
+       "      <td>0.878270</td>\n",
+       "      <td>1.085591</td>\n",
+       "      <td>0.074766</td>\n",
+       "      <td>99.405607</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>baseline_4</th>\n",
+       "      <td>0.705673</td>\n",
+       "      <td>0.912313</td>\n",
+       "      <td>0.130841</td>\n",
+       "      <td>60.202804</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>ContentBased_sample</th>\n",
+       "      <td>1.013747</td>\n",
+       "      <td>1.350417</td>\n",
+       "      <td>0.084112</td>\n",
+       "      <td>178.048598</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>ContentBased_score</th>\n",
+       "      <td>1.461846</td>\n",
+       "      <td>1.803067</td>\n",
+       "      <td>0.018692</td>\n",
+       "      <td>437.222430</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>ContentBased_Lr</th>\n",
+       "      <td>1.202626</td>\n",
+       "      <td>1.460273</td>\n",
+       "      <td>0.084112</td>\n",
+       "      <td>278.046729</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                          mae      rmse  hit_rate     novelty\n",
+       "baseline_1           1.561178  1.792482  0.074766   99.405607\n",
+       "baseline_2           1.471412  1.819364  0.000000  429.942991\n",
+       "baseline_3           0.878270  1.085591  0.074766   99.405607\n",
+       "baseline_4           0.705673  0.912313  0.130841   60.202804\n",
+       "ContentBased_sample  1.013747  1.350417  0.084112  178.048598\n",
+       "ContentBased_score   1.461846  1.803067  0.018692  437.222430\n",
+       "ContentBased_Lr      1.202626  1.460273  0.084112  278.046729"
+      ]
+     },
+     "execution_count": 112,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "AVAILABLE_METRICS = {\n",
+    "    \"split\": {\n",
+    "        \"mae\": (accuracy.mae, {'verbose': False}),\n",
+    "        \"rmse\": (accuracy.rmse, {'verbose': False})\n",
+    "    },\n",
+    "    \"loo\": {\n",
+    "        \"hit_rate\": (get_hit_rate, {}),\n",
+    "    },\n",
+    "    \"full\": {\n",
+    "        \"novelty\": (get_novelty, {}),\n",
+    "    }\n",
+    "}\n",
+    "\n",
+    "sp_ratings = load_ratings(surprise_format=True)\n",
+    "precomputed_dict = precomputed_information(pd.read_csv(\"data/tiny/evidence/ratings.csv\"))\n",
+    "evaluation_report = create_evaluation_report(EvalConfig, sp_ratings, precomputed_dict, AVAILABLE_METRICS)\n",
+    "export_evaluation_report(evaluation_report)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6f8b6d19",
+   "metadata": {},
+   "source": [
+    "Which model is the best?\n",
+    "\n",
+    "Based on the report above, baseline_4 obtains the lowest MAE and RMSE and the highest hit rate, although its recommendations are also the least novel (lowest average popularity rank)."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}