diff --git a/evaluator.ipynb b/evaluator.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..b88bfe44a4e7d2898edb216abecfd1b673f059b6
--- /dev/null
+++ b/evaluator.ipynb
@@ -0,0 +1,460 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "a665885b",
+ "metadata": {},
+ "source": [
+ "# Evaluator Module\n",
+ "The Evaluator module creates evaluation reports.\n",
+ "\n",
+ "Reports contain evaluation metrics depending on models specified in the evaluation config."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 109,
+ "id": "6aaf9140",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "The autoreload extension is already loaded. To reload it, use:\n",
+ " %reload_ext autoreload\n"
+ ]
+ }
+ ],
+ "source": [
+ "# reloads modules automatically before entering the execution of code\n",
+ "%load_ext autoreload\n",
+ "%autoreload 2\n",
+ "\n",
+ "# imports\n",
+ "import numpy as np \n",
+ "import pandas as pd\n",
+ "\n",
+ "# local imports\n",
+ "from configs import EvalConfig\n",
+ "from constants import Constant as C\n",
+ "from loaders import export_evaluation_report\n",
+ "from loaders import load_ratings\n",
+ "\n",
+ "# New imports\n",
+ "from surprise.model_selection import train_test_split\n",
+ "from surprise import accuracy\n",
+ "from surprise.model_selection import LeaveOneOut\n",
+ "from collections import Counter"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d47c24a4",
+ "metadata": {},
+ "source": [
+ "# 1. Model validation functions\n",
+ "Validation functions are a way to perform crossvalidation on recommender system models. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 110,
+ "id": "d6d82188",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# -- implement the function generate_split_predictions --\n",
+ "def generate_split_predictions(algo, ratings_dataset, eval_config):\n",
+ " \"\"\"Generate predictions on a random test set specified in eval_config\"\"\"\n",
+ " \n",
+ " # Spliting the data into train and test sets\n",
+ " trainset, testset = train_test_split(ratings_dataset, test_size=eval_config.test_size)\n",
+ "\n",
+ " # Training the algorithm on the train data set\n",
+ " algo.fit(trainset)\n",
+ "\n",
+ " # Predict ratings for the testset\n",
+ " predictions = algo.test(testset)\n",
+ " \n",
+ " return predictions\n",
+ "\n",
+ "# -- implement the function generate_loo_top_n --\n",
+ "def generate_loo_top_n(algo, ratings_dataset, eval_config):\n",
+ " \"\"\"Generate top-n recommendations for each user on a random Leave-one-out split (LOO)\"\"\"\n",
+ " \n",
+ " # Create a LeaveOneOut split\n",
+ " loo = LeaveOneOut(n_splits=1)\n",
+ " \n",
+ " for trainset, testset in loo.split(ratings_dataset):\n",
+ " algo.fit(trainset) # Train the algorithm on the training set\n",
+ " anti_testset = trainset.build_anti_testset() # Build the anti test-set\n",
+ " predictions = algo.test(anti_testset) # Get predictions on the anti test-set\n",
+ " top_n = {}\n",
+ " for uid, iid, _, est, _ in predictions:\n",
+ " if uid not in top_n:\n",
+ " top_n[uid] = []\n",
+ " top_n[uid].append((iid, est))\n",
+ " for uid, user_ratings in top_n.items():\n",
+ " user_ratings.sort(key=lambda x: x[1], reverse=True)\n",
+ " top_n[uid] = user_ratings[:eval_config.top_n_value] # Get top-N recommendations\n",
+ " anti_testset_top_n = top_n\n",
+ " return anti_testset_top_n, testset\n",
+ "\n",
+ "def generate_full_top_n(algo, ratings_dataset, eval_config):\n",
+ " \"\"\"Generate top-n recommendations for each user with full training set (LOO)\"\"\"\n",
+ "\n",
+ " full_trainset = ratings_dataset.build_full_trainset() # Build the full training set\n",
+ " algo.fit(full_trainset) # Train the algorithm on the full training set\n",
+ " anti_testset = full_trainset.build_anti_testset() # Build the anti test-set\n",
+ " predictions = algo.test(anti_testset) # Get predictions on the anti test-set\n",
+ " top_n = {}\n",
+ " for uid, iid, _, est, _ in predictions:\n",
+ " if uid not in top_n:\n",
+ " top_n[uid] = []\n",
+ " top_n[uid].append((iid, est))\n",
+ " for uid, user_ratings in top_n.items():\n",
+ " user_ratings.sort(key=lambda x: x[1], reverse=True)\n",
+ " top_n[uid] = user_ratings[:eval_config.top_n_value] # Get top-N recommendations\n",
+ " anti_testset_top_n = top_n\n",
+ " return anti_testset_top_n\n",
+ "\n",
+ "def precomputed_information(movie_data):\n",
+ "\n",
+ " \"\"\" Returns a dictionary that precomputes relevant information for evaluating in full mode\n",
+ " \n",
+ " Dictionary keys:\n",
+ " - precomputed_dict[\"item_to_rank\"] : contains a dictionary mapping movie ids to rankings\n",
+ " - (-- for your project, add other relevant information here -- )\n",
+ " \"\"\"\n",
+ "\n",
+ " # Initialize an empty dictionary to store item_id to rank mapping\n",
+ " item_to_rank = {}\n",
+ " \n",
+ " # Calculate popularity rank for each movie\n",
+ " ratings_count = movie_data.groupby('movieId').size().sort_values(ascending=False)\n",
+ " \n",
+ " # Assign ranks to movies based on their popularity\n",
+ " for rank, (movie_id, _) in enumerate(ratings_count.items(), start=1):\n",
+ " item_to_rank[movie_id] = rank\n",
+ " \n",
+ " # Create the precomputed dictionary\n",
+ " precomputed_dict = {}\n",
+ " precomputed_dict[\"item_to_rank\"] = item_to_rank\n",
+ " \n",
+ " return precomputed_dict\n",
+ "\n",
+ "def create_evaluation_report(eval_config, sp_ratings, precomputed_dict, available_metrics):\n",
+ "\n",
+ " \"\"\" Create a DataFrame evaluating various models on metrics specified in an evaluation config. \n",
+ " \"\"\"\n",
+ " \n",
+ " evaluation_dict = {}\n",
+ " for model_name, model, arguments in eval_config.models:\n",
+ " print(f'Handling model {model_name}')\n",
+ " algo = model(**arguments)\n",
+ " evaluation_dict[model_name] = {}\n",
+ " \n",
+ " # Type 1 : split evaluations\n",
+ " if len(eval_config.split_metrics) > 0:\n",
+ " print('Training split predictions')\n",
+ " predictions = generate_split_predictions(algo, sp_ratings, eval_config)\n",
+ " for metric in eval_config.split_metrics:\n",
+ " print(f'- computing metric {metric}')\n",
+ " assert metric in available_metrics['split']\n",
+ " evaluation_function, parameters = available_metrics[\"split\"][metric]\n",
+ " evaluation_dict[model_name][metric] = evaluation_function(predictions, **parameters) \n",
+ " \n",
+ " # Type 2 : loo evaluations\n",
+ " if len(eval_config.loo_metrics) > 0:\n",
+ " print('Training loo predictions')\n",
+ " anti_testset_top_n, testset = generate_loo_top_n(algo, sp_ratings, eval_config)\n",
+ " for metric in eval_config.loo_metrics:\n",
+ " assert metric in available_metrics['loo']\n",
+ " evaluation_function, parameters = available_metrics[\"loo\"][metric]\n",
+ " evaluation_dict[model_name][metric] = evaluation_function(anti_testset_top_n, testset, **parameters)\n",
+ " \n",
+ " # Type 3 : full evaluations\n",
+ " if len(eval_config.full_metrics) > 0:\n",
+ " print('Training full predictions')\n",
+ " anti_testset_top_n = generate_full_top_n(algo, sp_ratings, eval_config)\n",
+ " for metric in eval_config.full_metrics:\n",
+ " assert metric in available_metrics['full']\n",
+ " evaluation_function, parameters = available_metrics[\"full\"][metric]\n",
+ " evaluation_dict[model_name][metric] = evaluation_function(\n",
+ " anti_testset_top_n,\n",
+ " **precomputed_dict,\n",
+ " **parameters\n",
+ " )\n",
+ " \n",
+ " return pd.DataFrame.from_dict(evaluation_dict).T"
+ ]
+ },
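+ {
+ "cell_type": "markdown",
+ "id": "b51a7c3e",
+ "metadata": {},
+ "source": [
+ "The cell below is a minimal sanity check of `generate_split_predictions`, not part of the evaluation workflow. It assumes `load_ratings(surprise_format=True)` returns a Surprise `Dataset` and that `EvalConfig.test_size` is defined (both reused from section 3), and runs the function with Surprise's built-in `NormalPredictor` baseline."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b51a7c3f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Minimal sketch: exercise generate_split_predictions with Surprise's random baseline.\n",
+ "# Assumes load_ratings and EvalConfig behave as in the workflow of section 3.\n",
+ "from surprise import NormalPredictor\n",
+ "\n",
+ "_ratings = load_ratings(surprise_format=True)  # Surprise Dataset\n",
+ "_preds = generate_split_predictions(NormalPredictor(), _ratings, EvalConfig)\n",
+ "accuracy.rmse(_preds, verbose=True)  # RMSE of the random baseline on the held-out split"
+ ]
+ },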
+ {
+ "cell_type": "markdown",
+ "id": "f7e83d1d",
+ "metadata": {},
+ "source": [
+ "# 2. Evaluation metrics\n",
+ "Implement evaluation metrics for either rating predictions (split metrics) or for top-n recommendations (loo metric, full metric)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 111,
+ "id": "f1849e55",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# -- implement the function get_hit_rate --\n",
+ "def get_hit_rate(anti_testset_top_n, testset):\n",
+ " \n",
+ " \"\"\"Compute the average hit over the users (loo metric)\n",
+ " \n",
+ " A hit (1) happens when the movie in the testset has been picked by the top-n recommender\n",
+ " A fail (0) happens when the movie in the testset has not been picked by the top-n recommender\n",
+ " \"\"\"\n",
+ "\n",
+ " hits = 0\n",
+ " total_users = len(testset)\n",
+ " for uid, true_iid, _ in testset:\n",
+ " if uid in anti_testset_top_n and true_iid in {iid for iid, _ in anti_testset_top_n[uid]}:\n",
+ " hits += 1\n",
+ " hit_rate = hits / total_users\n",
+ "\n",
+ " return hit_rate\n",
+ "\n",
+ "# -- implement the function get_novelty --\n",
+ "def get_novelty(anti_testset_top_n, item_to_rank):\n",
+ "\n",
+ " \"\"\"Compute the average novelty of the top-n recommendation over the users (full metric)\n",
+ " \n",
+ " The novelty is defined as the average ranking of the movies recommended\n",
+ " \"\"\"\n",
+ "\n",
+ " total_rank_sum = 0\n",
+ " total_recommendations = 0\n",
+ " for uid, recommendations in anti_testset_top_n.items():\n",
+ " for iid, _ in recommendations:\n",
+ " if iid in item_to_rank:\n",
+ " total_rank_sum += item_to_rank[iid]\n",
+ " total_recommendations += 1\n",
+ " if total_recommendations == 0:\n",
+ " return 0 # Avoid division by zero\n",
+ " average_rank_sum = total_rank_sum / total_recommendations \n",
+ " \n",
+ " return average_rank_sum"
+ ]
+ },
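+ {
+ "cell_type": "markdown",
+ "id": "c2d4e6f8",
+ "metadata": {},
+ "source": [
+ "To illustrate how the two metrics behave, the toy example below uses hand-built data (not the actual ratings): two users with top-2 recommendation lists, a leave-one-out test set, and a small popularity ranking."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c2d4e6f9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Toy example (illustrative only): two users with top-2 recommendations each\n",
+ "toy_top_n = {'u1': [('m1', 4.8), ('m2', 4.5)], 'u2': [('m3', 4.9), ('m4', 4.2)]}\n",
+ "# Leave-one-out test set: one held-out (user, item, rating) triple per user\n",
+ "toy_testset = [('u1', 'm2', 5.0), ('u2', 'm9', 4.0)]\n",
+ "# Popularity ranks (1 = most rated movie)\n",
+ "toy_item_to_rank = {'m1': 1, 'm2': 2, 'm3': 3, 'm4': 4}\n",
+ "\n",
+ "print(get_hit_rate(toy_top_n, toy_testset))        # 0.5: only u1's held-out movie was recommended\n",
+ "print(get_novelty(toy_top_n, toy_item_to_rank))    # 2.5: mean popularity rank of the recommended movies"
+ ]
+ },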
+ {
+ "cell_type": "markdown",
+ "id": "1a9855b3",
+ "metadata": {},
+ "source": [
+ "# 3. Evaluation workflow\n",
+ "Load data, evaluate models and save the experimental outcomes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 112,
+ "id": "704f4d2a",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Handling model baseline_1\n",
+ "Training split predictions\n",
+ "- computing metric mae\n",
+ "- computing metric rmse\n",
+ "Training loo predictions\n",
+ "Training full predictions\n",
+ "Handling model baseline_2\n",
+ "Training split predictions\n",
+ "- computing metric mae\n",
+ "- computing metric rmse\n",
+ "Training loo predictions\n",
+ "Training full predictions\n",
+ "Handling model baseline_3\n",
+ "Training split predictions\n",
+ "- computing metric mae\n",
+ "- computing metric rmse\n",
+ "Training loo predictions\n",
+ "Training full predictions\n",
+ "Handling model baseline_4\n",
+ "Training split predictions\n",
+ "- computing metric mae\n",
+ "- computing metric rmse\n",
+ "Training loo predictions\n",
+ "Training full predictions\n",
+ "Handling model ContentBased_sample\n",
+ "Training split predictions\n",
+ "- computing metric mae\n",
+ "- computing metric rmse\n",
+ "Training loo predictions\n",
+ "Training full predictions\n",
+ "Handling model ContentBased_score\n",
+ "Training split predictions\n",
+ "- computing metric mae\n",
+ "- computing metric rmse\n",
+ "Training loo predictions\n",
+ "Training full predictions\n",
+ "Handling model ContentBased_Lr\n",
+ "Training split predictions\n",
+ "- computing metric mae\n",
+ "- computing metric rmse\n",
+ "Training loo predictions\n",
+ "Training full predictions\n",
+ "The data has been exported to the evaluation report\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>mae</th>\n",
+ " <th>rmse</th>\n",
+ " <th>hit_rate</th>\n",
+ " <th>novelty</th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>baseline_1</th>\n",
+ " <td>1.561178</td>\n",
+ " <td>1.792482</td>\n",
+ " <td>0.074766</td>\n",
+ " <td>99.405607</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>baseline_2</th>\n",
+ " <td>1.471412</td>\n",
+ " <td>1.819364</td>\n",
+ " <td>0.000000</td>\n",
+ " <td>429.942991</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>baseline_3</th>\n",
+ " <td>0.878270</td>\n",
+ " <td>1.085591</td>\n",
+ " <td>0.074766</td>\n",
+ " <td>99.405607</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>baseline_4</th>\n",
+ " <td>0.705673</td>\n",
+ " <td>0.912313</td>\n",
+ " <td>0.130841</td>\n",
+ " <td>60.202804</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>ContentBased_sample</th>\n",
+ " <td>1.013747</td>\n",
+ " <td>1.350417</td>\n",
+ " <td>0.084112</td>\n",
+ " <td>178.048598</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>ContentBased_score</th>\n",
+ " <td>1.461846</td>\n",
+ " <td>1.803067</td>\n",
+ " <td>0.018692</td>\n",
+ " <td>437.222430</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>ContentBased_Lr</th>\n",
+ " <td>1.202626</td>\n",
+ " <td>1.460273</td>\n",
+ " <td>0.084112</td>\n",
+ " <td>278.046729</td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " mae rmse hit_rate novelty\n",
+ "baseline_1 1.561178 1.792482 0.074766 99.405607\n",
+ "baseline_2 1.471412 1.819364 0.000000 429.942991\n",
+ "baseline_3 0.878270 1.085591 0.074766 99.405607\n",
+ "baseline_4 0.705673 0.912313 0.130841 60.202804\n",
+ "ContentBased_sample 1.013747 1.350417 0.084112 178.048598\n",
+ "ContentBased_score 1.461846 1.803067 0.018692 437.222430\n",
+ "ContentBased_Lr 1.202626 1.460273 0.084112 278.046729"
+ ]
+ },
+ "execution_count": 112,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "AVAILABLE_METRICS = {\n",
+ " \"split\": {\n",
+ " \"mae\": (accuracy.mae, {'verbose': False}),\n",
+ " \"rmse\": (accuracy.rmse, {'verbose': False})\n",
+ " },\n",
+ " \"loo\": {\n",
+ " \"hit_rate\": (get_hit_rate, {}),\n",
+ " },\n",
+ " \"full\": {\n",
+ " \"novelty\": (get_novelty, {}),\n",
+ " }\n",
+ "}\n",
+ "\n",
+ "sp_ratings = load_ratings(surprise_format=True)\n",
+ "precomputed_dict = precomputed_information(pd.read_csv(\"data/tiny/evidence/ratings.csv\"))\n",
+ "evaluation_report = create_evaluation_report(EvalConfig, sp_ratings, precomputed_dict, AVAILABLE_METRICS)\n",
+ "export_evaluation_report(evaluation_report)"
+ ]
+ },
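+ {
+ "cell_type": "markdown",
+ "id": "e7a9b1c3",
+ "metadata": {},
+ "source": [
+ "To compare the models at a glance, the sketch below simply sorts the report produced above (lower mae/rmse is better, higher hit_rate is better). It assumes `evaluation_report` is the DataFrame returned by `create_evaluation_report`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e7a9b1c4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Rank models by prediction error (lower is better), then by hit rate (higher is better)\n",
+ "print(evaluation_report.sort_values(by='rmse'))\n",
+ "print(evaluation_report.sort_values(by='hit_rate', ascending=False))"
+ ]
+ },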
+ {
+ "cell_type": "markdown",
+ "id": "6f8b6d19",
+ "metadata": {},
+ "source": [
+ "dire quel modèle est meilleur ?\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.2"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}