Commit fb8db879 authored by Adrien Payen

update for hackathon

parent 40620d98
@@ -13,9 +13,27 @@ class EvalConfig:
("baseline_2", ModelBaseline2, {}), ("baseline_2", ModelBaseline2, {}),
("baseline_3", ModelBaseline3, {}), ("baseline_3", ModelBaseline3, {}),
("baseline_4", ModelBaseline4, {}), ("baseline_4", ModelBaseline4, {}),
("ContentBased_sample", ContentBased, {"features_method" : "title_length", "regressor_method" : "random_sample"}), ("title_length_ContentBased_sample", ContentBased, {"features_method" : "title_length", "regressor_method" : "random_sample"}),
("ContentBased_score", ContentBased, {"features_method" : "title_length", "regressor_method" : "random_score"}), ("title_length_ContentBased_score", ContentBased, {"features_method" : "title_length", "regressor_method" : "random_score"}),
("ContentBased_Lr", ContentBased, {"features_method" : "title_length", "regressor_method" : "linear_regression"}) ("title_length_ContentBased_Lr", ContentBased, {"features_method" : "title_length", "regressor_method" : "linear_regression"}),
("movie_year_ContentBased_sample", ContentBased, {"features_method" : "movie_year", "regressor_method" : "random_sample"}),
("movie_year_ContentBased_score", ContentBased, {"features_method" : "movie_year", "regressor_method" : "random_score"}),
#("movie_year_ContentBased_Lr", ContentBased, {"features_method" : "movie_year", "regressor_method" : "linear_regression"})
("genres_ContentBased_sample", ContentBased, {"features_method" : "genres", "regressor_method" : "random_sample"}),
("genres_ContentBased_score", ContentBased, {"features_method" : "genres", "regressor_method" : "random_score"}),
#("genres_ContentBased_Lr", ContentBased, {"features_method" : "genres", "regressor_method" : "linear_regression"}),
("rating_ContentBased_sample", ContentBased, {"features_method" : "rating", "regressor_method" : "random_sample"}),
("rating_ContentBased_score", ContentBased, {"features_method" : "rating", "regressor_method" : "random_score"}),
#("rating_ContentBased_Lr", ContentBased, {"features_method" : "rating", "regressor_method" : "linear_regression"}),
("tags_ContentBased_sample", ContentBased, {"features_method" : "tags", "regressor_method" : "random_sample"}),
("tags_ContentBased_score", ContentBased, {"features_method" : "tags", "regressor_method" : "random_score"}),
#("tags_ContentBased_Lr", ContentBased, {"features_method" : "tags", "regressor_method" : "linear_regression"}),
("tags_length_ContentBased_sample", ContentBased, {"features_method" : "tags_length", "regressor_method" : "random_sample"}),
("tags_length_ContentBased_score", ContentBased, {"features_method" : "tags_length", "regressor_method" : "random_score"}),
#("tags_length_ContentBased_Lr", ContentBased, {"features_method" : "tags_length", "regressor_method" : "linear_regression"}),
("timestamp_ContentBased_sample", ContentBased, {"features_method" : "timestamp", "regressor_method" : "random_sample"}),
("timestamp_ContentBased_score", ContentBased, {"features_method" : "timestamp", "regressor_method" : "random_score"}),
#("timestamp_ContentBased_Lr", ContentBased, {"features_method" : "timestamp", "regressor_method" : "linear_regression"})
# model_name, model class, model parameters (dict) # model_name, model class, model parameters (dict)
] ]
@@ -34,3 +52,4 @@ class EvalConfig:
    # Loo parameters
    top_n_value = 10  # -- configure the number of recommendations (> 1) --
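For orientation, each tuple added above follows the `(model_name, model_class, model_parameters)` convention noted in the trailing comment; a minimal sketch of how one such entry is instantiated, mirroring the `algo = model(**arguments)` call in the evaluator notebook below (the import path for `ContentBased` is an assumption about the project layout, not taken from this commit):

``` python
# Minimal sketch (not part of the commit): instantiating one EvalConfig.models entry.
from models import ContentBased  # hypothetical module path

model_name, model_class, arguments = (
    "genres_ContentBased_sample",
    ContentBased,
    {"features_method": "genres", "regressor_method": "random_sample"},
)
algo = model_class(**arguments)  # same call the evaluator makes for every configured model
```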
%% Cell type:markdown id:a665885b tags:
# Evaluator Module
The Evaluator module creates evaluation reports.
Reports contain evaluation metrics depending on models specified in the evaluation config.
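For orientation, the report produced at the end of this notebook is a pandas DataFrame with one row per configured model and one column per metric; a minimal sketch of that shape (the values are taken from the output further down, rounded):

``` python
import pandas as pd

# Sketch of the report layout produced by create_evaluation_report below:
# one row per model in EvalConfig.models, one column per metric.
report_shape = pd.DataFrame.from_dict(
    {"baseline_1": {"mae": 1.31, "rmse": 1.67, "hit_rate": 1.0, "novelty": 4.0}},
    orient="index",
)
print(report_shape)
```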
%% Cell type:code id:6aaf9140 tags:

``` python
# reloads modules automatically before entering the execution of code
%load_ext autoreload
%autoreload 2

# imports
import numpy as np
import pandas as pd

# local imports
from configs import EvalConfig
from constants import Constant as C
from loaders import export_evaluation_report
from loaders import load_ratings

# New imports
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise.model_selection import LeaveOneOut
from collections import Counter
```
%% Output
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
%% Cell type:markdown id:d47c24a4 tags:
# 1. Model validation functions
Validation functions are a way to perform cross-validation on recommender system models.
%% Cell type:code id:d6d82188 tags:

``` python
# -- implement the function generate_split_predictions --
def generate_split_predictions(algo, ratings_dataset, eval_config):
    """Generate predictions on a random test set specified in eval_config"""

    # Splitting the data into train and test sets
    trainset, testset = train_test_split(ratings_dataset, test_size=eval_config.test_size)

    # Training the algorithm on the train data set
    algo.fit(trainset)

    # Predict ratings for the testset
    predictions = algo.test(testset)

    return predictions


# -- implement the function generate_loo_top_n --
def generate_loo_top_n(algo, ratings_dataset, eval_config):
    """Generate top-n recommendations for each user on a random Leave-one-out split (LOO)"""

    # Create a LeaveOneOut split
    loo = LeaveOneOut(n_splits=1)

    for trainset, testset in loo.split(ratings_dataset):
        algo.fit(trainset)  # Train the algorithm on the training set
        anti_testset = trainset.build_anti_testset()  # Build the anti test-set
        predictions = algo.test(anti_testset)  # Get predictions on the anti test-set
        top_n = {}
        for uid, iid, _, est, _ in predictions:
            if uid not in top_n:
                top_n[uid] = []
            top_n[uid].append((iid, est))
        for uid, user_ratings in top_n.items():
            user_ratings.sort(key=lambda x: x[1], reverse=True)
            top_n[uid] = user_ratings[:eval_config.top_n_value]  # Get top-N recommendations
        anti_testset_top_n = top_n
        return anti_testset_top_n, testset


def generate_full_top_n(algo, ratings_dataset, eval_config):
    """Generate top-n recommendations for each user with full training set (LOO)"""

    full_trainset = ratings_dataset.build_full_trainset()  # Build the full training set
    algo.fit(full_trainset)  # Train the algorithm on the full training set
    anti_testset = full_trainset.build_anti_testset()  # Build the anti test-set
    predictions = algo.test(anti_testset)  # Get predictions on the anti test-set
    top_n = {}
    for uid, iid, _, est, _ in predictions:
        if uid not in top_n:
            top_n[uid] = []
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:eval_config.top_n_value]  # Get top-N recommendations
    anti_testset_top_n = top_n
    return anti_testset_top_n


def precomputed_information(movie_data):
    """Returns a dictionary that precomputes relevant information for evaluating in full mode

    Dictionary keys:
    - precomputed_dict["item_to_rank"] : contains a dictionary mapping movie ids to rankings
    - (-- for your project, add other relevant information here -- )
    """

    # Initialize an empty dictionary to store item_id to rank mapping
    item_to_rank = {}

    # Calculate popularity rank for each movie
    ratings_count = movie_data.groupby('movieId').size().sort_values(ascending=False)

    # Assign ranks to movies based on their popularity
    for rank, (movie_id, _) in enumerate(ratings_count.items(), start=1):
        item_to_rank[movie_id] = rank

    # Create the precomputed dictionary
    precomputed_dict = {}
    precomputed_dict["item_to_rank"] = item_to_rank

    return precomputed_dict


def create_evaluation_report(eval_config, sp_ratings, precomputed_dict, available_metrics):
    """Create a DataFrame evaluating various models on metrics specified in an evaluation config."""

    evaluation_dict = {}
    for model_name, model, arguments in eval_config.models:
        print(f'Handling model {model_name}')
        algo = model(**arguments)
        evaluation_dict[model_name] = {}

        # Type 1 : split evaluations
        if len(eval_config.split_metrics) > 0:
            print('Training split predictions')
            predictions = generate_split_predictions(algo, sp_ratings, eval_config)
            for metric in eval_config.split_metrics:
                print(f'- computing metric {metric}')
                assert metric in available_metrics['split']
                evaluation_function, parameters = available_metrics["split"][metric]
                evaluation_dict[model_name][metric] = evaluation_function(predictions, **parameters)

        # Type 2 : loo evaluations
        if len(eval_config.loo_metrics) > 0:
            print('Training loo predictions')
            anti_testset_top_n, testset = generate_loo_top_n(algo, sp_ratings, eval_config)
            for metric in eval_config.loo_metrics:
                assert metric in available_metrics['loo']
                evaluation_function, parameters = available_metrics["loo"][metric]
                evaluation_dict[model_name][metric] = evaluation_function(anti_testset_top_n, testset, **parameters)

        # Type 3 : full evaluations
        if len(eval_config.full_metrics) > 0:
            print('Training full predictions')
            anti_testset_top_n = generate_full_top_n(algo, sp_ratings, eval_config)
            for metric in eval_config.full_metrics:
                assert metric in available_metrics['full']
                evaluation_function, parameters = available_metrics["full"][metric]
                evaluation_dict[model_name][metric] = evaluation_function(
                    anti_testset_top_n,
                    **precomputed_dict,
                    **parameters
                )

    return pd.DataFrame.from_dict(evaluation_dict).T
```
%% Cell type:markdown id:f7e83d1d tags:
# 2. Evaluation metrics
Implement evaluation metrics for either rating predictions (split metrics) or for top-n recommendations (loo metrics, full metrics).
%% Cell type:code id:f1849e55 tags:

``` python
# -- implement the function get_hit_rate --
def get_hit_rate(anti_testset_top_n, testset):
    """Compute the average hit over the users (loo metric)

    A hit (1) happens when the movie in the testset has been picked by the top-n recommender
    A fail (0) happens when the movie in the testset has not been picked by the top-n recommender
    """

    hits = 0
    total_users = len(testset)

    for uid, true_iid, _ in testset:
        if uid in anti_testset_top_n and true_iid in {iid for iid, _ in anti_testset_top_n[uid]}:
            hits += 1

    hit_rate = hits / total_users
    return hit_rate


# -- implement the function get_novelty --
def get_novelty(anti_testset_top_n, item_to_rank):
    """Compute the average novelty of the top-n recommendation over the users (full metric)

    The novelty is defined as the average ranking of the movies recommended
    """

    total_rank_sum = 0
    total_recommendations = 0

    for uid, recommendations in anti_testset_top_n.items():
        for iid, _ in recommendations:
            if iid in item_to_rank:
                total_rank_sum += item_to_rank[iid]
                total_recommendations += 1

    if total_recommendations == 0:
        return 0  # Avoid division by zero

    average_rank_sum = total_rank_sum / total_recommendations
    return average_rank_sum
```
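As a quick sanity check on the two metrics above, here is a toy example; the user ids, movie ids and popularity ranks are made up for illustration only:

``` python
# Toy check of get_hit_rate and get_novelty; ids and ranks are illustrative only.
toy_top_n = {"u1": [("m1", 4.5), ("m2", 4.0)], "u2": [("m3", 3.5)]}
toy_testset = [("u1", "m2", 4.0), ("u2", "m9", 3.0)]  # one left-out rating per user
toy_ranks = {"m1": 1, "m2": 2, "m3": 50}              # popularity ranks (1 = most rated)

print(get_hit_rate(toy_top_n, toy_testset))  # 0.5 -> only u1's left-out movie was recommended
print(get_novelty(toy_top_n, toy_ranks))     # (1 + 2 + 50) / 3 = 17.67
```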
%% Cell type:markdown id:1a9855b3 tags:
# 3. Evaluation workflow
Load data, evaluate models and save the experimental outcomes.
%% Cell type:code id:704f4d2a tags:

``` python
AVAILABLE_METRICS = {
    "split": {
        "mae": (accuracy.mae, {'verbose': False}),
        "rmse": (accuracy.rmse, {'verbose': False})
    },
    "loo": {
        "hit_rate": (get_hit_rate, {}),
    },
    "full": {
        "novelty": (get_novelty, {}),
    }
}

sp_ratings = load_ratings(surprise_format=True)
precomputed_dict = precomputed_information(pd.read_csv("data/tiny/evidence/ratings.csv"))
evaluation_report = create_evaluation_report(EvalConfig, sp_ratings, precomputed_dict, AVAILABLE_METRICS)
export_evaluation_report(evaluation_report)
```
%% Output
Handling model baseline_1
Training split predictions
- computing metric mae
- computing metric rmse
Training loo predictions
Training full predictions
Handling model baseline_2
Training split predictions
- computing metric mae
- computing metric rmse
Training loo predictions
Training full predictions
Handling model baseline_3
Training split predictions
- computing metric mae
- computing metric rmse
Training loo predictions
Training full predictions
Handling model baseline_4
Training split predictions
- computing metric mae
- computing metric rmse
Training loo predictions
Training full predictions
Handling model title_length_ContentBased_sample
Training split predictions
- computing metric mae
- computing metric rmse
Training loo predictions
Training full predictions
Handling model title_length_ContentBased_score
Training split predictions
- computing metric mae
- computing metric rmse
Training loo predictions
Training full predictions
Handling model title_length_ContentBased_Lr
Training split predictions
- computing metric mae
- computing metric rmse
Training loo predictions
Training full predictions
Handling model movie_year_ContentBased_sample
Training split predictions
- computing metric mae
- computing metric rmse
Training loo predictions
Training full predictions
Handling model movie_year_ContentBased_score
Training split predictions
- computing metric mae
- computing metric rmse
Training loo predictions
Training full predictions
Handling model genres_ContentBased_sample
Training split predictions
- computing metric mae
- computing metric rmse
Training loo predictions
Training full predictions
Handling model genres_ContentBased_score
Training split predictions
- computing metric mae
- computing metric rmse
Training loo predictions
Training full predictions
Handling model rating_ContentBased_sample
Training split predictions
- computing metric mae
- computing metric rmse
Training loo predictions
Training full predictions
Handling model rating_ContentBased_score
Training split predictions
- computing metric mae
- computing metric rmse
Training loo predictions
Training full predictions
Handling model tags_ContentBased_sample
Training split predictions
- computing metric mae
- computing metric rmse
Training loo predictions
Training full predictions
Handling model tags_ContentBased_score
Training split predictions
- computing metric mae
- computing metric rmse
Training loo predictions
Training full predictions
Handling model tags_length_ContentBased_sample
Training split predictions
- computing metric mae
- computing metric rmse
Training loo predictions
Training full predictions
Handling model tags_length_ContentBased_score
Training split predictions
- computing metric mae
- computing metric rmse
Training loo predictions
Training full predictions
Handling model timestamp_ContentBased_sample
Training split predictions
- computing metric mae
- computing metric rmse
Training loo predictions
Training full predictions
Handling model timestamp_ContentBased_score
Training split predictions Training split predictions
- computing metric mae - computing metric mae
- computing metric rmse - computing metric rmse
Training loo predictions Training loo predictions
Training full predictions Training full predictions
The data has been exported to the evaluation report
mae rmse hit_rate novelty
baseline_1 1.312500 1.667708 1.0 4.0
baseline_2 1.315250 1.572990 1.0 4.0
baseline_3 1.318182 1.465689 1.0 4.0
baseline_4 1.363953 1.523985 1.0 4.0
title_length_ContentBased_sample 1.375000 1.750000 1.0 4.0
title_length_ContentBased_score 1.556280 2.063469 1.0 4.0
title_length_ContentBased_Lr 1.625729 1.773594 1.0 4.0
movie_year_ContentBased_sample 2.250000 2.610077 1.0 4.0
movie_year_ContentBased_score 1.866274 2.111422 1.0 4.0
genres_ContentBased_sample 1.875000 2.271136 1.0 4.0
genres_ContentBased_score 1.463388 1.793363 1.0 4.0
rating_ContentBased_sample 1.289773 1.715759 1.0 4.0
rating_ContentBased_score 2.482206 2.795490 1.0 4.0
tags_ContentBased_sample 1.937500 2.128673 0.5 4.0
tags_ContentBased_score 1.683499 1.782805 1.0 4.0
tags_length_ContentBased_sample 1.187500 1.704773 1.0 4.0
tags_length_ContentBased_score 1.564917 1.944345 0.5 4.0
timestamp_ContentBased_sample 1.875000 2.277608 1.0 4.0
timestamp_ContentBased_score 1.265317 1.512329 1.0 4.0
%% Cell type:markdown id:6f8b6d19 tags:
Which model performs best?
(Here the different content features of the model were taken into account.)
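One way to answer from the report is to rank the models on the split metrics and read the LOO/full metrics alongside; a minimal sketch, assuming `evaluation_report` is the DataFrame returned by `create_evaluation_report` above:

``` python
# Minimal sketch: rank the evaluated models (lower mae/rmse is better,
# higher hit_rate is better, novelty depends on what the project wants to promote).
ranked = evaluation_report.sort_values(["rmse", "mae"])
print(ranked[["mae", "rmse", "hit_rate", "novelty"]].head())
```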
@@ -97,10 +97,35 @@ class ContentBased(AlgoBase):
    def create_content_features(self, features_method):
        """Content Analyzer"""
        df_items = load_items()
        df_ratings = load_ratings()
        df_tag = pd.read_csv(C.CONTENT_PATH / C.TAGS_FILENAME)
        if features_method is None:
            df_features = None
        elif features_method == "title_length":  # a naive method that creates only 1 feature based on title length
            df_features = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('n_character_title')
        elif features_method == "movie_year":
            # Extract the release year from the title, e.g. "Toy Story (1995)" -> 1995
            df_features = df_items['title'].str.extract(r'\((\d{4})\)', expand=False).astype(float).to_frame('movie_year')
        elif features_method == "genres":
            # One-hot encode each genre as a separate binary column
            genres_list = df_items['genres'].str.split('|').explode().unique()
            df_features = pd.DataFrame(index=df_items.index)
            for genre in genres_list:
                df_features[genre] = df_items['genres'].str.contains(genre, regex=False).astype(int)
        elif features_method == "rating":
            # Average rating per movie
            df_features = df_ratings.groupby('movieId')['rating'].mean().to_frame('avg_rating')
        elif features_method == "tags":
            # Number of tags attached to each entry
            df_features = df_tag['tag'].apply(lambda x: len(str(x).split(','))).to_frame('n_tags')
        elif features_method == "tags_length":
            # Total character length of the tags attached to each entry
            df_features = df_tag['tag'].apply(lambda x: sum(len(tag) for tag in str(x).split(','))).to_frame('tags_length')
        elif features_method == "timestamp":
            # Encode the rating time of day cyclically (86400 seconds per day)
            df_ratings['timestamp_sin'] = np.sin(2 * np.pi * df_ratings['timestamp'] / 86400)
            df_ratings['timestamp_cos'] = np.cos(2 * np.pi * df_ratings['timestamp'] / 86400)
            df_features = df_ratings[['timestamp_sin', 'timestamp_cos']]
        else:  # (implement other feature creations here)
            raise NotImplementedError(f'Feature method {features_method} not yet implemented')
        return df_features
@@ -176,6 +201,4 @@ class ContentBased(AlgoBase):
        # (implement here the regressor prediction)
        return score
\ No newline at end of file
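The hunk above ends with a placeholder for the regressor prediction. As a hedged illustration of what the `linear_regression` regressor_method could do with these content features, here is a sketch that fits one scikit-learn `LinearRegression` per user; the helper name and the data layout are illustrative assumptions, not the repository's actual code:

``` python
import numpy as np
from sklearn.linear_model import LinearRegression

# Hedged sketch only: per-user linear regression on content features.
# fit_user_profiles and its argument layout are illustrative assumptions.
def fit_user_profiles(ratings, content_features):
    """Fit one LinearRegression per user on the features of the movies they rated.

    ratings: iterable of (user_id, item_id, rating) tuples
    content_features: DataFrame indexed by item id (as returned by create_content_features)
    """
    by_user = {}
    for uid, iid, rating in ratings:
        by_user.setdefault(uid, []).append((iid, rating))

    profiles = {}
    for uid, pairs in by_user.items():
        known = [(iid, r) for iid, r in pairs if iid in content_features.index]
        if len(known) < 2:  # not enough points to fit a regression line
            continue
        X = content_features.loc[[iid for iid, _ in known]].to_numpy(dtype=float)
        y = np.array([r for _, r in known], dtype=float)
        profiles[uid] = LinearRegression().fit(X, y)
    return profiles
```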