From fb8db879864ed060484b17ca6070f2ea74ae6a96 Mon Sep 17 00:00:00 2001
From: Adrienucl <adrien.payen@student.uclouvain.be>
Date: Mon, 6 May 2024 19:22:18 +0200
Subject: [PATCH] update for hackathon

---
 configs.py      |  25 ++++-
 evaluator.ipynb | 265 +++++++++++++++++++++++++++++++++++++++---------
 models.py       |  29 +++++-
 3 files changed, 265 insertions(+), 54 deletions(-)

diff --git a/configs.py b/configs.py
index 06dd89bd..2fdbd420 100644
--- a/configs.py
+++ b/configs.py
@@ -13,9 +13,27 @@ class EvalConfig:
         ("baseline_2", ModelBaseline2, {}),
         ("baseline_3", ModelBaseline3, {}),
         ("baseline_4", ModelBaseline4, {}),
-        ("ContentBased_sample", ContentBased, {"features_method" : "title_length", "regressor_method" : "random_sample"}),
-        ("ContentBased_score", ContentBased, {"features_method" : "title_length", "regressor_method" : "random_score"}),
-        ("ContentBased_Lr", ContentBased, {"features_method" : "title_length", "regressor_method" : "linear_regression"})
+        ("title_length_ContentBased_sample", ContentBased, {"features_method" : "title_length", "regressor_method" : "random_sample"}),
+        ("title_length_ContentBased_score", ContentBased, {"features_method" : "title_length", "regressor_method" : "random_score"}),
+        ("title_length_ContentBased_Lr", ContentBased, {"features_method" : "title_length", "regressor_method" : "linear_regression"}),
+        ("movie_year_ContentBased_sample", ContentBased, {"features_method" : "movie_year", "regressor_method" : "random_sample"}),
+        ("movie_year_ContentBased_score", ContentBased, {"features_method" : "movie_year", "regressor_method" : "random_score"}),
+        #("movie_year_ContentBased_Lr", ContentBased, {"features_method" : "movie_year", "regressor_method" : "linear_regression"})
+        ("genres_ContentBased_sample", ContentBased, {"features_method" : "genres", "regressor_method" : "random_sample"}),
+        ("genres_ContentBased_score", ContentBased, {"features_method" : "genres", "regressor_method" : "random_score"}),
+        #("genres_ContentBased_Lr", ContentBased, {"features_method" : "genres", "regressor_method" : "linear_regression"}),
+        ("rating_ContentBased_sample", ContentBased, {"features_method" : "rating", "regressor_method" : "random_sample"}),
+        ("rating_ContentBased_score", ContentBased, {"features_method" : "rating", "regressor_method" : "random_score"}),
+        #("rating_ContentBased_Lr", ContentBased, {"features_method" : "rating", "regressor_method" : "linear_regression"}),
+        ("tags_ContentBased_sample", ContentBased, {"features_method" : "tags", "regressor_method" : "random_sample"}),
+        ("tags_ContentBased_score", ContentBased, {"features_method" : "tags", "regressor_method" : "random_score"}),
+        #("tags_ContentBased_Lr", ContentBased, {"features_method" : "tags", "regressor_method" : "linear_regression"}),
+        ("tags_length_ContentBased_sample", ContentBased, {"features_method" : "tags_length", "regressor_method" : "random_sample"}),
+        ("tags_length_ContentBased_score", ContentBased, {"features_method" : "tags_length", "regressor_method" : "random_score"}),
+        #("tags_length_ContentBased_Lr", ContentBased, {"features_method" : "tags_length", "regressor_method" : "linear_regression"}),
+        ("timestamp_ContentBased_sample", ContentBased, {"features_method" : "timestamp", "regressor_method" : "random_sample"}),
+        ("timestamp_ContentBased_score", ContentBased, {"features_method" : "timestamp", "regressor_method" : "random_score"}),
+        #("timestamp_ContentBased_Lr", ContentBased, {"features_method" : "timestamp", "regressor_method" : "linear_regression"})
 
         # model_name, model class, model parameters (dict)
     ]
@@ -34,3 +52,4 @@ class EvalConfig:
 
     # Loo parameters
     top_n_value =  10 # -- configure the numer of recommendations (> 1) --
+
diff --git a/evaluator.ipynb b/evaluator.ipynb
index b88bfe44..efef1b74 100644
--- a/evaluator.ipynb
+++ b/evaluator.ipynb
@@ -13,7 +13,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 109,
+   "execution_count": 25,
    "id": "6aaf9140",
    "metadata": {},
    "outputs": [
@@ -59,7 +59,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 110,
+   "execution_count": 26,
    "id": "d6d82188",
    "metadata": {},
    "outputs": [],
@@ -201,7 +201,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 111,
+   "execution_count": 27,
    "id": "f1849e55",
    "metadata": {},
    "outputs": [],
@@ -257,7 +257,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 112,
+   "execution_count": 28,
    "id": "704f4d2a",
    "metadata": {},
    "outputs": [
@@ -289,19 +289,91 @@
       "- computing metric rmse\n",
       "Training loo predictions\n",
       "Training full predictions\n",
-      "Handling model ContentBased_sample\n",
+      "Handling model title_length_ContentBased_sample\n",
       "Training split predictions\n",
       "- computing metric mae\n",
       "- computing metric rmse\n",
       "Training loo predictions\n",
       "Training full predictions\n",
-      "Handling model ContentBased_score\n",
+      "Handling model title_length_ContentBased_score\n",
       "Training split predictions\n",
       "- computing metric mae\n",
       "- computing metric rmse\n",
       "Training loo predictions\n",
       "Training full predictions\n",
-      "Handling model ContentBased_Lr\n",
+      "Handling model title_length_ContentBased_Lr\n",
+      "Training split predictions\n",
+      "- computing metric mae\n",
+      "- computing metric rmse\n",
+      "Training loo predictions\n",
+      "Training full predictions\n",
+      "Handling model movie_year_ContentBased_sample\n",
+      "Training split predictions\n",
+      "- computing metric mae\n",
+      "- computing metric rmse\n",
+      "Training loo predictions\n",
+      "Training full predictions\n",
+      "Handling model movie_year_ContentBased_score\n",
+      "Training split predictions\n",
+      "- computing metric mae\n",
+      "- computing metric rmse\n",
+      "Training loo predictions\n",
+      "Training full predictions\n",
+      "Handling model genres_ContentBased_sample\n",
+      "Training split predictions\n",
+      "- computing metric mae\n",
+      "- computing metric rmse\n",
+      "Training loo predictions\n",
+      "Training full predictions\n",
+      "Handling model genres_ContentBased_score\n",
+      "Training split predictions\n",
+      "- computing metric mae\n",
+      "- computing metric rmse\n",
+      "Training loo predictions\n",
+      "Training full predictions\n",
+      "Handling model rating_ContentBased_sample\n",
+      "Training split predictions\n",
+      "- computing metric mae\n",
+      "- computing metric rmse\n",
+      "Training loo predictions\n",
+      "Training full predictions\n",
+      "Handling model rating_ContentBased_score\n",
+      "Training split predictions\n",
+      "- computing metric mae\n",
+      "- computing metric rmse\n",
+      "Training loo predictions\n",
+      "Training full predictions\n",
+      "Handling model tags_ContentBased_sample\n",
+      "Training split predictions\n",
+      "- computing metric mae\n",
+      "- computing metric rmse\n",
+      "Training loo predictions\n",
+      "Training full predictions\n",
+      "Handling model tags_ContentBased_score\n",
+      "Training split predictions\n",
+      "- computing metric mae\n",
+      "- computing metric rmse\n",
+      "Training loo predictions\n",
+      "Training full predictions\n",
+      "Handling model tags_length_ContentBased_sample\n",
+      "Training split predictions\n",
+      "- computing metric mae\n",
+      "- computing metric rmse\n",
+      "Training loo predictions\n",
+      "Training full predictions\n",
+      "Handling model tags_length_ContentBased_score\n",
+      "Training split predictions\n",
+      "- computing metric mae\n",
+      "- computing metric rmse\n",
+      "Training loo predictions\n",
+      "Training full predictions\n",
+      "Handling model timestamp_ContentBased_sample\n",
+      "Training split predictions\n",
+      "- computing metric mae\n",
+      "- computing metric rmse\n",
+      "Training loo predictions\n",
+      "Training full predictions\n",
+      "Handling model timestamp_ContentBased_score\n",
       "Training split predictions\n",
       "- computing metric mae\n",
       "- computing metric rmse\n",
@@ -340,69 +412,165 @@
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>baseline_1</th>\n",
-       "      <td>1.561178</td>\n",
-       "      <td>1.792482</td>\n",
-       "      <td>0.074766</td>\n",
-       "      <td>99.405607</td>\n",
+       "      <td>1.312500</td>\n",
+       "      <td>1.667708</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>4.0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>baseline_2</th>\n",
-       "      <td>1.471412</td>\n",
-       "      <td>1.819364</td>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>429.942991</td>\n",
+       "      <td>1.315250</td>\n",
+       "      <td>1.572990</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>4.0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>baseline_3</th>\n",
-       "      <td>0.878270</td>\n",
-       "      <td>1.085591</td>\n",
-       "      <td>0.074766</td>\n",
-       "      <td>99.405607</td>\n",
+       "      <td>1.318182</td>\n",
+       "      <td>1.465689</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>4.0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>baseline_4</th>\n",
-       "      <td>0.705673</td>\n",
-       "      <td>0.912313</td>\n",
-       "      <td>0.130841</td>\n",
-       "      <td>60.202804</td>\n",
+       "      <td>1.363953</td>\n",
+       "      <td>1.523985</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>4.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>title_length_ContentBased_sample</th>\n",
+       "      <td>1.375000</td>\n",
+       "      <td>1.750000</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>4.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>title_length_ContentBased_score</th>\n",
+       "      <td>1.556280</td>\n",
+       "      <td>2.063469</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>4.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>title_length_ContentBased_Lr</th>\n",
+       "      <td>1.625729</td>\n",
+       "      <td>1.773594</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>4.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>movie_year_ContentBased_sample</th>\n",
+       "      <td>2.250000</td>\n",
+       "      <td>2.610077</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>4.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>movie_year_ContentBased_score</th>\n",
+       "      <td>1.866274</td>\n",
+       "      <td>2.111422</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>4.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>genres_ContentBased_sample</th>\n",
+       "      <td>1.875000</td>\n",
+       "      <td>2.271136</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>4.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>genres_ContentBased_score</th>\n",
+       "      <td>1.463388</td>\n",
+       "      <td>1.793363</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>4.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>rating_ContentBased_sample</th>\n",
+       "      <td>1.289773</td>\n",
+       "      <td>1.715759</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>4.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>rating_ContentBased_score</th>\n",
+       "      <td>2.482206</td>\n",
+       "      <td>2.795490</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>4.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>tags_ContentBased_sample</th>\n",
+       "      <td>1.937500</td>\n",
+       "      <td>2.128673</td>\n",
+       "      <td>0.5</td>\n",
+       "      <td>4.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>tags_ContentBased_score</th>\n",
+       "      <td>1.683499</td>\n",
+       "      <td>1.782805</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>4.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>tags_length_ContentBased_sample</th>\n",
+       "      <td>1.187500</td>\n",
+       "      <td>1.704773</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>4.0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>ContentBased_sample</th>\n",
-       "      <td>1.013747</td>\n",
-       "      <td>1.350417</td>\n",
-       "      <td>0.084112</td>\n",
-       "      <td>178.048598</td>\n",
+       "      <th>tags_length_ContentBased_score</th>\n",
+       "      <td>1.564917</td>\n",
+       "      <td>1.944345</td>\n",
+       "      <td>0.5</td>\n",
+       "      <td>4.0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>ContentBased_score</th>\n",
-       "      <td>1.461846</td>\n",
-       "      <td>1.803067</td>\n",
-       "      <td>0.018692</td>\n",
-       "      <td>437.222430</td>\n",
+       "      <th>timestamp_ContentBased_sample</th>\n",
+       "      <td>1.875000</td>\n",
+       "      <td>2.277608</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>4.0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>ContentBased_Lr</th>\n",
-       "      <td>1.202626</td>\n",
-       "      <td>1.460273</td>\n",
-       "      <td>0.084112</td>\n",
-       "      <td>278.046729</td>\n",
+       "      <th>timestamp_ContentBased_score</th>\n",
+       "      <td>1.265317</td>\n",
+       "      <td>1.512329</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>4.0</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "                          mae      rmse  hit_rate     novelty\n",
-       "baseline_1           1.561178  1.792482  0.074766   99.405607\n",
-       "baseline_2           1.471412  1.819364  0.000000  429.942991\n",
-       "baseline_3           0.878270  1.085591  0.074766   99.405607\n",
-       "baseline_4           0.705673  0.912313  0.130841   60.202804\n",
-       "ContentBased_sample  1.013747  1.350417  0.084112  178.048598\n",
-       "ContentBased_score   1.461846  1.803067  0.018692  437.222430\n",
-       "ContentBased_Lr      1.202626  1.460273  0.084112  278.046729"
+       "                                       mae      rmse  hit_rate  novelty\n",
+       "baseline_1                        1.312500  1.667708       1.0      4.0\n",
+       "baseline_2                        1.315250  1.572990       1.0      4.0\n",
+       "baseline_3                        1.318182  1.465689       1.0      4.0\n",
+       "baseline_4                        1.363953  1.523985       1.0      4.0\n",
+       "title_length_ContentBased_sample  1.375000  1.750000       1.0      4.0\n",
+       "title_length_ContentBased_score   1.556280  2.063469       1.0      4.0\n",
+       "title_length_ContentBased_Lr      1.625729  1.773594       1.0      4.0\n",
+       "movie_year_ContentBased_sample    2.250000  2.610077       1.0      4.0\n",
+       "movie_year_ContentBased_score     1.866274  2.111422       1.0      4.0\n",
+       "genres_ContentBased_sample        1.875000  2.271136       1.0      4.0\n",
+       "genres_ContentBased_score         1.463388  1.793363       1.0      4.0\n",
+       "rating_ContentBased_sample        1.289773  1.715759       1.0      4.0\n",
+       "rating_ContentBased_score         2.482206  2.795490       1.0      4.0\n",
+       "tags_ContentBased_sample          1.937500  2.128673       0.5      4.0\n",
+       "tags_ContentBased_score           1.683499  1.782805       1.0      4.0\n",
+       "tags_length_ContentBased_sample   1.187500  1.704773       1.0      4.0\n",
+       "tags_length_ContentBased_score    1.564917  1.944345       0.5      4.0\n",
+       "timestamp_ContentBased_sample     1.875000  2.277608       1.0      4.0\n",
+       "timestamp_ContentBased_score      1.265317  1.512329       1.0      4.0"
       ]
      },
-     "execution_count": 112,
+     "execution_count": 28,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -432,7 +600,8 @@
    "id": "6f8b6d19",
    "metadata": {},
    "source": [
-    "dire quel modèle est meilleur ?\n"
+    "dire quel modèle est meilleur ?\n",
+    "(ici on a pris en compte les différents content features du model)"
    ]
   }
  ],
diff --git a/models.py b/models.py
index c288a5b8..7696fb26 100644
--- a/models.py
+++ b/models.py
@@ -97,10 +97,35 @@ class ContentBased(AlgoBase):
     def create_content_features(self, features_method):
         """Content Analyzer"""
         df_items = load_items()
+        # Build the item feature frame selected by features_method (None -> no features at all).
+        # Every branch yields a DataFrame; rating/tag data are only loaded by the branches that use them.
         if features_method is None:
             df_features = None
+
         elif features_method == "title_length": # a naive method that creates only 1 feature based on title length
             df_features = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('n_character_title')
+
+        elif features_method == "movie_year": # numeric release year parsed from "Title (YYYY)", NaN when absent
+            df_features = df_items['title'].str.extract(r'\((\d{4})\)', expand=False).astype(float).to_frame('movie_year')
+
+        elif features_method == "genres": # one 0/1 column per genre (the former loop kept only the last genre)
+            df_features = df_items['genres'].str.get_dummies(sep='|')
+
+        elif features_method == "rating": # one mean rating per movie rather than one row per rating
+            df_features = load_ratings().groupby('movieId')['rating'].mean().to_frame('avg_rating')
+
+        elif features_method in ("tags", "tags_length"): # per-movie tag count / total tag characters
+            # NOTE(review): assumes the tags file has a 'movieId' column - confirm against C.TAGS_FILENAME
+            df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME).dropna(subset=['tag'])
+            n_commas = df_tag['tag'].str.count(',')
+            per_row = n_commas + 1 if features_method == "tags" else df_tag['tag'].str.len() - n_commas
+            df_features = per_row.groupby(df_tag['movieId']).sum().to_frame(features_method)
+
+        elif features_method == "timestamp": # cyclic time-of-day encoding; sin AND cos kept, averaged per movie
+            df_ratings = load_ratings()
+            angle = 2 * np.pi * df_ratings['timestamp'] / 86400 # 86400 s = one day
+            df_features = pd.DataFrame({'timestamp_sin': np.sin(angle), 'timestamp_cos': np.cos(angle)}).groupby(df_ratings['movieId']).mean()
+
         else: # (implement other feature creations here)
             raise NotImplementedError(f'Feature method {features_method} not yet implemented')
         return df_features
@@ -176,6 +201,4 @@ class ContentBased(AlgoBase):
 
             # (implement here the regressor prediction)
 
-        return score
-
-
+        return score
\ No newline at end of file
-- 
GitLab