From fb8db879864ed060484b17ca6070f2ea74ae6a96 Mon Sep 17 00:00:00 2001 From: Adrienucl <adrien.payen@student.uclouvain.be> Date: Mon, 6 May 2024 19:22:18 +0200 Subject: [PATCH] update for hackathon --- configs.py | 25 ++++- evaluator.ipynb | 265 +++++++++++++++++++++++++++++++++++++++--------- models.py | 29 +++++- 3 files changed, 265 insertions(+), 54 deletions(-) diff --git a/configs.py b/configs.py index 06dd89bd..2fdbd420 100644 --- a/configs.py +++ b/configs.py @@ -13,9 +13,27 @@ class EvalConfig: ("baseline_2", ModelBaseline2, {}), ("baseline_3", ModelBaseline3, {}), ("baseline_4", ModelBaseline4, {}), - ("ContentBased_sample", ContentBased, {"features_method" : "title_length", "regressor_method" : "random_sample"}), - ("ContentBased_score", ContentBased, {"features_method" : "title_length", "regressor_method" : "random_score"}), - ("ContentBased_Lr", ContentBased, {"features_method" : "title_length", "regressor_method" : "linear_regression"}) + ("title_length_ContentBased_sample", ContentBased, {"features_method" : "title_length", "regressor_method" : "random_sample"}), + ("title_length_ContentBased_score", ContentBased, {"features_method" : "title_length", "regressor_method" : "random_score"}), + ("title_length_ContentBased_Lr", ContentBased, {"features_method" : "title_length", "regressor_method" : "linear_regression"}), + ("movie_year_ContentBased_sample", ContentBased, {"features_method" : "movie_year", "regressor_method" : "random_sample"}), + ("movie_year_ContentBased_score", ContentBased, {"features_method" : "movie_year", "regressor_method" : "random_score"}), + #("movie_year_ContentBased_Lr", ContentBased, {"features_method" : "movie_year", "regressor_method" : "linear_regression"}) + ("genres_ContentBased_sample", ContentBased, {"features_method" : "genres", "regressor_method" : "random_sample"}), + ("genres_ContentBased_score", ContentBased, {"features_method" : "genres", "regressor_method" : "random_score"}), + #("genres_ContentBased_Lr", ContentBased, {"features_method" : "genres", "regressor_method" : "linear_regression"}), + ("rating_ContentBased_sample", ContentBased, {"features_method" : "rating", "regressor_method" : "random_sample"}), + ("rating_ContentBased_score", ContentBased, {"features_method" : "rating", "regressor_method" : "random_score"}), + #("rating_ContentBased_Lr", ContentBased, {"features_method" : "rating", "regressor_method" : "linear_regression"}), + ("tags_ContentBased_sample", ContentBased, {"features_method" : "tags", "regressor_method" : "random_sample"}), + ("tags_ContentBased_score", ContentBased, {"features_method" : "tags", "regressor_method" : "random_score"}), + #("tags_ContentBased_Lr", ContentBased, {"features_method" : "tags", "regressor_method" : "linear_regression"}), + ("tags_length_ContentBased_sample", ContentBased, {"features_method" : "tags_length", "regressor_method" : "random_sample"}), + ("tags_length_ContentBased_score", ContentBased, {"features_method" : "tags_length", "regressor_method" : "random_score"}), + #("tags_length_ContentBased_Lr", ContentBased, {"features_method" : "tags_length", "regressor_method" : "linear_regression"}), + ("timestamp_ContentBased_sample", ContentBased, {"features_method" : "timestamp", "regressor_method" : "random_sample"}), + ("timestamp_ContentBased_score", ContentBased, {"features_method" : "timestamp", "regressor_method" : "random_score"}), + #("timestamp_ContentBased_Lr", ContentBased, {"features_method" : "timestamp", "regressor_method" : "linear_regression"}) # model_name, model class, model parameters (dict) ] @@ -34,3 +52,4 @@ class EvalConfig: # Loo parameters top_n_value = 10 # -- configure the numer of recommendations (> 1) -- + diff --git a/evaluator.ipynb b/evaluator.ipynb index b88bfe44..efef1b74 100644 --- a/evaluator.ipynb +++ b/evaluator.ipynb @@ -13,7 +13,7 @@ }, { "cell_type": "code", - "execution_count": 109, + "execution_count": 25, "id": "6aaf9140", "metadata": {}, "outputs": [ @@ -59,7 +59,7 @@ }, { "cell_type": "code", - "execution_count": 110, + "execution_count": 26, "id": "d6d82188", "metadata": {}, "outputs": [], @@ -201,7 +201,7 @@ }, { "cell_type": "code", - "execution_count": 111, + "execution_count": 27, "id": "f1849e55", "metadata": {}, "outputs": [], @@ -257,7 +257,7 @@ }, { "cell_type": "code", - "execution_count": 112, + "execution_count": 28, "id": "704f4d2a", "metadata": {}, "outputs": [ @@ -289,19 +289,91 @@ "- computing metric rmse\n", "Training loo predictions\n", "Training full predictions\n", - "Handling model ContentBased_sample\n", + "Handling model title_length_ContentBased_sample\n", "Training split predictions\n", "- computing metric mae\n", "- computing metric rmse\n", "Training loo predictions\n", "Training full predictions\n", - "Handling model ContentBased_score\n", + "Handling model title_length_ContentBased_score\n", "Training split predictions\n", "- computing metric mae\n", "- computing metric rmse\n", "Training loo predictions\n", "Training full predictions\n", - "Handling model ContentBased_Lr\n", + "Handling model title_length_ContentBased_Lr\n", + "Training split predictions\n", + "- computing metric mae\n", + "- computing metric rmse\n", + "Training loo predictions\n", + "Training full predictions\n", + "Handling model movie_year_ContentBased_sample\n", + "Training split predictions\n", + "- computing metric mae\n", + "- computing metric rmse\n", + "Training loo predictions\n", + "Training full predictions\n", + "Handling model movie_year_ContentBased_score\n", + "Training split predictions\n", + "- computing metric mae\n", + "- computing metric rmse\n", + "Training loo predictions\n", + "Training full predictions\n", + "Handling model genres_ContentBased_sample\n", + "Training split predictions\n", + "- computing metric mae\n", + "- computing metric rmse\n", + "Training loo predictions\n", + "Training full predictions\n", + "Handling model genres_ContentBased_score\n", + "Training split predictions\n", + "- computing metric mae\n", + "- computing metric rmse\n", + "Training loo predictions\n", + "Training full predictions\n", + "Handling model rating_ContentBased_sample\n", + "Training split predictions\n", + "- computing metric mae\n", + "- computing metric rmse\n", + "Training loo predictions\n", + "Training full predictions\n", + "Handling model rating_ContentBased_score\n", + "Training split predictions\n", + "- computing metric mae\n", + "- computing metric rmse\n", + "Training loo predictions\n", + "Training full predictions\n", + "Handling model tags_ContentBased_sample\n", + "Training split predictions\n", + "- computing metric mae\n", + "- computing metric rmse\n", + "Training loo predictions\n", + "Training full predictions\n", + "Handling model tags_ContentBased_score\n", + "Training split predictions\n", + "- computing metric mae\n", + "- computing metric rmse\n", + "Training loo predictions\n", + "Training full predictions\n", + "Handling model tags_length_ContentBased_sample\n", + "Training split predictions\n", + "- computing metric mae\n", + "- computing metric rmse\n", + "Training loo predictions\n", + "Training full predictions\n", + "Handling model tags_length_ContentBased_score\n", + "Training split predictions\n", + "- computing metric mae\n", + "- computing metric rmse\n", + "Training loo predictions\n", + "Training full predictions\n", + "Handling model timestamp_ContentBased_sample\n", + "Training split predictions\n", + "- computing metric mae\n", + "- computing metric rmse\n", + "Training loo predictions\n", + "Training full predictions\n", + "Handling model timestamp_ContentBased_score\n", "Training split predictions\n", "- computing metric mae\n", "- computing metric rmse\n", @@ -340,69 +412,165 @@ " <tbody>\n", " <tr>\n", " <th>baseline_1</th>\n", - " <td>1.561178</td>\n", - " <td>1.792482</td>\n", - " <td>0.074766</td>\n", - " <td>99.405607</td>\n", + " <td>1.312500</td>\n", + " <td>1.667708</td>\n", + " <td>1.0</td>\n", + " <td>4.0</td>\n", " </tr>\n", " <tr>\n", " <th>baseline_2</th>\n", - " <td>1.471412</td>\n", - " <td>1.819364</td>\n", - " <td>0.000000</td>\n", - " <td>429.942991</td>\n", + " <td>1.315250</td>\n", + " <td>1.572990</td>\n", + " <td>1.0</td>\n", + " <td>4.0</td>\n", " </tr>\n", " <tr>\n", " <th>baseline_3</th>\n", - " <td>0.878270</td>\n", - " <td>1.085591</td>\n", - " <td>0.074766</td>\n", - " <td>99.405607</td>\n", + " <td>1.318182</td>\n", + " <td>1.465689</td>\n", + " <td>1.0</td>\n", + " <td>4.0</td>\n", " </tr>\n", " <tr>\n", " <th>baseline_4</th>\n", - " <td>0.705673</td>\n", - " <td>0.912313</td>\n", - " <td>0.130841</td>\n", - " <td>60.202804</td>\n", + " <td>1.363953</td>\n", + " <td>1.523985</td>\n", + " <td>1.0</td>\n", + " <td>4.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>title_length_ContentBased_sample</th>\n", + " <td>1.375000</td>\n", + " <td>1.750000</td>\n", + " <td>1.0</td>\n", + " <td>4.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>title_length_ContentBased_score</th>\n", + " <td>1.556280</td>\n", + " <td>2.063469</td>\n", + " <td>1.0</td>\n", + " <td>4.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>title_length_ContentBased_Lr</th>\n", + " <td>1.625729</td>\n", + " <td>1.773594</td>\n", + " <td>1.0</td>\n", + " <td>4.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>movie_year_ContentBased_sample</th>\n", + " <td>2.250000</td>\n", + " <td>2.610077</td>\n", + " <td>1.0</td>\n", + " <td>4.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>movie_year_ContentBased_score</th>\n", + " <td>1.866274</td>\n", + " <td>2.111422</td>\n", + " <td>1.0</td>\n", + " <td>4.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>genres_ContentBased_sample</th>\n", + " <td>1.875000</td>\n", + " <td>2.271136</td>\n", + " <td>1.0</td>\n", + " <td>4.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>genres_ContentBased_score</th>\n", + " <td>1.463388</td>\n", + " <td>1.793363</td>\n", + " <td>1.0</td>\n", + " <td>4.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>rating_ContentBased_sample</th>\n", + " <td>1.289773</td>\n", + " <td>1.715759</td>\n", + " <td>1.0</td>\n", + " <td>4.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>rating_ContentBased_score</th>\n", + " <td>2.482206</td>\n", + " <td>2.795490</td>\n", + " <td>1.0</td>\n", + " <td>4.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>tags_ContentBased_sample</th>\n", + " <td>1.937500</td>\n", + " <td>2.128673</td>\n", + " <td>0.5</td>\n", + " <td>4.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>tags_ContentBased_score</th>\n", + " <td>1.683499</td>\n", + " <td>1.782805</td>\n", + " <td>1.0</td>\n", + " <td>4.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>tags_length_ContentBased_sample</th>\n", + " <td>1.187500</td>\n", + " <td>1.704773</td>\n", + " <td>1.0</td>\n", + " <td>4.0</td>\n", " </tr>\n", " <tr>\n", - " <th>ContentBased_sample</th>\n", - " <td>1.013747</td>\n", - " <td>1.350417</td>\n", - " <td>0.084112</td>\n", - " <td>178.048598</td>\n", + " <th>tags_length_ContentBased_score</th>\n", + " <td>1.564917</td>\n", + " <td>1.944345</td>\n", + " <td>0.5</td>\n", + " <td>4.0</td>\n", " </tr>\n", " <tr>\n", - " <th>ContentBased_score</th>\n", - " <td>1.461846</td>\n", - " <td>1.803067</td>\n", - " <td>0.018692</td>\n", - " <td>437.222430</td>\n", + " <th>timestamp_ContentBased_sample</th>\n", + " <td>1.875000</td>\n", + " <td>2.277608</td>\n", + " <td>1.0</td>\n", + " <td>4.0</td>\n", " </tr>\n", " <tr>\n", - " <th>ContentBased_Lr</th>\n", - " <td>1.202626</td>\n", - " <td>1.460273</td>\n", - " <td>0.084112</td>\n", - " <td>278.046729</td>\n", + " <th>timestamp_ContentBased_score</th>\n", + " <td>1.265317</td>\n", + " <td>1.512329</td>\n", + " <td>1.0</td>\n", + " <td>4.0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ - " mae rmse hit_rate novelty\n", - "baseline_1 1.561178 1.792482 0.074766 99.405607\n", - "baseline_2 1.471412 1.819364 0.000000 429.942991\n", - "baseline_3 0.878270 1.085591 0.074766 99.405607\n", - "baseline_4 0.705673 0.912313 0.130841 60.202804\n", - "ContentBased_sample 1.013747 1.350417 0.084112 178.048598\n", - "ContentBased_score 1.461846 1.803067 0.018692 437.222430\n", - "ContentBased_Lr 1.202626 1.460273 0.084112 278.046729" + " mae rmse hit_rate novelty\n", + "baseline_1 1.312500 1.667708 1.0 4.0\n", + "baseline_2 1.315250 1.572990 1.0 4.0\n", + "baseline_3 1.318182 1.465689 1.0 4.0\n", + "baseline_4 1.363953 1.523985 1.0 4.0\n", + "title_length_ContentBased_sample 1.375000 1.750000 1.0 4.0\n", + "title_length_ContentBased_score 1.556280 2.063469 1.0 4.0\n", + "title_length_ContentBased_Lr 1.625729 1.773594 1.0 4.0\n", + "movie_year_ContentBased_sample 2.250000 2.610077 1.0 4.0\n", + "movie_year_ContentBased_score 1.866274 2.111422 1.0 4.0\n", + "genres_ContentBased_sample 1.875000 2.271136 1.0 4.0\n", + "genres_ContentBased_score 1.463388 1.793363 1.0 4.0\n", + "rating_ContentBased_sample 1.289773 1.715759 1.0 4.0\n", + "rating_ContentBased_score 2.482206 2.795490 1.0 4.0\n", + "tags_ContentBased_sample 1.937500 2.128673 0.5 4.0\n", + "tags_ContentBased_score 1.683499 1.782805 1.0 4.0\n", + "tags_length_ContentBased_sample 1.187500 1.704773 1.0 4.0\n", + "tags_length_ContentBased_score 1.564917 1.944345 0.5 4.0\n", + "timestamp_ContentBased_sample 1.875000 2.277608 1.0 4.0\n", + "timestamp_ContentBased_score 1.265317 1.512329 1.0 4.0" ] }, - "execution_count": 112, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -432,7 +600,8 @@ "id": "6f8b6d19", "metadata": {}, "source": [ - "dire quel modèle est meilleur ?\n" + "dire quel modèle est meilleur ?\n", + "(ici on a pris en compte les différents content features du model)" ] } ], diff --git a/models.py b/models.py index c288a5b8..7696fb26 100644 --- a/models.py +++ b/models.py @@ -97,10 +97,35 @@ class ContentBased(AlgoBase): def create_content_features(self, features_method): """Content Analyzer""" df_items = load_items() + df_ratings = load_ratings() + df_tag = df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME) if features_method is None: df_features = None + elif features_method == "title_length": # a naive method that creates only 1 feature based on title length df_features = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('n_character_title') + + elif features_method == "movie_year" : + df_features = df_items['movie_year'] = df_items['title'].str.extract(r'\((\d{4})\)', expand=False) + + elif features_method == "genres" : + genres_list = df_items['genres'].str.split('|').explode().unique() + for genre in genres_list: + df_features = df_items['genres'].str.contains(genre).astype(int) + + elif features_method == "rating" : + df_features = df_ratings.groupby('movieId')['rating'].transform('mean').to_frame('avg_rating') + + elif features_method == "tags" : + df_features = df_tag['tag'].apply(lambda x: len(x.split(','))) + + elif features_method == "tags_length" : + df_features = df_tag['tag'].apply(lambda x: sum(len(tag) for tag in x.split(','))) + + elif features_method == "timestamp" : + df_features = df_ratings['timestamp_sin'] = np.sin(2 * np.pi * df_ratings['timestamp'] / 86400) + df_features = df_ratings['timestamp_cos'] = np.cos(2 * np.pi * df_ratings['timestamp'] / 86400) + else: # (implement other feature creations here) raise NotImplementedError(f'Feature method {features_method} not yet implemented') return df_features @@ -176,6 +201,4 @@ class ContentBased(AlgoBase): # (implement here the regressor prediction) - return score - - + return score \ No newline at end of file -- GitLab