diff --git a/configs.py b/configs.py
index 06dd89bde99d75a7e84859d504aaea21eab2675e..2fdbd42033f60f409b2029ee4868980916704a28 100644
--- a/configs.py
+++ b/configs.py
@@ -13,9 +13,27 @@ class EvalConfig:
("baseline_2", ModelBaseline2, {}),
("baseline_3", ModelBaseline3, {}),
("baseline_4", ModelBaseline4, {}),
- ("ContentBased_sample", ContentBased, {"features_method" : "title_length", "regressor_method" : "random_sample"}),
- ("ContentBased_score", ContentBased, {"features_method" : "title_length", "regressor_method" : "random_score"}),
- ("ContentBased_Lr", ContentBased, {"features_method" : "title_length", "regressor_method" : "linear_regression"})
+ ("title_length_ContentBased_sample", ContentBased, {"features_method" : "title_length", "regressor_method" : "random_sample"}),
+ ("title_length_ContentBased_score", ContentBased, {"features_method" : "title_length", "regressor_method" : "random_score"}),
+ ("title_length_ContentBased_Lr", ContentBased, {"features_method" : "title_length", "regressor_method" : "linear_regression"}),
+ ("movie_year_ContentBased_sample", ContentBased, {"features_method" : "movie_year", "regressor_method" : "random_sample"}),
+ ("movie_year_ContentBased_score", ContentBased, {"features_method" : "movie_year", "regressor_method" : "random_score"}),
+ #("movie_year_ContentBased_Lr", ContentBased, {"features_method" : "movie_year", "regressor_method" : "linear_regression"})
+ ("genres_ContentBased_sample", ContentBased, {"features_method" : "genres", "regressor_method" : "random_sample"}),
+ ("genres_ContentBased_score", ContentBased, {"features_method" : "genres", "regressor_method" : "random_score"}),
+ #("genres_ContentBased_Lr", ContentBased, {"features_method" : "genres", "regressor_method" : "linear_regression"}),
+ ("rating_ContentBased_sample", ContentBased, {"features_method" : "rating", "regressor_method" : "random_sample"}),
+ ("rating_ContentBased_score", ContentBased, {"features_method" : "rating", "regressor_method" : "random_score"}),
+ #("rating_ContentBased_Lr", ContentBased, {"features_method" : "rating", "regressor_method" : "linear_regression"}),
+ ("tags_ContentBased_sample", ContentBased, {"features_method" : "tags", "regressor_method" : "random_sample"}),
+ ("tags_ContentBased_score", ContentBased, {"features_method" : "tags", "regressor_method" : "random_score"}),
+ #("tags_ContentBased_Lr", ContentBased, {"features_method" : "tags", "regressor_method" : "linear_regression"}),
+ ("tags_length_ContentBased_sample", ContentBased, {"features_method" : "tags_length", "regressor_method" : "random_sample"}),
+ ("tags_length_ContentBased_score", ContentBased, {"features_method" : "tags_length", "regressor_method" : "random_score"}),
+ #("tags_length_ContentBased_Lr", ContentBased, {"features_method" : "tags_length", "regressor_method" : "linear_regression"}),
+ ("timestamp_ContentBased_sample", ContentBased, {"features_method" : "timestamp", "regressor_method" : "random_sample"}),
+ ("timestamp_ContentBased_score", ContentBased, {"features_method" : "timestamp", "regressor_method" : "random_score"}),
+ #("timestamp_ContentBased_Lr", ContentBased, {"features_method" : "timestamp", "regressor_method" : "linear_regression"})
# model_name, model class, model parameters (dict)
]
@@ -34,3 +52,4 @@ class EvalConfig:
# Loo parameters
top_n_value = 10 # -- configure the numer of recommendations (> 1) --
+
diff --git a/evaluator.ipynb b/evaluator.ipynb
index b88bfe44a4e7d2898edb216abecfd1b673f059b6..efef1b7475bc2ec4598c9b8af197c29d5c358977 100644
--- a/evaluator.ipynb
+++ b/evaluator.ipynb
@@ -13,7 +13,7 @@
},
{
"cell_type": "code",
- "execution_count": 109,
+ "execution_count": 25,
"id": "6aaf9140",
"metadata": {},
"outputs": [
@@ -59,7 +59,7 @@
},
{
"cell_type": "code",
- "execution_count": 110,
+ "execution_count": 26,
"id": "d6d82188",
"metadata": {},
"outputs": [],
@@ -201,7 +201,7 @@
},
{
"cell_type": "code",
- "execution_count": 111,
+ "execution_count": 27,
"id": "f1849e55",
"metadata": {},
"outputs": [],
@@ -257,7 +257,7 @@
},
{
"cell_type": "code",
- "execution_count": 112,
+ "execution_count": 28,
"id": "704f4d2a",
"metadata": {},
"outputs": [
@@ -289,19 +289,91 @@
"- computing metric rmse\n",
"Training loo predictions\n",
"Training full predictions\n",
- "Handling model ContentBased_sample\n",
+ "Handling model title_length_ContentBased_sample\n",
"Training split predictions\n",
"- computing metric mae\n",
"- computing metric rmse\n",
"Training loo predictions\n",
"Training full predictions\n",
- "Handling model ContentBased_score\n",
+ "Handling model title_length_ContentBased_score\n",
"Training split predictions\n",
"- computing metric mae\n",
"- computing metric rmse\n",
"Training loo predictions\n",
"Training full predictions\n",
- "Handling model ContentBased_Lr\n",
+ "Handling model title_length_ContentBased_Lr\n",
+ "Training split predictions\n",
+ "- computing metric mae\n",
+ "- computing metric rmse\n",
+ "Training loo predictions\n",
+ "Training full predictions\n",
+ "Handling model movie_year_ContentBased_sample\n",
+ "Training split predictions\n",
+ "- computing metric mae\n",
+ "- computing metric rmse\n",
+ "Training loo predictions\n",
+ "Training full predictions\n",
+ "Handling model movie_year_ContentBased_score\n",
+ "Training split predictions\n",
+ "- computing metric mae\n",
+ "- computing metric rmse\n",
+ "Training loo predictions\n",
+ "Training full predictions\n",
+ "Handling model genres_ContentBased_sample\n",
+ "Training split predictions\n",
+ "- computing metric mae\n",
+ "- computing metric rmse\n",
+ "Training loo predictions\n",
+ "Training full predictions\n",
+ "Handling model genres_ContentBased_score\n",
+ "Training split predictions\n",
+ "- computing metric mae\n",
+ "- computing metric rmse\n",
+ "Training loo predictions\n",
+ "Training full predictions\n",
+ "Handling model rating_ContentBased_sample\n",
+ "Training split predictions\n",
+ "- computing metric mae\n",
+ "- computing metric rmse\n",
+ "Training loo predictions\n",
+ "Training full predictions\n",
+ "Handling model rating_ContentBased_score\n",
+ "Training split predictions\n",
+ "- computing metric mae\n",
+ "- computing metric rmse\n",
+ "Training loo predictions\n",
+ "Training full predictions\n",
+ "Handling model tags_ContentBased_sample\n",
+ "Training split predictions\n",
+ "- computing metric mae\n",
+ "- computing metric rmse\n",
+ "Training loo predictions\n",
+ "Training full predictions\n",
+ "Handling model tags_ContentBased_score\n",
+ "Training split predictions\n",
+ "- computing metric mae\n",
+ "- computing metric rmse\n",
+ "Training loo predictions\n",
+ "Training full predictions\n",
+ "Handling model tags_length_ContentBased_sample\n",
+ "Training split predictions\n",
+ "- computing metric mae\n",
+ "- computing metric rmse\n",
+ "Training loo predictions\n",
+ "Training full predictions\n",
+ "Handling model tags_length_ContentBased_score\n",
+ "Training split predictions\n",
+ "- computing metric mae\n",
+ "- computing metric rmse\n",
+ "Training loo predictions\n",
+ "Training full predictions\n",
+ "Handling model timestamp_ContentBased_sample\n",
+ "Training split predictions\n",
+ "- computing metric mae\n",
+ "- computing metric rmse\n",
+ "Training loo predictions\n",
+ "Training full predictions\n",
+ "Handling model timestamp_ContentBased_score\n",
"Training split predictions\n",
"- computing metric mae\n",
"- computing metric rmse\n",
@@ -340,69 +412,165 @@
" <tbody>\n",
" <tr>\n",
" <th>baseline_1</th>\n",
- " <td>1.561178</td>\n",
- " <td>1.792482</td>\n",
- " <td>0.074766</td>\n",
- " <td>99.405607</td>\n",
+ " <td>1.312500</td>\n",
+ " <td>1.667708</td>\n",
+ " <td>1.0</td>\n",
+ " <td>4.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>baseline_2</th>\n",
- " <td>1.471412</td>\n",
- " <td>1.819364</td>\n",
- " <td>0.000000</td>\n",
- " <td>429.942991</td>\n",
+ " <td>1.315250</td>\n",
+ " <td>1.572990</td>\n",
+ " <td>1.0</td>\n",
+ " <td>4.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>baseline_3</th>\n",
- " <td>0.878270</td>\n",
- " <td>1.085591</td>\n",
- " <td>0.074766</td>\n",
- " <td>99.405607</td>\n",
+ " <td>1.318182</td>\n",
+ " <td>1.465689</td>\n",
+ " <td>1.0</td>\n",
+ " <td>4.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>baseline_4</th>\n",
- " <td>0.705673</td>\n",
- " <td>0.912313</td>\n",
- " <td>0.130841</td>\n",
- " <td>60.202804</td>\n",
+ " <td>1.363953</td>\n",
+ " <td>1.523985</td>\n",
+ " <td>1.0</td>\n",
+ " <td>4.0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>title_length_ContentBased_sample</th>\n",
+ " <td>1.375000</td>\n",
+ " <td>1.750000</td>\n",
+ " <td>1.0</td>\n",
+ " <td>4.0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>title_length_ContentBased_score</th>\n",
+ " <td>1.556280</td>\n",
+ " <td>2.063469</td>\n",
+ " <td>1.0</td>\n",
+ " <td>4.0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>title_length_ContentBased_Lr</th>\n",
+ " <td>1.625729</td>\n",
+ " <td>1.773594</td>\n",
+ " <td>1.0</td>\n",
+ " <td>4.0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>movie_year_ContentBased_sample</th>\n",
+ " <td>2.250000</td>\n",
+ " <td>2.610077</td>\n",
+ " <td>1.0</td>\n",
+ " <td>4.0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>movie_year_ContentBased_score</th>\n",
+ " <td>1.866274</td>\n",
+ " <td>2.111422</td>\n",
+ " <td>1.0</td>\n",
+ " <td>4.0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>genres_ContentBased_sample</th>\n",
+ " <td>1.875000</td>\n",
+ " <td>2.271136</td>\n",
+ " <td>1.0</td>\n",
+ " <td>4.0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>genres_ContentBased_score</th>\n",
+ " <td>1.463388</td>\n",
+ " <td>1.793363</td>\n",
+ " <td>1.0</td>\n",
+ " <td>4.0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>rating_ContentBased_sample</th>\n",
+ " <td>1.289773</td>\n",
+ " <td>1.715759</td>\n",
+ " <td>1.0</td>\n",
+ " <td>4.0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>rating_ContentBased_score</th>\n",
+ " <td>2.482206</td>\n",
+ " <td>2.795490</td>\n",
+ " <td>1.0</td>\n",
+ " <td>4.0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>tags_ContentBased_sample</th>\n",
+ " <td>1.937500</td>\n",
+ " <td>2.128673</td>\n",
+ " <td>0.5</td>\n",
+ " <td>4.0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>tags_ContentBased_score</th>\n",
+ " <td>1.683499</td>\n",
+ " <td>1.782805</td>\n",
+ " <td>1.0</td>\n",
+ " <td>4.0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>tags_length_ContentBased_sample</th>\n",
+ " <td>1.187500</td>\n",
+ " <td>1.704773</td>\n",
+ " <td>1.0</td>\n",
+ " <td>4.0</td>\n",
" </tr>\n",
" <tr>\n",
- " <th>ContentBased_sample</th>\n",
- " <td>1.013747</td>\n",
- " <td>1.350417</td>\n",
- " <td>0.084112</td>\n",
- " <td>178.048598</td>\n",
+ " <th>tags_length_ContentBased_score</th>\n",
+ " <td>1.564917</td>\n",
+ " <td>1.944345</td>\n",
+ " <td>0.5</td>\n",
+ " <td>4.0</td>\n",
" </tr>\n",
" <tr>\n",
- " <th>ContentBased_score</th>\n",
- " <td>1.461846</td>\n",
- " <td>1.803067</td>\n",
- " <td>0.018692</td>\n",
- " <td>437.222430</td>\n",
+ " <th>timestamp_ContentBased_sample</th>\n",
+ " <td>1.875000</td>\n",
+ " <td>2.277608</td>\n",
+ " <td>1.0</td>\n",
+ " <td>4.0</td>\n",
" </tr>\n",
" <tr>\n",
- " <th>ContentBased_Lr</th>\n",
- " <td>1.202626</td>\n",
- " <td>1.460273</td>\n",
- " <td>0.084112</td>\n",
- " <td>278.046729</td>\n",
+ " <th>timestamp_ContentBased_score</th>\n",
+ " <td>1.265317</td>\n",
+ " <td>1.512329</td>\n",
+ " <td>1.0</td>\n",
+ " <td>4.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
- " mae rmse hit_rate novelty\n",
- "baseline_1 1.561178 1.792482 0.074766 99.405607\n",
- "baseline_2 1.471412 1.819364 0.000000 429.942991\n",
- "baseline_3 0.878270 1.085591 0.074766 99.405607\n",
- "baseline_4 0.705673 0.912313 0.130841 60.202804\n",
- "ContentBased_sample 1.013747 1.350417 0.084112 178.048598\n",
- "ContentBased_score 1.461846 1.803067 0.018692 437.222430\n",
- "ContentBased_Lr 1.202626 1.460273 0.084112 278.046729"
+ " mae rmse hit_rate novelty\n",
+ "baseline_1 1.312500 1.667708 1.0 4.0\n",
+ "baseline_2 1.315250 1.572990 1.0 4.0\n",
+ "baseline_3 1.318182 1.465689 1.0 4.0\n",
+ "baseline_4 1.363953 1.523985 1.0 4.0\n",
+ "title_length_ContentBased_sample 1.375000 1.750000 1.0 4.0\n",
+ "title_length_ContentBased_score 1.556280 2.063469 1.0 4.0\n",
+ "title_length_ContentBased_Lr 1.625729 1.773594 1.0 4.0\n",
+ "movie_year_ContentBased_sample 2.250000 2.610077 1.0 4.0\n",
+ "movie_year_ContentBased_score 1.866274 2.111422 1.0 4.0\n",
+ "genres_ContentBased_sample 1.875000 2.271136 1.0 4.0\n",
+ "genres_ContentBased_score 1.463388 1.793363 1.0 4.0\n",
+ "rating_ContentBased_sample 1.289773 1.715759 1.0 4.0\n",
+ "rating_ContentBased_score 2.482206 2.795490 1.0 4.0\n",
+ "tags_ContentBased_sample 1.937500 2.128673 0.5 4.0\n",
+ "tags_ContentBased_score 1.683499 1.782805 1.0 4.0\n",
+ "tags_length_ContentBased_sample 1.187500 1.704773 1.0 4.0\n",
+ "tags_length_ContentBased_score 1.564917 1.944345 0.5 4.0\n",
+ "timestamp_ContentBased_sample 1.875000 2.277608 1.0 4.0\n",
+ "timestamp_ContentBased_score 1.265317 1.512329 1.0 4.0"
]
},
- "execution_count": 112,
+ "execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
@@ -432,7 +600,8 @@
"id": "6f8b6d19",
"metadata": {},
"source": [
- "dire quel modèle est meilleur ?\n"
+ "dire quel modèle est meilleur ?\n",
+ "(ici on a pris en compte les différents content features du modèle)"
]
}
],
diff --git a/models.py b/models.py
index c288a5b8f7812d2b4187ec75540a77012b0997b3..7696fb261ca83ff0c1ee414c0cb5301f079c0c3c 100644
--- a/models.py
+++ b/models.py
@@ -97,10 +97,65 @@ class ContentBased(AlgoBase):
     def create_content_features(self, features_method):
-        """Content Analyzer"""
+        """Content Analyzer: build the feature table for the chosen recipe.
+
+        Args:
+            features_method: Name of the feature recipe, or None. Supported
+                names: "title_length", "movie_year", "genres", "rating",
+                "tags", "tags_length", "timestamp".
+
+        Returns:
+            None when features_method is None; otherwise a DataFrame holding
+            the feature column(s), one row per item.
+
+        Raises:
+            NotImplementedError: If features_method is not a supported name.
+        """
         df_items = load_items()
         if features_method is None:
             df_features = None
+
         elif features_method == "title_length": # a naive method that creates only 1 feature based on title length
             df_features = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('n_character_title')
+
+        elif features_method == "movie_year":
+            # The year is the four-digit number in parentheses in the title;
+            # titles without one become NaN.
+            years = df_items['title'].str.extract(r'\((\d{4})\)', expand=False)
+            df_features = pd.to_numeric(years, errors='coerce').to_frame('movie_year')
+
+        elif features_method == "genres":
+            # One 0/1 indicator column per genre found in the '|'-separated field.
+            df_features = df_items['genres'].str.get_dummies(sep='|')
+
+        elif features_method == "rating":
+            # One mean per movie, indexed by movieId.
+            # NOTE(review): presumably movieId matches the index of df_items - confirm.
+            df_ratings = load_ratings()
+            df_features = df_ratings.groupby('movieId')['rating'].mean().to_frame('avg_rating')
+
+        elif features_method == "tags":
+            # NOTE(review): assumes the tags file has a 'movieId' column - confirm.
+            df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME).dropna(subset=['tag'])
+            # Comma-separated entries in one cell each count as a tag.
+            n_tags = df_tag['tag'].astype(str).str.count(',') + 1
+            df_features = n_tags.groupby(df_tag['movieId']).sum().to_frame('n_tags')
+
+        elif features_method == "tags_length":
+            # NOTE(review): assumes the tags file has a 'movieId' column - confirm.
+            df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME).dropna(subset=['tag'])
+            # Characters over all tags of a movie, comma separators excluded.
+            tag_text = df_tag['tag'].astype(str)
+            n_chars = tag_text.str.len() - tag_text.str.count(',')
+            df_features = n_chars.groupby(df_tag['movieId']).sum().to_frame('tags_length')
+
+        elif features_method == "timestamp":
+            # Sine/cosine encoding with period 86400, averaged per movie; both
+            # components are returned so the phase stays unambiguous.
+            # NOTE(review): presumably timestamps are in seconds - confirm.
+            df_ratings = load_ratings()
+            angle = 2 * np.pi * df_ratings['timestamp'] / 86400
+            cyclic = pd.DataFrame({'timestamp_sin': np.sin(angle), 'timestamp_cos': np.cos(angle)})
+            df_features = cyclic.groupby(df_ratings['movieId']).mean()
+
         else: # (implement other feature creations here)
             raise NotImplementedError(f'Feature method {features_method} not yet implemented')
         return df_features
@@ -176,6 +201,4 @@ class ContentBased(AlgoBase):
# (implement here the regressor prediction)
- return score
-
-
+ return score
\ No newline at end of file