diff --git a/Home.py b/Home.py index cedd90e4e9d4e58730645f37dc6f51a7966bea35..fc42a2fca243e92a0246ec1aefa28d44f00df0a2 100644 --- a/Home.py +++ b/Home.py @@ -4,7 +4,7 @@ import pandas as pd from content import fetch_movie_info, df_links from content import df_audrey, df_adrien, df_nathanael, df_charles from surprise import Dataset, Reader -from recommender import OtherUserBased, UserBased, RecommenderSystem_KNN, LatentFactorModel # Importer la classe OtherUserBased +from recommender import OtherUserBased, UserBased, RecommenderSystem_KNN, LatentFactorModel,test_contentbased_class # Importer la classe OtherUserBased from loaders import load_ratings @@ -46,10 +46,10 @@ def display_user_movies(df, title, column_name): </div> """, unsafe_allow_html=True) -def display_recommendations(user_name, user_id, csv_file): +def display_recommendations_tm(user_name, user_id, csv_file): recommender = OtherUserBased(user_name, user_id) recommender.load_model() - top_10_predictions = recommender.get_top_10_predictions_for_user(csv_file) + top_10_predictions = recommender.get_top_n_predictions_for_user(csv_file) if top_10_predictions is not None: st.subheader(f"Top 10 Recommendations for {user_name}") @@ -73,6 +73,7 @@ def display_recommendations(user_name, user_id, csv_file): else: st.write("No recommendations found.") + def display_recommendations_ub(user_name, user_id): # Charger les données et préparer l'ensemble de données pour l'entraînement et le test @@ -84,7 +85,7 @@ def display_recommendations_ub(user_name, user_id): recommender = UserBased(k=340, min_k=340) recommender.fit(trainset) - top_10_predictions = recommender.get_top_10_pred_ub(testset, user_id) + top_10_predictions = recommender.get_top_n_pred_ub(testset, user_id) if top_10_predictions is not None: st.subheader(f"Top 10 based on similar users of {user_name}") @@ -138,6 +139,7 @@ def display_recommendations_knn(user_name, user_id): else: st.write("No recommendations found.") + def 
def display_content_based_recommendations(user_name, user_id=-1, n=10):
    """Render content-based top-N recommendations as a horizontal poster strip.

    Args:
        user_name: Display name shown in the section heading.
        user_id: Identifier forwarded to ``test_contentbased_class``.
            NOTE(review): the default of -1 is presumably a placeholder
            user -- confirm against the recommender.
        n: Number of recommendations to request; also shown in the heading.

    Returns:
        None. All output is written to the Streamlit page.
    """
    import html  # function-scope import so the block stays self-contained

    # Forward the caller's user_id and n. They used to be hard-coded to -1
    # and 10 here, so the selected user and requested size were ignored.
    top_n_recommendations = test_contentbased_class(
        ["title_length", "movie_year", "genre", "avg_rating"],
        "ridge_regression",
        user_id=user_id,
        n=n,
    )

    if not top_n_recommendations:
        st.write("No recommendations found.")
        return

    st.write(f"Top {n} recommendations for User {user_name}:")

    cards = []
    for iid, _est in top_n_recommendations:
        tmdb_ids = df_links.loc[df_links['movieId'] == iid, 'tmdbId'].values
        if tmdb_ids.size == 0:
            # No row in df_links for this movie: skip it rather than let
            # ``.values[0]`` raise IndexError and abort the whole page.
            continue

        title_dict, poster_url = fetch_movie_info(tmdb_ids[0])
        if isinstance(title_dict, dict):
            movie_title = title_dict.get("title", "Unknown Title")
        else:
            movie_title = title_dict
        movie_title = str(movie_title)

        # These values come from fetch_movie_info and are injected into raw
        # HTML (unsafe_allow_html=True), so escape them before interpolation.
        safe_title = html.escape(movie_title)

        if poster_url:
            html_file_url = f"http://localhost:8501/{movie_title.replace(' ', '_')}.html"
            safe_url = html.escape(html_file_url)
            safe_poster = html.escape(str(poster_url))
            # The anchor wraps only the poster; the original markup emitted
            # a second, unmatched </a> after the title <div>.
            cards.append(
                '<div style="display: inline-block; margin-right: 20px;">'
                f'<a href="{safe_url}" target="_blank">'
                f'<img src="{safe_poster}" alt="{safe_title}" style="width:150px;height:225px;">'
                '</a>'
                '<div style="color: white; text-decoration: none; font-size: 14px; '
                'text-align: center; max-width: 150px; word-wrap: break-word; '
                'white-space: normal;">'
                f'<b>{safe_title}</b></div>'
                '</div>'
            )
        else:
            cards.append(f"<p>{safe_title}</p>")

    cols_html = "".join(cards)

    st.markdown(f"""
        <div style="overflow-x: scroll; white-space: nowrap; height: 300px; display: flex; flex-direction: row;">
            {cols_html}
        </div>
        """, unsafe_allow_html=True)
- display_recommendations(selected_user, user_id_options[selected_user], csv_file) + + display_recommendations_tm(selected_user, user_id_options[selected_user], csv_file) # Afficher les recommandations basées sur l'algorithme UserBased display_recommendations_ub(selected_user, user_id_options[selected_user]) display_recommendations_knn(selected_user, user_id_options[selected_user]) - display_recommendations_latent_factor(selected_user, user_id_options[selected_user]) dataframe_links = df_links.copy() user_df['movieId'] = user_df['movieId'].astype(int) diff --git a/user_based.ipynb b/backend/user_based.ipynb similarity index 100% rename from user_based.ipynb rename to backend/user_based.ipynb diff --git a/configs.py b/configs.py index 29e2937909d84b6c039b4e0de73882589d74f869..f9de902be256cac165fb82a3b3598454af185830 100644 --- a/configs.py +++ b/configs.py @@ -1,35 +1,33 @@ -# local imports from models import * from itertools import combinations -# # Méthodes de caractéristiques disponibles -# features_methods = [ -# 'genre', 'movie_year', 'avg_rating', -# 'title_length' -# ] - -# # Méthodes de régression disponibles -# regressor_methods = [ -# 'linear_regression','random_forest', 'lasso_regression', -# 'ridge_regression','elastic_net','decision_tree','adaboost' -# ] - -# # Générer toutes les combinaisons possibles de méthodes de caractéristiques -# feature_combinations = [] -# for r in range(1, len(features_methods) + 1): -# feature_combinations.extend(combinations(features_methods, r)) - -# # Générer toutes les combinaisons possibles de méthodes de régression et de caractéristiques -# model_combinations = [] -# for feature_set in feature_combinations: -# for regressor in regressor_methods: -# model_name = f"combination_{regressor}_{'_'.join(feature_set)}" -# arguments = { -# "features_method": list(feature_set), -# "regressor_method": regressor -# } -# model_combinations.append((model_name, ContentBased, arguments)) - +# Méthodes de caractéristiques disponibles 
# Search space for the content-based models: every non-empty subset of the
# feature methods, paired with every regression method.
features_methods = ['genre', 'movie_year', 'avg_rating', 'title_length']

# Regression methods available
regressor_methods = [
    'linear_regression',
    'random_forest',
    'lasso_regression',
    'gradient_boosting',
    'ridge_regression',
    'svr_regression',
]

# All feature-method combinations, from single features up to the full set
feature_combinations = []
for r in range(1, len(features_methods) + 1):
    feature_combinations += combinations(features_methods, r)

# One (model_name, model class, parameters) tuple per feature subset and regressor
model_combinations = []
for feature_set in feature_combinations:
    for regressor in regressor_methods:
        model_name = f"combination_{regressor}_{'_'.join(feature_set)}"
        arguments = dict(
            features_method=list(feature_set),
            regressor_method=regressor,
        )
        model_combinations.append((model_name, ContentBased, arguments))
"regressor_method":'lasso_regression'}), - # ("4", ContentBased, {"features_method":['title_length'], "regressor_method":'random_forest'}), - # ("5", ContentBased, {"features_method":['genre', 'title_length'], "regressor_method":'lasso_regression'}), - # ("6", ContentBased, {"features_method":['genre', 'title_length'], "regressor_method":'linear_regression'}), - # ("7", ContentBased, {"features_method":['genre', 'avg_rating'], "regressor_method":'lasso_regression'}), - # ("8", ContentBased, {"features_method":['avg_rating', 'title_length'], "regressor_method":'adaboost'}), - # ("9", ContentBased, {"features_method":['genre', 'movie_year', 'avg_rating'], "regressor_method":'decision_tree'}), - # ("10", ContentBased, {"features_method":['genre', 'movie_year'], "regressor_method":'decision_tree'}), - # ("11", ContentBased, {"features_method":['genre', 'movie_year', 'avg_rating'], "regressor_method":'elastic_net'}), - # ("12", ContentBased, {"features_method":['movie_year', 'avg_rating', 'title_length'], "regressor_method":'elastic_net'}) - - - - + ("1", ContentBased, {"features_method": ['movie_year', 'avg_rating'], "regressor_method": 'linear_regression'}), + # ("2", ContentBased, {"features_method": ['genre', 'movie_year', 'avg_rating'], "regressor_method": 'gradient_boosting'}), + # ("3", ContentBased, {"features_method": ['avg_rating'], "regressor_method": 'gradient_boosting'}), + # ("4", ContentBased, {"features_method": ['avg_rating'], "regressor_method": 'lasso_regression'}), + # ("5", ContentBased, {"features_method": ['genre'], "regressor_method": 'random_forest'}), + # ("6", ContentBased, {"features_method": ['genre'], "regressor_method": 'lasso_regression'}), + # ("7", ContentBased, {"features_method": ['avg_rating', 'title_length'], "regressor_method": 'ridge_regression'}), + # ("8", ContentBased, {"features_method": ['avg_rating'], "regressor_method": 'svr_regression'}), + # ("9", ContentBased, {"features_method": ['genre', 'movie_year', 'title_length'], 
"regressor_method": 'gradient_boosting'}), + # ("10", ContentBased, {"features_method": ['genre', 'title_length'], "regressor_method": 'svr_regression'}), + # ("11", ContentBased, {"features_method": ['genre', 'avg_rating', 'title_length'], "regressor_method": 'linear_regression'}), + # ("12", ContentBased, {"features_method": ['genre', 'avg_rating'], "regressor_method": 'linear_regression'}), + # ("13", ContentBased, {"features_method": ['genre', 'avg_rating', 'title_length'], "regressor_method": 'knn_regression'}), + # ("14", ContentBased, {"features_method": ['genre', 'movie_year', 'avg_rating'], "regressor_method": 'xgboost'}), + # ("15", ContentBased, {"features_method": ['genre', 'title_length'], "regressor_method": 'decision_tree'}), + # ("16", ContentBased, {"features_method": ['title_length'], "regressor_method": 'random_forest'}), + # ("17", ContentBased, {"features_method": ['genre', 'title_length'], "regressor_method": 'gradient_boosting'}), + # ("18", ContentBased, {"features_method": ['movie_year', 'title_length'], "regressor_method": 'lightgbm'}), + # ("19", ContentBased, {"features_method": ['avg_rating', 'title_length'], "regressor_method": 'decision_tree'}) ] # # Ajouter les combinaisons de ContentBased à la liste des modèles - # models.extend(combinations) + # models.extend(model_combinations) - # Affichage des modèles pour vérification + # # Affichage des modèles pour vérification # for model in models: - # print(model) - - #models = model_combinations + # print(model) - # Metrics to compute for split evaluation split_metrics = ["mae", "rmse"] @@ -91,3 +83,24 @@ class EvalConfig: # Loo parameters top_n_value = 10 # -- configure the numer of recommendations (> 1) -- + + + # #("1", ContentBased, {"features_method": ['movie_year', 'avg_rating', 'genre'], "regressor_method":'linear_regression'}), + # ("2", ContentBased, {"features_method": ['movie_year', 'avg_rating', 'genre'], "regressor_method":'random_forest'}) + + # #("3", ContentBased, 
{"features_method": ['movie_year', 'avg_rating', 'genre'], "regressor_method":'lasso_regression'}), + # #("4", ContentBased, {"features_method": ['movie_year', 'avg_rating', 'genre'], "regressor_method":'elastic_net'}), + # # ("2", ContentBased, {"features_method": ['genre', 'avg_rating'], "regressor_method":'ridge_regression'}), + # # ("3", ContentBased, {"features_method":['movie_year', 'avg_rating', 'title_length'], "regressor_method":'lasso_regression'}), + # # ("4", ContentBased, {"features_method":['title_length'], "regressor_method":'random_forest'}), + # # ("5", ContentBased, {"features_method":['genre', 'title_length'], "regressor_method":'lasso_regression'}), + # # ("6", ContentBased, {"features_method":['genre', 'title_length'], "regressor_method":'linear_regression'}), + # # ("7", ContentBased, {"features_method":['genre', 'avg_rating'], "regressor_method":'lasso_regression'}), + # # ("8", ContentBased, {"features_method":['avg_rating', 'title_length'], "regressor_method":'adaboost'}), + # # ("9", ContentBased, {"features_method":['genre', 'movie_year', 'avg_rating'], "regressor_method":'decision_tree'}), + # # ("10", ContentBased, {"features_method":['genre', 'movie_year'], "regressor_method":'decision_tree'}), + # # ("11", ContentBased, {"features_method":['genre', 'movie_year', 'avg_rating'], "regressor_method":'elastic_net'}), + # # ("12", ContentBased, {"features_method":['movie_year', 'avg_rating', 'title_length'], "regressor_method":'elastic_net'}) + + + diff --git a/content_based.ipynb b/content_based.ipynb index 22f076dd1f3a34d39c20299ad6a52bad1c18268f..fffea99ab7e327cbe56f9b267cd486ef0a2fd8c1 100644 --- a/content_based.ipynb +++ b/content_based.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 6, "id": "277473a3", "metadata": {}, "outputs": [ @@ -27,20 +27,30 @@ "%load_ext autoreload\n", "%autoreload 2\n", "\n", - "import numpy as np\n", + "\n", + "# third parties imports\n", "import pandas as 
pd\n", + "import numpy as np\n", "import random as rd\n", - "from surprise import AlgoBase\n", - "from surprise.prediction_algorithms.predictions import PredictionImpossible\n", + "from surprise import AlgoBase, SVD\n", + "from surprise import PredictionImpossible\n", "\n", - "from loaders import load_ratings\n", - "from loaders import load_items\n", + "# import local\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from loaders import load_items, load_ratings\n", "from constants import Constant as C\n", - "\n", "from sklearn.linear_model import LinearRegression\n", "from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor\n", "from sklearn.svm import SVR\n", - "from sklearn.feature_extraction.text import TfidfVectorizer" + "\n", + "from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet\n", + "from sklearn.svm import SVR\n", + "from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, AdaBoostRegressor\n", + "from sklearn.tree import DecisionTreeRegressor\n", + "from sklearn.neighbors import KNeighborsRegressor\n", + "from xgboost import XGBRegressor\n", + "from lightgbm import LGBMRegressor\n", + "\n" ] }, { @@ -53,7 +63,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 7, "id": "e8378976", "metadata": {}, "outputs": [ @@ -87,24 +97,24 @@ " </thead>\n", " <tbody>\n", " <tr>\n", - " <th>3</th>\n", - " <td>23</td>\n", + " <th>1</th>\n", + " <td>16</td>\n", " </tr>\n", " <tr>\n", - " <th>15</th>\n", - " <td>23</td>\n", + " <th>2</th>\n", + " <td>14</td>\n", " </tr>\n", " <tr>\n", - " <th>34</th>\n", - " <td>11</td>\n", + " <th>3</th>\n", + " <td>23</td>\n", " </tr>\n", " <tr>\n", - " <th>59</th>\n", - " <td>44</td>\n", + " <th>4</th>\n", + " <td>24</td>\n", " </tr>\n", " <tr>\n", - " <th>64</th>\n", - " <td>20</td>\n", + " <th>5</th>\n", + " <td>34</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", @@ -113,11 +123,11 @@ "text/plain": [ " 
n_character_title\n", "movieId \n", + "1 16\n", + "2 14\n", "3 23\n", - "15 23\n", - "34 11\n", - "59 44\n", - "64 20" + "4 24\n", + "5 34" ] }, "metadata": {}, @@ -126,11 +136,11 @@ { "data": { "text/plain": [ - "0 Russian\n", - "1 Trilogy of the Imagination\n", - "2 Takashi Miike\n", - "3 action\n", - "4 bad plot\n", + "0 sandra 'boring' bullock\n", + "1 dentist\n", + "2 Cambodia\n", + "3 Russian\n", + "4 forgettable\n", "Name: tag, dtype: object" ] }, @@ -169,177 +179,7 @@ }, { "cell_type": "code", - "execution_count": 12, - "id": "bf27365c", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "None\n", - "None\n", - "{'n_character_title': array([2.33593365e-04, 2.03124665e-05, 1.01562333e-05]), 'avg_rating': array([2.33593365e-04, 2.03124665e-05, 1.01562333e-05]), 'tags': array([1.11718566e-04, 3.04686998e-05, 1.01562333e-05])}\n", - "{'n_character_title': array([1.08454062e-04, 9.43078803e-06, 4.71539402e-06]), 'avg_rating': array([1.08454062e-04, 9.43078803e-06, 4.71539402e-06]), 'tags': array([5.18693342e-05, 1.41461820e-05, 4.71539402e-06])}\n", - "None\n", - "{'n_character_title': array([5.24102880e-05, 4.55741635e-06, 2.27870817e-06]), 'avg_rating': array([5.24102880e-05, 4.55741635e-06, 2.27870817e-06]), 'tags': array([2.50657899e-05, 6.83612452e-06, 2.27870817e-06])}\n" - ] - } - ], - "source": [ - "\n", - "class ContentBased(AlgoBase):\n", - " def __init__(self, features_method, regressor_method, combination_methods=None):\n", - " AlgoBase.__init__(self)\n", - " self.regressor_method = regressor_method\n", - " self.features_method = features_method\n", - " self.combination_methods = combination_methods\n", - " self.content_features = self.create_content_features(features_method)\n", - " self.user_profile = {}\n", - " self.user_profile_explain = {}\n", - "\n", - " def create_content_features(self, features_method):\n", - " df_items = load_items()\n", - " df_ratings = load_ratings()\n", - " df_tag = 
pd.read_csv(C.CONTENT_PATH / C.TAGS_FILENAME)\n", - " df_genome_score = pd.read_csv(\"data/hackathon/content/genome-scores.csv\")\n", - " df_genome_tag = pd.read_csv(\"data/hackathon/content/genome-tags.csv\")\n", - "\n", - " def get_features(method):\n", - " if method == \"relevance\":\n", - " return df_genome_score.groupby('movieId')[\"relevance\"].mean().to_frame('avg_relevance')\n", - "\n", - " elif method == \"title_length\":\n", - " return df_items[C.LABEL_COL].apply(len).to_frame('n_character_title')\n", - "\n", - " elif method == \"movie_year\":\n", - " return df_items['title'].str.extract(r'\\((\\d{4})\\)', expand=False).to_frame('movie_year')\n", - "\n", - " elif method == \"genres\":\n", - " genres_list = df_items['genres'].str.split('|').explode().unique()\n", - " df_features = pd.DataFrame()\n", - " for genre in genres_list:\n", - " df_features[genre] = df_items['genres'].str.contains(genre).astype(int)\n", - " return df_features\n", - "\n", - " elif method == \"rating\":\n", - " return df_ratings.groupby('movieId')['rating'].mean().to_frame('avg_rating')\n", - "\n", - " elif method == \"tags\":\n", - " return df_tag['tag'].apply(lambda x: len(x.split(',')) if isinstance(x, str) else 0).to_frame('tags')\n", - "\n", - " elif method == \"tags_length\":\n", - " return df_tag['tag'].apply(lambda x: sum(len(tag) for tag in x.split(',')) if isinstance(x, str) else 0).to_frame('n_character_tags')\n", - "\n", - " else:\n", - " raise NotImplementedError(f'Feature method {method} not yet implemented')\n", - "\n", - " if features_method == \"combination\":\n", - " if not self.combination_methods:\n", - " raise ValueError('No combination methods provided for \"combination\" feature method')\n", - " df_features = pd.DataFrame()\n", - " for method in self.combination_methods:\n", - " df_method_features = get_features(method)\n", - " df_features = pd.concat([df_features, df_method_features], axis=1)\n", - " else:\n", - " df_features = 
get_features(features_method)\n", - "\n", - " if df_features is not None:\n", - " df_features.fillna(0, inplace=True)\n", - "\n", - " return df_features\n", - "\n", - " def fit(self, trainset):\n", - " AlgoBase.fit(self, trainset)\n", - " self.user_profile = {u: None for u in trainset.all_users()}\n", - " self.user_profile_explain = {}\n", - "\n", - " for u in trainset.all_users():\n", - " raw_user_id = trainset.to_raw_uid(u)\n", - " self.user_profile_explain[raw_user_id] = {}\n", - "\n", - " user_ratings = np.array([rating for _, rating in trainset.ur[u]])\n", - " feature_values = self.content_features.values.astype(int)\n", - " weighted_features = feature_values / np.linalg.norm(feature_values)\n", - " feature_importance = weighted_features / np.sum(user_ratings)\n", - "\n", - " self.user_profile_explain[raw_user_id] = dict(zip(self.content_features.columns, feature_importance))\n", - "\n", - " self._fit_regressors(trainset)\n", - "\n", - " def _fit_regressors(self, trainset):\n", - " for u in self.user_profile:\n", - " user_ratings = [rating for _, rating in trainset.ur[u]]\n", - " item_ids = [iid for iid, _ in trainset.ur[u]]\n", - " df_user = pd.DataFrame({'item_id': item_ids, 'user_ratings': user_ratings})\n", - " df_user[\"item_id\"] = df_user[\"item_id\"].map(trainset.to_raw_iid)\n", - " df_user = df_user.merge(self.content_features, left_on=\"item_id\", right_index=True, how='left')\n", - "\n", - " if df_user.empty:\n", - " continue\n", - "\n", - " X = df_user.iloc[:, 2:].values # Assuming features start from the third column\n", - " y = df_user['user_ratings'].values\n", - "\n", - " if self.regressor_method == 'linear_regression':\n", - " regressor = LinearRegression(fit_intercept=False)\n", - " elif self.regressor_method == 'svr_regression':\n", - " regressor = SVR(kernel='rbf', C=10, epsilon=0.2)\n", - " elif self.regressor_method == 'gradient_boosting':\n", - " regressor = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)\n", 
- " elif self.regressor_method == 'random_forest':\n", - " regressor = RandomForestRegressor(n_estimators=100)\n", - " else:\n", - " regressor = None\n", - "\n", - " if regressor is not None:\n", - " regressor.fit(X, y)\n", - " self.user_profile[u] = regressor\n", - "\n", - " def estimate(self, u, i):\n", - " if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):\n", - " raise PredictionImpossible('User and/or item is unknown.')\n", - "\n", - " raw_item_id = self.trainset.to_raw_iid(i)\n", - " item_features = self.content_features.loc[raw_item_id:raw_item_id, :].values\n", - "\n", - " regressor = self.user_profile[u]\n", - " if regressor is None:\n", - " raise PredictionImpossible('Regressor not found.')\n", - "\n", - " score = regressor.predict(item_features)[0]\n", - " return score\n", - " \n", - "\n", - " def explain(self, u) : \n", - " if u in self.user_profile_explain :\n", - " return self.user_profile_explain[u]\n", - " else :\n", - " return None\n", - "\n", - "# Example usage\n", - "cb = ContentBased(\"combination\", \"svr_regression\", combination_methods=[\"title_length\", \"rating\", \"tags\"])\n", - "sp_ratings = load_ratings(surprise_format=True)\n", - "train_set = sp_ratings.build_full_trainset()\n", - "cb.fit(train_set)\n", - "\n", - "\n", - "print(cb.explain(11))\n", - "\n", - "print(cb.explain(13))\n", - "\n", - "print(cb.explain(17))\n", - "\n", - "print(cb.explain(23))\n", - "\n", - "print(cb.explain(27))\n", - "\n", - "print(cb.explain(73))" - ] - }, - { - "cell_type": "code", - "execution_count": 13, + "execution_count": 8, "id": "16b0a602", "metadata": {}, "outputs": [ @@ -347,401 +187,170 @@ "name": "stdout", "output_type": "stream", "text": [ - "None\n", - "None\n", - "{'movie_year': array([0.00030032])}\n", - "{'movie_year': array([0.00013943])}\n", - "None\n", - "{'movie_year': array([6.73812161e-05])}\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - 
"/var/folders/7z/lpyksh0x59x6l5_mf048x9tc0000gn/T/ipykernel_23482/2546010458.py:36: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n", - " df_features = df_items['genres'].str.contains(genre).astype(int).to_frame('genres')\n" + "{'title_length': 0.1497645139703848, 'movie_year': 0.16218667420100635, '(no genres listed)': 0.0, 'action': 0.09449072815753193, 'adventure': 0.08778978776313201, 'animation': 0.0, 'children': 0.038431411145366176, 'comedy': 0.07268129109348041, 'crime': 0.09469516433772891, 'documentary': 0.0611428358670058, 'drama': 0.10494783392380302, 'fantasy': 0.025806451608591505, 'film-noir': 0.025806451609512046, 'horror': 0.018342712153336858, 'imax': 0.06947533670577526, 'musical': 0.0, 'mystery': 0.06234903350217154, 'romance': 0.036771716124540825, 'sci-fi': 0.059571001735546115, 'thriller': 0.0993122803165238, 'war': 0.04002978709072218, 'western': 0.04547648227079719, 'avg_rating': 0.16263357553020436}\n", + "{'title_length': 0.12975573389578626, 'movie_year': 0.13738555574364605, '(no genres listed)': 0.0, 'action': 0.0640388318396414, 'adventure': 0.0827515664964472, 'animation': 0.05686854568650957, 'children': 0.06799492283569505, 'comedy': 0.07354182680364503, 'crime': 0.05543740962624167, 'documentary': 0.0, 'drama': 0.09170589087803577, 'fantasy': 0.061481521263689595, 'film-noir': 0.0, 'horror': 0.015113350123518238, 'imax': 0.04592205020685974, 'musical': 0.03201459126079391, 'mystery': 0.03412706135338736, 'romance': 0.05989121250223656, 'sci-fi': 0.04370793816378273, 'thriller': 0.045800659191095036, 'war': 0.04907194751877139, 'western': 0.027287416762806844, 'avg_rating': 0.13740560847192132}\n", + "{'title_length': 0.04702378569892371, 'movie_year': 0.052440003628289225, '(no genres listed)': 0.0, 'action': 0.020439581335728367, 'adventure': 0.015593308332521032, 'animation': 0.004256286923052558, 'children': 0.003520723090188317, 'comedy': 
0.018972762464944913, 'crime': 0.028340544273099223, 'documentary': 0.005823989517206729, 'drama': 0.037415345194166824, 'fantasy': 0.013643903080149476, 'film-noir': 0.015390183296279798, 'horror': 0.01926898253629829, 'imax': 0.0014716703456143566, 'musical': 0.0061519348279224124, 'mystery': 0.02847033164163413, 'romance': 0.019827342468818163, 'sci-fi': 0.022573488552024915, 'thriller': 0.03522231545147593, 'war': 0.010339617301415098, 'western': 0.005663885036293055, 'avg_rating': 0.05327750989412312}\n", + "{'title_length': 0.033402138126294736, 'movie_year': 0.03710065977291947, '(no genres listed)': 0.0, 'action': 0.014528522669579273, 'adventure': 0.013963913494241694, 'animation': 0.005764814103226412, 'children': 0.006513197483932152, 'comedy': 0.017763201411495646, 'crime': 0.016002513666599556, 'documentary': 0.004292962983778595, 'drama': 0.027458210593047847, 'fantasy': 0.009302633945770895, 'film-noir': 0.006823368830454359, 'horror': 0.007391689869010394, 'imax': 0.004855154663168369, 'musical': 0.0058909467772061425, 'mystery': 0.012191560732760487, 'romance': 0.01723631022081761, 'sci-fi': 0.010817269433255231, 'thriller': 0.01658593988724716, 'war': 0.010193212979882352, 'western': 0.0052038255339472966, 'avg_rating': 0.03742403427834079}\n", + "{'title_length': 0.20154225634108316, 'movie_year': 0.20848962267389695, '(no genres listed)': 0.0, 'action': 0.04545454544645529, 'adventure': 0.04545454544730129, 'animation': 0.0, 'children': 0.0, 'comedy': 0.07177284969293253, 'crime': 0.1145252645738102, 'documentary': 0.0, 'drama': 0.16778172557550536, 'fantasy': 0.0, 'film-noir': 0.0, 'horror': 0.06315936177961773, 'imax': 0.0, 'musical': 0.0, 'mystery': 0.08510520557533159, 'romance': 0.09754755529442835, 'sci-fi': 0.045454545449454146, 'thriller': 0.12542163704872258, 'war': 0.08035304331050673, 'western': 0.0, 'avg_rating': 0.21152969571139305}\n", + "{'title_length': 0.021927486954368552, 'movie_year': 0.02488786702116846, '(no genres 
listed)': 0.0007363092498113207, 'action': 0.013836432470735639, 'adventure': 0.011610617815573265, 'animation': 0.007520799115717832, 'children': 0.006287966766754299, 'comedy': 0.012951125615087338, 'crime': 0.011084119744598393, 'documentary': 0.0018287715645832062, 'drama': 0.015221252640276463, 'fantasy': 0.008631010164284143, 'film-noir': 0.0024629052522566544, 'horror': 0.008816299251739122, 'imax': 0.005347204099216887, 'musical': 0.0038827346462235236, 'mystery': 0.0068652812039576095, 'romance': 0.008086664541950757, 'sci-fi': 0.010304269379559203, 'thriller': 0.013200133984104478, 'war': 0.005127335699821772, 'western': 0.0036215200349232765, 'avg_rating': 0.025470698706944836}\n" ] } ], "source": [ + "\n", + "# ContetnBased\n", "class ContentBased(AlgoBase):\n", " def __init__(self, features_method, regressor_method):\n", " AlgoBase.__init__(self)\n", " self.regressor_method = regressor_method\n", + " self.features_methods = features_method\n", " self.content_features = self.create_content_features(features_method)\n", + " self.user_profile = {}\n", " self.user_profile_explain = {}\n", "\n", - " def create_content_features(self, features_method):\n", + " def create_content_features(self, features_methods):\n", " \"\"\"Content Analyzer\"\"\"\n", " df_items = load_items()\n", " df_ratings = load_ratings()\n", - " df_tag = df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME)\n", + " df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME)\n", " df_genome_score = pd.read_csv(\"data/hackathon/content/genome-scores.csv\")\n", " df_genome_tag = pd.read_csv(\"data/hackathon/content/genome-tags.csv\")\n", "\n", - " if features_method is None:\n", - " df_features = None\n", + " df_features = pd.DataFrame(index=df_items.index)\n", "\n", - " elif features_method == \"relevance\" :\n", - " df_features = df_genome_score.groupby('movieId')[\"relevance\"].transform('mean').to_frame('avg_relevance')\n", - "\n", - " elif features_method == \"title_length\": # a naive method 
that creates only 1 feature based on title length\n", - " df_features = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('n_character_title')\n", + " for method in features_methods:\n", + " if method == \"title_length\":\n", + " df_title_length = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('title_length')\n", + " df_features = pd.concat([df_features, df_title_length], axis=1)\n", " \n", - " elif features_method == \"movie_year\" :\n", - " df_features = df_items['movie_year'] = df_items['title'].str.extract(r'\\((\\d{4})\\)', expand=False).to_frame('movie_year')\n", - "\n", - " elif features_method == \"genres\" :\n", - " genres_list = df_items['genres'].str.split('|').explode().unique()\n", - " for genre in genres_list:\n", - " df_features = df_items['genres'].str.contains(genre).astype(int).to_frame('genres')\n", - " \n", - " elif features_method == \"combination\" :\n", - " genres_list = df_items['genres'].str.split('|').explode().unique()\n", - " for genre in genres_list:\n", - " df_features = df_items['genres'].str.contains(genre).astype(int).to_frame('genres')\n", + " elif method == \"movie_year\":\n", + " df_movie_year = df_items['title'].str.extract(r'\\((\\d{4})\\)', expand=False).to_frame('movie_year')\n", + " df_features = pd.concat([df_features, df_movie_year.astype(float).fillna(0)], axis=1)\n", " \n", - " df_features = df_items['movie_year'] = df_items['title'].str.extract(r'\\((\\d{4})\\)', expand=False).to_frame('movie_year')\n", + " elif method == \"genre\":\n", + " tfidf_vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split('|'), token_pattern=None)\n", + " tfidf_matrix = tfidf_vectorizer.fit_transform(df_items['genres'])\n", + " df_tfidf_genres = pd.DataFrame(tfidf_matrix.toarray(), index=df_items.index, columns=tfidf_vectorizer.get_feature_names_out())\n", + " df_features = pd.concat([df_features, df_tfidf_genres], axis=1)\n", "\n", - " \n", - " elif features_method == \"rating\" :\n", - " df_features = 
df_ratings.groupby('movieId')['rating'].transform('mean').to_frame('avg_rating')\n", + " elif method == \"avg_rating\":\n", + " df_avg_rating = df_ratings.groupby('movieId')['rating'].mean().to_frame('avg_rating')\n", + " df_features = df_features.join(df_avg_rating, on='movieId')\n", "\n", - " elif features_method == \"tags\" :\n", - " df_features = df_tag['tag'].apply(lambda x: len(x.split(',')) if isinstance(x, str) else 0).to_frame('tags')\n", - "\n", - " elif features_method == \"tags_length\" :\n", - " \n", - " df_features = df_tag['tag'].apply(lambda x: sum(len(tag) for tag in x.split(','))if isinstance(x, str) else 0).to_frame('n_character_tags')\n", - "\n", - " else: # (implement other feature creations here)\n", - " raise NotImplementedError(f'Feature method {features_method} not yet implemented')\n", + " else:\n", + " raise NotImplementedError(f'Feature method {method} not yet implemented')\n", "\n", " # Handle missing values in df_features\n", - " if df_features is not None:\n", - " df_features.fillna(0, inplace=True) \n", + " df_features.fillna(0, inplace=True)\n", "\n", " return df_features\n", - " \n", "\n", " def fit(self, trainset):\n", " \"\"\"Profile Learner\"\"\"\n", " AlgoBase.fit(self, trainset)\n", - " \n", + "\n", " # Preallocate user profiles\n", " self.user_profile = {u: None for u in trainset.all_users()}\n", - "\n", " self.user_profile_explain = {}\n", "\n", - " # Loop over all internal user IDs in the trainset\n", + " epsilon = 1e-10 # Small value to prevent division by zero\n", + "\n", " for u in trainset.all_users():\n", - " # Convert internal user ID to raw user ID\n", " raw_user_id = trainset.to_raw_uid(u)\n", - "\n", - " # Initialize feature importance dictionary for the raw user ID\n", " self.user_profile_explain[raw_user_id] = {}\n", "\n", - " # Extract user ratings for the current user\n", - " user_ratings = np.array([rating for _, rating in trainset.ur[u]])\n", + " user_ratings = np.array([rating for (_, rating) in 
trainset.ur[u]])\n", + " item_ids = [iid for (iid, _) in trainset.ur[u]]\n", + " raw_item_ids = [trainset.to_raw_iid(iid) for iid in item_ids]\n", "\n", - " # Compute feature importance based on content features and user ratings\n", - " feature_values = self.content_features.values.astype(int)\n", - " weighted_features = feature_values / np.linalg.norm(feature_values)\n", - " feature_importance = weighted_features / np.sum(user_ratings)\n", + " feature_values = self.content_features.loc[raw_item_ids].values\n", + " norms = np.linalg.norm(feature_values, axis=0) + epsilon\n", + " weighted_features = feature_values / norms\n", + " feature_importance = weighted_features.T @ user_ratings\n", + " feature_importance /= np.sum(user_ratings)\n", "\n", - " # Map feature importance scores to feature names and store in user_profile_explain\n", " self.user_profile_explain[raw_user_id] = dict(zip(self.content_features.columns, feature_importance))\n", - " \n", "\n", " if self.regressor_method == 'random_score':\n", - " for u in self.user_profile :\n", - " self.user_profile[u] = rd.uniform(0.5,5)\n", - " \n", - " elif self.regressor_method == 'random_sample':\n", - " for u in self.user_profile:\n", - " self.user_profile[u] = [rating for _, rating in self.trainset.ur[u]]\n", - "\n", - " elif self.regressor_method == 'linear_regression' :\n", " for u in self.user_profile:\n", + " self.user_profile[u] = rd.uniform(0.5, 5)\n", "\n", - " user_ratings = [rating for _, rating in trainset.ur[u]]\n", - " item_ids = [iid for iid, _ in trainset.ur[u]]\n", - "\n", - " df_user = pd.DataFrame({'item_id': item_ids, 'user_ratings': user_ratings})\n", - "\n", - " df_user[\"item_id\"] = df_user[\"item_id\"].map(trainset.to_raw_iid)\n", - "\n", - " df_user = df_user.merge(self.content_features, left_on = \"item_id\", right_index = True, how = 'left')\n", - " \n", - " if 'n_character_title' in df_user.columns:\n", - " # Si 'n_character_title' est disponible comme caractéristique\n", - " X = 
df_user['n_character_title'].values.reshape(-1, 1)\n", - "\n", - " elif 'avg_relevance' in df_user.columns:\n", - " # Si 'n_character_title' est disponible comme caractéristique\n", - " X = df_user['avg_relevance'].values.reshape(-1, 1)\n", - " \n", - " elif 'movie_year' in df_user.columns:\n", - " # Si 'n_character_title' est disponible comme caractéristique\n", - " X = df_user['movie_year'].values.reshape(-1, 1)\n", - " \n", - " elif 'combination' in df_user.columns :\n", - " X = df_user['movie_year','genres' ].values.reshape(-1, 1)\n", - " \n", - " elif 'genres' in df_user.columns:\n", - " # Si 'n_character_title' est disponible comme caractéristique\n", - " X = df_user['genres'].values.reshape(-1, 1)\n", - "\n", - " elif 'avg_rating' in df_user.columns:\n", - " # Si 'n_character_title' est disponible comme caractéristique\n", - " X = df_user['avg_rating'].values.reshape(-1, 1)\n", - "\n", - " elif 'tags' in df_user.columns:\n", - " # Si une autre caractéristique est disponible (remplace 'other_feature' par le nom de ta caractéristique)\n", - " X = df_user['tags'].values.reshape(-1, 1)\n", - "\n", - " elif 'n_character_tags' in df_user.columns:\n", - " # Si une autre caractéristique est disponible (remplace 'other_feature' par le nom de ta caractéristique)\n", - " X = df_user['n_character_tags'].values.reshape(-1, 1)\n", - "\n", - " else:\n", - " # Si aucune caractéristique appropriée n'est disponible\n", - " continue # Ou gère le cas d'erreur/exception ici\n", - "\n", - " y = df_user['user_ratings'].values\n", - "\n", - " linear_regressor = LinearRegression(fit_intercept = False)\n", - "\n", - " linear_regressor.fit(X,y)\n", - " \n", - " # Store the computed user profile\n", - " self.user_profile[u] = linear_regressor\n", - "\n", - " elif self.regressor_method == 'svr_regression':\n", - " for u in self.user_profile:\n", - "\n", - " user_ratings = [rating for _, rating in trainset.ur[u]]\n", - " item_ids = [iid for iid, _ in trainset.ur[u]]\n", - "\n", - " 
df_user = pd.DataFrame({'item_id': item_ids, 'user_ratings': user_ratings})\n", - "\n", - " df_user[\"item_id\"] = df_user[\"item_id\"].map(trainset.to_raw_iid)\n", - "\n", - " df_user = df_user.merge(self.content_features, left_on = \"item_id\", right_index = True, how = 'left')\n", - "\n", - " if 'n_character_title' in df_user.columns:\n", - " # Si 'n_character_title' est disponible comme caractéristique\n", - " X = df_user['n_character_title'].values.reshape(-1, 1)\n", - "\n", - " elif 'avg_relevance' in df_user.columns:\n", - " # Si 'n_character_title' est disponible comme caractéristique\n", - " X = df_user['avg_relevance'].values.reshape(-1, 1)\n", - " \n", - " elif 'movie_year' in df_user.columns:\n", - " # Si 'n_character_title' est disponible comme caractéristique\n", - " X = df_user['movie_year'].values.reshape(-1, 1)\n", - " \n", - " elif 'genres' in df_user.columns:\n", - " # Si 'n_character_title' est disponible comme caractéristique\n", - " X = df_user['genres'].values.reshape(-1, 1)\n", - " \n", - " elif 'avg_rating' in df_user.columns:\n", - " # Si 'n_character_title' est disponible comme caractéristique\n", - " X = df_user['avg_rating'].values.reshape(-1, 1)\n", - "\n", - " elif 'tags' in df_user.columns:\n", - " # Si une autre caractéristique est disponible (remplace 'other_feature' par le nom de ta caractéristique)\n", - " X = df_user['tags'].values.reshape(-1, 1)\n", - "\n", - " elif 'n_character_tags' in df_user.columns:\n", - " # Si une autre caractéristique est disponible (remplace 'other_feature' par le nom de ta caractéristique)\n", - " X = df_user['n_character_tags'].values.reshape(-1, 1)\n", - "\n", - " else:\n", - " # Si aucune caractéristique appropriée n'est disponible\n", - " continue # Ou gère le cas d'erreur/exception ici\n", - " \n", - " y = df_user['user_ratings'].values\n", - " svr_regressor = SVR(kernel='rbf', C=10, epsilon=0.2)\n", - " svr_regressor.fit(X, y)\n", - " self.user_profile[u] = svr_regressor\n", - "\n", - " elif 
self.regressor_method == 'gradient_boosting':\n", + " elif self.regressor_method == 'random_sample':\n", " for u in self.user_profile:\n", + " self.user_profile[u] = [rating for (_, rating) in trainset.ur[u]]\n", "\n", - " user_ratings = [rating for _, rating in trainset.ur[u]]\n", - " item_ids = [iid for iid, _ in trainset.ur[u]]\n", - "\n", - " df_user = pd.DataFrame({'item_id': item_ids, 'user_ratings': user_ratings})\n", - "\n", - " df_user[\"item_id\"] = df_user[\"item_id\"].map(trainset.to_raw_iid)\n", - "\n", - " df_user = df_user.merge(self.content_features, left_on = \"item_id\", right_index = True, how = 'left')\n", - "\n", - " if 'n_character_title' in df_user.columns:\n", - " # Si 'n_character_title' est disponible comme caractéristique\n", - " X = df_user['n_character_title'].values.reshape(-1, 1)\n", - "\n", - " elif 'avg_relevance' in df_user.columns:\n", - " # Si 'n_character_title' est disponible comme caractéristique\n", - " X = df_user['avg_relevance'].values.reshape(-1, 1)\n", - " \n", - " elif 'movie_year' in df_user.columns:\n", - " # Si 'n_character_title' est disponible comme caractéristique\n", - " X = df_user['movie_year'].values.reshape(-1, 1)\n", - " \n", - " elif 'genres' in df_user.columns:\n", - " # Si 'n_character_title' est disponible comme caractéristique\n", - " X = df_user['genres'].values.reshape(-1, 1)\n", - " \n", - " elif 'avg_rating' in df_user.columns:\n", - " # Si 'n_character_title' est disponible comme caractéristique\n", - " X = df_user['avg_rating'].values.reshape(-1, 1)\n", - "\n", - " elif 'tags' in df_user.columns:\n", - " # Si une autre caractéristique est disponible (remplace 'other_feature' par le nom de ta caractéristique)\n", - " X = df_user['tags'].values.reshape(-1, 1)\n", - "\n", - " elif 'n_character_tags' in df_user.columns:\n", - " # Si une autre caractéristique est disponible (remplace 'other_feature' par le nom de ta caractéristique)\n", - " X = df_user['n_character_tags'].values.reshape(-1, 1)\n", - 
"\n", - " else:\n", - " # Si aucune caractéristique appropriée n'est disponible\n", - " continue # Ou gère le cas d'erreur/exception ici\n", - " \n", - " y = df_user['user_ratings'].values\n", - " gb_regressor = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)\n", - " gb_regressor.fit(X, y)\n", - " self.user_profile[u] = gb_regressor\n", - "\n", + " else:\n", + " regressor_models = {\n", + " 'linear_regression': LinearRegression(fit_intercept=False),\n", + " 'svr_regression': SVR(kernel='rbf', C=10, epsilon=0.2),\n", + " 'gradient_boosting': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3),\n", + " 'random_forest': RandomForestRegressor(n_estimators=100),\n", + " 'lasso_regression': Lasso(alpha=0.1),\n", + " 'ridge_regression': Ridge(alpha=1.0),\n", + " 'elastic_net': ElasticNet(alpha=1.0, l1_ratio=0.5),\n", + " 'knn_regression': KNeighborsRegressor(n_neighbors=1),\n", + " 'decision_tree': DecisionTreeRegressor(max_depth=5),\n", + " 'adaboost': AdaBoostRegressor(n_estimators=50),\n", + " 'xgboost': XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3),\n", + " 'lightgbm': LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)\n", + " }\n", + "\n", + " if self.regressor_method not in regressor_models:\n", + " raise NotImplementedError(f'Regressor method {self.regressor_method} not yet implemented')\n", "\n", - " elif self.regressor_method == 'random_forest':\n", " for u in self.user_profile:\n", + " user_ratings = [rating for (_, rating) in trainset.ur[u]]\n", + " item_ids = [iid for (iid, _) in trainset.ur[u]]\n", + " raw_item_ids = [trainset.to_raw_iid(iid) for iid in item_ids]\n", "\n", - " user_ratings = [rating for _, rating in trainset.ur[u]]\n", - " item_ids = [iid for iid, _ in trainset.ur[u]]\n", - "\n", - " df_user = pd.DataFrame({'item_id': item_ids, 'user_ratings': user_ratings})\n", + " df_user = pd.DataFrame({'item_id': raw_item_ids, 'user_ratings': user_ratings})\n", + " df_user = 
df_user.merge(self.content_features, left_on=\"item_id\", right_index=True, how='left')\n", "\n", - " df_user[\"item_id\"] = df_user[\"item_id\"].map(trainset.to_raw_iid)\n", + " X = df_user.drop(columns=['item_id', 'user_ratings'])\n", + " y = df_user['user_ratings']\n", "\n", - " df_user = df_user.merge(self.content_features, left_on = \"item_id\", right_index = True, how = 'left')\n", - "\n", - " if 'n_character_title' in df_user.columns:\n", - " # Si 'n_character_title' est disponible comme caractéristique\n", - " X = df_user['n_character_title'].values.reshape(-1, 1)\n", - "\n", - " elif 'avg_relevance' in df_user.columns:\n", - " # Si 'n_character_title' est disponible comme caractéristique\n", - " X = df_user['avg_relevance'].values.reshape(-1, 1)\n", - " \n", - " elif 'movie_year' in df_user.columns:\n", - " # Si 'n_character_title' est disponible comme caractéristique\n", - " X = df_user['movie_year'].values.reshape(-1, 1)\n", - " \n", - " elif 'genres' in df_user.columns:\n", - " # Si 'n_character_title' est disponible comme caractéristique\n", - " X = df_user['genres'].values.reshape(-1, 1)\n", - "\n", - " elif 'avg_rating' in df_user.columns:\n", - " # Si 'n_character_title' est disponible comme caractéristique\n", - " X = df_user['avg_rating'].values.reshape(-1, 1)\n", - "\n", - " elif 'tags' in df_user.columns:\n", - " # Si une autre caractéristique est disponible (remplace 'other_feature' par le nom de ta caractéristique)\n", - " X = df_user['tags'].values.reshape(-1, 1)\n", - "\n", - " elif 'n_character_tags' in df_user.columns:\n", - " # Si une autre caractéristique est disponible (remplace 'other_feature' par le nom de ta caractéristique)\n", - " X = df_user['n_character_tags'].values.reshape(-1, 1)\n", - "\n", - " else:\n", - " # Si aucune caractéristique appropriée n'est disponible\n", - " continue # Ou gère le cas d'erreur/exception ici\n", - "\n", - " y = df_user['user_ratings'].values\n", - " rf_regressor = 
RandomForestRegressor(n_estimators=100)\n", - " rf_regressor.fit(X, y)\n", - " self.user_profile[u] = rf_regressor\n", + " regressor = regressor_models[self.regressor_method]\n", + " regressor.fit(X, y)\n", "\n", - " else : \n", - " pass\n", + " self.user_profile[u] = regressor\n", "\n", - " # (implement here the regressor fitting) \n", - " \n", " def estimate(self, u, i):\n", " \"\"\"Scoring component used for item filtering\"\"\"\n", - " # First, handle cases for unknown users and items\n", " if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):\n", - " raise PredictionImpossible('User and/or item is unkown.')\n", - "\n", + " raise PredictionImpossible('User and/or item is unknown.')\n", "\n", " if self.regressor_method == 'random_score':\n", - " rd.seed()\n", - " score = rd.uniform(0.5,5)\n", + " return rd.uniform(0.5, 5)\n", "\n", " elif self.regressor_method == 'random_sample':\n", - " rd.seed()\n", - " score = rd.choice(self.user_profile[u])\n", - " \n", - " elif self.regressor_method == 'linear_regression':\n", - "\n", - " raw_item_id = self.trainset.to_raw_iid(i)\n", - "\n", - " item_features = self.content_features.loc[raw_item_id:raw_item_id, :].values\n", - "\n", - " linear_regressor = self.user_profile[u]\n", - "\n", - " score= linear_regressor.predict(item_features)[0]\n", - " \n", - " elif self.regressor_method == 'svr_regression':\n", - "\n", - " raw_item_id = self.trainset.to_raw_iid(i)\n", - "\n", - " item_features = self.content_features.loc[raw_item_id:raw_item_id, :].values\n", - "\n", - " svr_regressor = self.user_profile[u]\n", - " score = svr_regressor.predict(item_features)[0]\n", - " \n", - " elif self.regressor_method == 'gradient_boosting':\n", - "\n", - " raw_item_id = self.trainset.to_raw_iid(i)\n", - "\n", - " item_features = self.content_features.loc[raw_item_id:raw_item_id, :].values\n", - "\n", - " gradient_boosting = self.user_profile[u]\n", - " score = gradient_boosting.predict(item_features)[0]\n", - " \n", - " 
elif self.regressor_method == 'random_forest':\n", + " return rd.choice(self.user_profile[u])\n", "\n", + " else:\n", " raw_item_id = self.trainset.to_raw_iid(i)\n", + " item_features = self.content_features.loc[raw_item_id, :].values.reshape(1, -1)\n", + " regressor = self.user_profile[u]\n", + " item_features_df = pd.DataFrame(item_features, columns=self.content_features.columns)\n", + " return regressor.predict(item_features_df)[0]\n", "\n", - " item_features = self.content_features.loc[raw_item_id:raw_item_id, :].values\n", - "\n", - " randomforest = self.user_profile[u]\n", - " score = randomforest.predict(item_features)[0]\n", - " \n", - " else : \n", - " score = None\n", + " def explain(self, u):\n", + " if u in self.user_profile_explain:\n", + " return self.user_profile_explain[u]\n", + " else:\n", + " return None\n", "\n", - " # (implement here the regressor prediction)\n", "\n", - " return score\n", + "#Example usage:\n", + "cb = ContentBased([\"title_length\", \"movie_year\",\"genre\",\"avg_rating\"], \"ridge_regression\")\n", + "surprise_data = load_ratings(surprise_format=True)\n", + "trainset = surprise_data.build_full_trainset()\n", + "testset = trainset.build_anti_testset()\n", + "cb.fit(trainset)\n", "\n", - " def explain(self, u) : \n", - " if u in self.user_profile_explain :\n", - " return self.user_profile_explain[u]\n", - " else :\n", - " return None\n", "\n", + "#print(\"RMSE: \", cb.rmse(testset))\n", "\n", - "cb = ContentBased(\"combination\", \"svr_regression\")\n", - "sp_ratings = load_ratings(surprise_format=True)\n", - "train_set = sp_ratings.build_full_trainset()\n", - "cb.fit(train_set)\n", "\n", + "#Example explanations for users:\n", "print(cb.explain(11))\n", "\n", "print(cb.explain(13))\n", @@ -752,400 +361,7 @@ "\n", "print(cb.explain(27))\n", "\n", - "print(cb.explain(73))" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "baab88b7", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": 
"stream", - "text": [ - "Matrice TF-IDF des genres :\n" - ] - }, - { - "data": { - "text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - " .dataframe tbody tr th {\n", - " vertical-align: top;\n", - " }\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" class=\"dataframe\">\n", - " <thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>action</th>\n", - " <th>adventure</th>\n", - " <th>animation</th>\n", - " <th>children</th>\n", - " <th>comedy</th>\n", - " <th>crime</th>\n", - " <th>documentary</th>\n", - " <th>drama</th>\n", - " <th>fantasy</th>\n", - " <th>fi</th>\n", - " <th>...</th>\n", - " <th>listed</th>\n", - " <th>musical</th>\n", - " <th>mystery</th>\n", - " <th>no</th>\n", - " <th>noir</th>\n", - " <th>romance</th>\n", - " <th>sci</th>\n", - " <th>thriller</th>\n", - " <th>war</th>\n", - " <th>western</th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th>0</th>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.0</td>\n", - " <td>0.0000</td>\n", - " <td>0.589275</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", - " <td>0.000000</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", - " <td>...</td>\n", - " <td>0.00000</td>\n", - " <td>0.0</td>\n", - " <td>0.0000</td>\n", - " <td>0.00000</td>\n", - " <td>0.0</td>\n", - " <td>0.807933</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", - " <td>0.000000</td>\n", - " </tr>\n", - " <tr>\n", - " <th>1</th>\n", - " <td>0.553377</td>\n", - " <td>0.612756</td>\n", - " <td>0.0</td>\n", - " <td>0.0000</td>\n", - " <td>0.000000</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", - " <td>0.000000</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", - " <td>...</td>\n", - " <td>0.00000</td>\n", - " <td>0.0</td>\n", - " <td>0.0000</td>\n", - " <td>0.00000</td>\n", - " <td>0.0</td>\n", 
- " <td>0.564185</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", - " <td>0.000000</td>\n", - " </tr>\n", - " <tr>\n", - " <th>2</th>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.0</td>\n", - " <td>0.9065</td>\n", - " <td>0.000000</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", - " <td>0.422206</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", - " <td>...</td>\n", - " <td>0.00000</td>\n", - " <td>0.0</td>\n", - " <td>0.0000</td>\n", - " <td>0.00000</td>\n", - " <td>0.0</td>\n", - " <td>0.000000</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", - " <td>0.000000</td>\n", - " </tr>\n", - " <tr>\n", - " <th>3</th>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.0</td>\n", - " <td>0.0000</td>\n", - " <td>0.000000</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", - " <td>0.422206</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", - " <td>...</td>\n", - " <td>0.00000</td>\n", - " <td>0.0</td>\n", - " <td>0.9065</td>\n", - " <td>0.00000</td>\n", - " <td>0.0</td>\n", - " <td>0.000000</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", - " <td>0.000000</td>\n", - " </tr>\n", - " <tr>\n", - " <th>4</th>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.0</td>\n", - " <td>0.0000</td>\n", - " <td>0.589275</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", - " <td>0.000000</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", - " <td>...</td>\n", - " <td>0.00000</td>\n", - " <td>0.0</td>\n", - " <td>0.0000</td>\n", - " <td>0.00000</td>\n", - " <td>0.0</td>\n", - " <td>0.807933</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", - " <td>0.000000</td>\n", - " </tr>\n", - " <tr>\n", - " <th>...</th>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " 
<td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " </tr>\n", - " <tr>\n", - " <th>907</th>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.0</td>\n", - " <td>0.0000</td>\n", - " <td>0.403927</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", - " <td>0.000000</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", - " <td>...</td>\n", - " <td>0.00000</td>\n", - " <td>0.0</td>\n", - " <td>0.0000</td>\n", - " <td>0.00000</td>\n", - " <td>0.0</td>\n", - " <td>0.000000</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", - " <td>0.914791</td>\n", - " </tr>\n", - " <tr>\n", - " <th>908</th>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.0</td>\n", - " <td>0.0000</td>\n", - " <td>0.000000</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", - " <td>0.000000</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", - " <td>...</td>\n", - " <td>0.57735</td>\n", - " <td>0.0</td>\n", - " <td>0.0000</td>\n", - " <td>0.57735</td>\n", - " <td>0.0</td>\n", - " <td>0.000000</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", - " <td>0.000000</td>\n", - " </tr>\n", - " <tr>\n", - " <th>909</th>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.0</td>\n", - " <td>0.0000</td>\n", - " <td>0.000000</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", - " <td>0.000000</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", - " <td>...</td>\n", - " <td>0.00000</td>\n", - " <td>0.0</td>\n", - " <td>0.0000</td>\n", - " <td>0.00000</td>\n", - " <td>0.0</td>\n", - " <td>0.000000</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", - " <td>0.000000</td>\n", - " </tr>\n", - " <tr>\n", - " <th>910</th>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.0</td>\n", - " <td>0.0000</td>\n", - " <td>0.000000</td>\n", - " <td>0.0</td>\n", - 
" <td>0.0</td>\n", - " <td>0.000000</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", - " <td>...</td>\n", - " <td>0.00000</td>\n", - " <td>0.0</td>\n", - " <td>0.0000</td>\n", - " <td>0.00000</td>\n", - " <td>0.0</td>\n", - " <td>0.000000</td>\n", - " <td>0.0</td>\n", - " <td>1.0</td>\n", - " <td>0.0</td>\n", - " <td>0.000000</td>\n", - " </tr>\n", - " <tr>\n", - " <th>911</th>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.0</td>\n", - " <td>0.0000</td>\n", - " <td>0.000000</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", - " <td>1.000000</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", - " <td>...</td>\n", - " <td>0.00000</td>\n", - " <td>0.0</td>\n", - " <td>0.0000</td>\n", - " <td>0.00000</td>\n", - " <td>0.0</td>\n", - " <td>0.000000</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", - " <td>0.000000</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "<p>912 rows × 24 columns</p>\n", - "</div>" - ], - "text/plain": [ - " action adventure animation children comedy crime documentary \\\n", - "0 0.000000 0.000000 0.0 0.0000 0.589275 0.0 0.0 \n", - "1 0.553377 0.612756 0.0 0.0000 0.000000 0.0 0.0 \n", - "2 0.000000 0.000000 0.0 0.9065 0.000000 0.0 0.0 \n", - "3 0.000000 0.000000 0.0 0.0000 0.000000 0.0 0.0 \n", - "4 0.000000 0.000000 0.0 0.0000 0.589275 0.0 0.0 \n", - ".. ... ... ... ... ... ... ... \n", - "907 0.000000 0.000000 0.0 0.0000 0.403927 0.0 0.0 \n", - "908 0.000000 0.000000 0.0 0.0000 0.000000 0.0 0.0 \n", - "909 0.000000 0.000000 0.0 0.0000 0.000000 0.0 0.0 \n", - "910 0.000000 0.000000 0.0 0.0000 0.000000 0.0 0.0 \n", - "911 0.000000 0.000000 0.0 0.0000 0.000000 0.0 0.0 \n", - "\n", - " drama fantasy fi ... listed musical mystery no noir \\\n", - "0 0.000000 0.0 0.0 ... 0.00000 0.0 0.0000 0.00000 0.0 \n", - "1 0.000000 0.0 0.0 ... 0.00000 0.0 0.0000 0.00000 0.0 \n", - "2 0.422206 0.0 0.0 ... 0.00000 0.0 0.0000 0.00000 0.0 \n", - "3 0.422206 0.0 0.0 ... 
0.00000 0.0 0.9065 0.00000 0.0 \n", - "4 0.000000 0.0 0.0 ... 0.00000 0.0 0.0000 0.00000 0.0 \n", - ".. ... ... ... ... ... ... ... ... ... \n", - "907 0.000000 0.0 0.0 ... 0.00000 0.0 0.0000 0.00000 0.0 \n", - "908 0.000000 0.0 0.0 ... 0.57735 0.0 0.0000 0.57735 0.0 \n", - "909 0.000000 0.0 0.0 ... 0.00000 0.0 0.0000 0.00000 0.0 \n", - "910 0.000000 0.0 0.0 ... 0.00000 0.0 0.0000 0.00000 0.0 \n", - "911 1.000000 0.0 0.0 ... 0.00000 0.0 0.0000 0.00000 0.0 \n", - "\n", - " romance sci thriller war western \n", - "0 0.807933 0.0 0.0 0.0 0.000000 \n", - "1 0.564185 0.0 0.0 0.0 0.000000 \n", - "2 0.000000 0.0 0.0 0.0 0.000000 \n", - "3 0.000000 0.0 0.0 0.0 0.000000 \n", - "4 0.807933 0.0 0.0 0.0 0.000000 \n", - ".. ... ... ... ... ... \n", - "907 0.000000 0.0 0.0 0.0 0.914791 \n", - "908 0.000000 0.0 0.0 0.0 0.000000 \n", - "909 0.000000 0.0 0.0 0.0 0.000000 \n", - "910 0.000000 0.0 1.0 0.0 0.000000 \n", - "911 0.000000 0.0 0.0 0.0 0.000000 \n", - "\n", - "[912 rows x 24 columns]" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from pprint import pprint\n", - "\n", - "# Créer une instance de TfidfVectorizer pour les genres\n", - "tfidf_vectorizer = TfidfVectorizer()\n", - "\n", - "# Fit et transform pour calculer la matrice TF-IDF des genres\n", - "tfidf_matrix = tfidf_vectorizer.fit_transform(df_items['genres'])\n", - "\n", - "# Obtenir les noms des genres (features)\n", - "genre_names = tfidf_vectorizer.get_feature_names_out()\n", - "\n", - "# Créer un DataFrame à partir de la matrice TF-IDF des genres\n", - "df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=genre_names)\n", - "\n", - "print(\"Matrice TF-IDF des genres :\")\n", - "display(df_tfidf)" + "print(cb.explain(73))\n" ] }, { @@ -1158,10 +374,18 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 9, "id": "69d12f7d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "user: 1 item: 
10 r_ui = None est = 0.72 {'was_impossible': False}\n" + ] + } + ], "source": [ "def test_contentbased_class(feature_method, regressor_method):\n", " \"\"\"Test the ContentBased class.\n", @@ -1175,72 +399,7 @@ " prediction = content_algo.predict(anti_test_set_first[0], anti_test_set_first[1])\n", " print(prediction)\n", "\n", - "\n", - "\n", - "# print(\"title_length :\")\n", - "# test_contentbased_class(feature_method = \"title_length\" , regressor_method = \"random_score\")\n", - "# test_contentbased_class(feature_method = \"title_length\" , regressor_method = \"random_sample\")\n", - "# test_contentbased_class(feature_method = \"title_length\" , regressor_method = \"linear_regression\")\n", - "# test_contentbased_class(feature_method= \"title_length\", regressor_method= \"svr_regression\")\n", - "# test_contentbased_class(feature_method= \"title_length\", regressor_method= \"gradient_boosting\")\n", - "# test_contentbased_class(feature_method= \"title_length\", regressor_method= \"random_forest\")\n", - "# print(\"\\n\")\n", - "# print(\"movie_year : \")\n", - "# test_contentbased_class(feature_method= \"movie_year\", regressor_method= \"random_score\")\n", - "# test_contentbased_class(feature_method= \"movie_year\", regressor_method= \"random_sample\")\n", - "# test_contentbased_class(feature_method= \"movie_year\", regressor_method= \"linear_regression\")\n", - "# test_contentbased_class(feature_method= \"movie_year\", regressor_method= \"svr_regression\")\n", - "# test_contentbased_class(feature_method= \"movie_year\", regressor_method= \"gradient_boosting\")\n", - "# test_contentbased_class(feature_method= \"movie_year\", regressor_method= \"random_forest\")\n", - "# print(\"\\n\")\n", - "# print(\"relevance : \") \n", - "# test_contentbased_class(feature_method= \"relevance\", regressor_method= \"random_score\")\n", - "# test_contentbased_class(feature_method= \"relevance\", regressor_method= \"random_sample\")\n", - "# 
test_contentbased_class(feature_method= \"relevance\", regressor_method= \"linear_regression\")\n", - "# test_contentbased_class(feature_method= \"relevance\", regressor_method= \"svr_regression\")\n", - "# test_contentbased_class(feature_method= \"relevance\", regressor_method= \"gradient_boosting\")\n", - "# test_contentbased_class(feature_method= \"relevance\", regressor_method= \"random_forest\")\n", - "# print(\"\\n\")\n", - "# print(\"genres : \") \n", - "# test_contentbased_class(feature_method= \"genres\", regressor_method= \"random_score\")\n", - "# test_contentbased_class(feature_method= \"genres\", regressor_method= \"random_sample\")\n", - "# test_contentbased_class(feature_method= \"genres\", regressor_method= \"linear_regression\")\n", - "# test_contentbased_class(feature_method= \"genres\", regressor_method= \"svr_regression\")\n", - "# test_contentbased_class(feature_method= \"genres\", regressor_method= \"gradient_boosting\")\n", - "# test_contentbased_class(feature_method= \"genres\", regressor_method= \"random_forest\")\n", - "# print(\"\\n\")\n", - "# print(\"rating : \")\n", - "# test_contentbased_class(feature_method= \"rating\", regressor_method=\"random_score\")\n", - "# test_contentbased_class(feature_method= \"rating\", regressor_method=\"random_sample\")\n", - "# # test_contentbased_class(feature_method= \"rating\", regressor_method=\"linear_regression\")\n", - "# #test_contentbased_class(feature_method=\"rating\", regressor_method=\"svr_regression\")\n", - "# #test_contentbased_class(feature_method=\"rating\", regressor_method=\"gradient_boosting\")\n", - "# #test_contentbased_class(feature_method=\"rating\", regressor_method=\"random_forest\")\n", - "# print(\"\\n\")\n", - "# print(\"tags : \")\n", - "# test_contentbased_class(feature_method=\"tags\", regressor_method=\"random_score\")\n", - "# test_contentbased_class(feature_method=\"tags\", regressor_method=\"random_sample\")\n", - "# #test_contentbased_class(feature_method=\"tags\", 
regressor_method=\"linear_regression\")\n", - "# # test_contentbased_class(feature_method=\"tags\", regressor_method=\"svr_regression\")\n", - "# # test_contentbased_class(feature_method=\"tags\", regressor_method=\"gradient_boosting\")\n", - "# # test_contentbased_class(feature_method=\"tags\", regressor_method=\"random_forest\")\n", - "# print(\"\\n\")\n", - "# print(\"tags_length : \")\n", - "# test_contentbased_class(feature_method=\"tags_length\", regressor_method=\"random_score\")\n", - "# test_contentbased_class(feature_method=\"tags_length\", regressor_method=\"random_sample\")\n", - "# test_contentbased_class(feature_method=\"tags_length\", regressor_method=\"linear_regression\")\n", - "# test_contentbased_class(feature_method=\"tags_length\", regressor_method=\"svr_regression\")\n", - "# test_contentbased_class(feature_method=\"tags_length\", regressor_method=\"gradient_boosting\")\n", - "# test_contentbased_class(feature_method=\"tags_length\", regressor_method=\"random_forest\")\n", - "\n", - "# print(\"\\n\")\n", - "# print(\"combination : \")\n", - "# test_contentbased_class(feature_method=\"combination\", regressor_method=\"random_score\")\n", - "# test_contentbased_class(feature_method=\"combination\", regressor_method=\"random_sample\")\n", - "# test_contentbased_class(feature_method=\"combination\", regressor_method=\"linear_regression\")\n", - "# test_contentbased_class(feature_method=\"combination\", regressor_method=\"svr_regression\")\n", - "# test_contentbased_class(feature_method=\"combination\", regressor_method=\"gradient_boosting\")\n", - "# test_contentbased_class(feature_method=\"combination\", regressor_method=\"random_forest\")\n" + "test_contentbased_class([\"title_length\", \"movie_year\",\"genre\",\"avg_rating\"], \"ridge_regression\")" ] } ], diff --git a/data/small/evaluations/evaluation_report_2024-05-22.csv b/data/small/evaluations/evaluation_report_2024-05-22.csv new file mode 100644 index 
0000000000000000000000000000000000000000..5f8f0c651c6debc5911f80ba5ce99f2a16d49b3b --- /dev/null +++ b/data/small/evaluations/evaluation_report_2024-05-22.csv @@ -0,0 +1,6 @@ +mae,rmse,hit_rate,novelty +1.6571763025276776,1.8653776413082341,0.014814814814814815,538.5435555555556 +1.5157635089877097,1.859210427636794,0.0014814814814814814,4648.247407407407 +0.8632653600233939,1.0824195251647628,0.005925925925925926,538.5435555555556 +0.6877531258252827,0.8936032538534392,0.01925925925925926,533.9954074074074 +0.7429268968557247,0.9807166886090721,0.0,6516.658222222222 diff --git a/data/test/evaluations/evaluation_report_2024-05-22_test.csv b/data/test/evaluations/evaluation_report_2024-05-22_test.csv new file mode 100644 index 0000000000000000000000000000000000000000..a5678a7e4530b855d30286aae9de3d58659a1879 --- /dev/null +++ b/data/test/evaluations/evaluation_report_2024-05-22_test.csv @@ -0,0 +1,185 @@ +features_method,regressor_method,mae,rmse,hit_rate,novelty +Unknown features,Unknown regressor,1.4375,1.704772712123232,1.0,6.033333333333333 +Unknown features,Unknown regressor,1.1047906549123014,1.2718982459540529,1.0,6.033333333333333 +Unknown features,Unknown regressor,1.1590909090909092,1.3636363636363635,1.0,6.033333333333333 +Unknown features,Unknown regressor,1.411912797135983,1.591814183342947,1.0,6.033333333333333 +['genre'],linear_regression,1.4473754873073326,1.6776813783581366,0.6666666666666666,6.033333333333333 +['genre'],random_forest,0.8532954545454545,0.9835795175558364,1.0,6.033333333333333 +['genre'],lasso_regression,0.7954545454545454,1.0041237288352056,1.0,6.033333333333333 +['genre'],svr_regression,1.7957362529964096,2.0608964237898526,1.0,6.033333333333333 +['genre'],gradient_boosting,1.25,1.5811388300841898,1.0,6.033333333333333 +['genre'],ridge_regression,1.6875,1.976423537605237,1.0,6.033333333333333 +['genre'],elastic_net,1.5,1.6583123951777,1.0,6.033333333333333 +['genre'],decision_tree,1.25,1.4187717304557825,1.0,6.033333333333333 
+['genre'],adaboost,1.5625,2.023301757029831,1.0,6.033333333333333 +['genre'],knn_regression,1.5625,2.084166500066633,1.0,6.033333333333333 +['genre'],xgboost,1.8114174157381058,2.1121808891918574,1.0,6.033333333333333 +['genre'],lightgbm,1.25,1.3900844257611844,1.0,6.033333333333333 +['movie_year'],linear_regression,1.1793559588121263,1.3141843299213294,0.3333333333333333,6.033333333333333 +['movie_year'],random_forest,1.685,1.738001726121122,1.0,6.033333333333333 +['movie_year'],lasso_regression,2.033290207006363,2.2940804500068475,1.0,6.033333333333333 +['movie_year'],svr_regression,2.0,2.384848003542364,0.5,6.033333333333333 +['movie_year'],gradient_boosting,1.6875,2.038688303787511,1.0,6.033333333333333 +['movie_year'],ridge_regression,2.1079545454545454,2.4100019718108596,1.0,6.033333333333333 +['movie_year'],elastic_net,2.0227272727272725,2.6096560601094763,1.0,6.033333333333333 +['movie_year'],decision_tree,2.125,2.318404623873926,1.0,6.033333333333333 +['movie_year'],adaboost,1.3920454545454546,1.71594671843151,1.0,6.033333333333333 +['movie_year'],knn_regression,1.5,1.6583123951777,1.0,6.033333333333333 +['movie_year'],xgboost,2.1235196590423584,2.678605209338892,1.0,6.033333333333333 +['movie_year'],lightgbm,1.8863636363636362,2.408956900217901,1.0,6.033333333333333 +['avg_rating'],linear_regression,1.4044117647058827,1.5860506663643976,0.5,6.033333333333333 +['avg_rating'],random_forest,1.915,2.1330403067952615,1.0,6.033333333333333 +['avg_rating'],lasso_regression,0.7025858070500931,0.9815772285932589,1.0,6.033333333333333 +['avg_rating'],svr_regression,0.8063143949253345,1.060586228071386,0.5,6.033333333333333 +['avg_rating'],gradient_boosting,0.749980078950834,0.9353877265415008,1.0,6.033333333333333 +['avg_rating'],ridge_regression,2.0454545454545454,2.358987999156691,1.0,6.033333333333333 +['avg_rating'],elastic_net,2.125,2.5860201081971503,1.0,6.033333333333333 +['avg_rating'],decision_tree,2.5,2.7950849718747373,1.0,6.033333333333333 
+['avg_rating'],adaboost,2.1875,2.5031230493125984,1.0,6.033333333333333 +['avg_rating'],knn_regression,2.5,2.8939592256975564,0.6666666666666666,6.033333333333333 +['avg_rating'],xgboost,1.438937783241272,1.6884888622389684,1.0,6.033333333333333 +['avg_rating'],lightgbm,1.6875,2.0512858797180313,1.0,6.033333333333333 +['title_length'],linear_regression,1.2958333333333332,1.8809186421192186,1.0,6.033333333333333 +['title_length'],random_forest,0.9375,1.2149263658264091,1.0,6.033333333333333 +['title_length'],lasso_regression,1.9375,2.099106952968333,1.0,6.033333333333333 +['title_length'],svr_regression,2.5,2.8722813232690143,1.0,6.033333333333333 +['title_length'],gradient_boosting,1.75,2.2472401663219115,1.0,6.033333333333333 +['title_length'],ridge_regression,1.953251787924632,2.096761794400394,1.0,6.033333333333333 +['title_length'],elastic_net,1.8125,2.404423007708918,1.0,6.033333333333333 +['title_length'],decision_tree,1.25,1.6770509831248424,1.0,6.033333333333333 +['title_length'],adaboost,1.375,1.9525624189766635,1.0,6.033333333333333 +['title_length'],knn_regression,1.125,1.3693063937629153,1.0,6.033333333333333 +['title_length'],xgboost,1.6263536214828491,1.879353438588852,1.0,6.033333333333333 +['title_length'],lightgbm,1.1742424242424243,1.7479449759985175,1.0,6.033333333333333 +"['genre', 'movie_year']",linear_regression,1.1654492777443672,1.4386998725766413,1.0,6.033333333333333 +"['genre', 'movie_year']",random_forest,1.355,1.5987727085341803,1.0,6.033333333333333 +"['genre', 'movie_year']",lasso_regression,1.6922151766529794,2.1026562882679745,1.0,6.033333333333333 +"['genre', 'movie_year']",svr_regression,1.2653420920250695,1.4938026536014781,1.0,6.033333333333333 +"['genre', 'movie_year']",gradient_boosting,1.4034001355292514,1.7108770020792687,1.0,6.033333333333333 +"['genre', 'movie_year']",ridge_regression,1.4984061821373587,1.8243627906442716,1.0,6.033333333333333 +"['genre', 
'movie_year']",elastic_net,1.4127246300211427,1.5953292965261576,1.0,6.033333333333333 +"['genre', 'movie_year']",decision_tree,3.1875,3.254804141572884,1.0,6.033333333333333 +"['genre', 'movie_year']",adaboost,2.125,2.3048861143232218,1.0,6.033333333333333 +"['genre', 'movie_year']",knn_regression,2.4375,2.7894892005526746,1.0,6.033333333333333 +"['genre', 'movie_year']",xgboost,1.3125,1.8114220932736798,1.0,6.033333333333333 +"['genre', 'movie_year']",lightgbm,1.3625,1.509552913945053,1.0,6.033333333333333 +"['genre', 'avg_rating']",linear_regression,0.8810197834402558,1.1633623833190734,1.0,6.033333333333333 +"['genre', 'avg_rating']",random_forest,1.47875,1.6431714761399678,1.0,6.033333333333333 +"['genre', 'avg_rating']",lasso_regression,1.3125,1.5512092057488571,1.0,6.033333333333333 +"['genre', 'avg_rating']",svr_regression,1.4030189396223172,1.7226838917059923,1.0,6.033333333333333 +"['genre', 'avg_rating']",gradient_boosting,2.9375,3.1770662567847086,1.0,6.033333333333333 +"['genre', 'avg_rating']",ridge_regression,1.5941614495865424,1.7614218970767976,1.0,6.033333333333333 +"['genre', 'avg_rating']",elastic_net,1.4791666666666667,1.6442448396591178,1.0,6.033333333333333 +"['genre', 'avg_rating']",decision_tree,1.5625,1.9598566843370195,0.5,6.033333333333333 +"['genre', 'avg_rating']",adaboost,1.5,1.713913650100261,1.0,6.033333333333333 +"['genre', 'avg_rating']",knn_regression,1.5,1.8874586088176875,1.0,6.033333333333333 +"['genre', 'avg_rating']",xgboost,1.6083768904209137,2.1469591124834313,1.0,6.033333333333333 +"['genre', 'avg_rating']",lightgbm,1.3125,1.4469796128487782,1.0,6.033333333333333 +"['genre', 'title_length']",linear_regression,1.7095903990166095,1.9079361689052665,1.0,6.033333333333333 +"['genre', 'title_length']",random_forest,2.16125,2.3690438472092494,1.0,6.033333333333333 +"['genre', 'title_length']",lasso_regression,1.9095085221807393,2.2649506463335594,1.0,6.033333333333333 +"['genre', 
'title_length']",svr_regression,0.9028659166948557,1.0922570883925715,1.0,6.033333333333333 +"['genre', 'title_length']",gradient_boosting,0.9652308740417632,1.216094887841987,1.0,6.033333333333333 +"['genre', 'title_length']",ridge_regression,1.097871043098452,1.728983663512694,1.0,6.033333333333333 +"['genre', 'title_length']",elastic_net,1.6542403043691702,1.7556295027609992,1.0,6.033333333333333 +"['genre', 'title_length']",decision_tree,0.8125,1.2119199643540823,1.0,6.033333333333333 +"['genre', 'title_length']",adaboost,1.375,1.6393596310755,1.0,6.033333333333333 +"['genre', 'title_length']",knn_regression,1.6875,2.2150056433336687,1.0,6.033333333333333 +"['genre', 'title_length']",xgboost,1.3717930614948273,1.7417813022095112,1.0,6.033333333333333 +"['genre', 'title_length']",lightgbm,1.75,2.25,1.0,6.033333333333333 +"['movie_year', 'avg_rating']",linear_regression,0.6734228085797149,0.8239517444490464,1.0,6.033333333333333 +"['movie_year', 'avg_rating']",random_forest,1.5,1.984313483298443,1.0,6.033333333333333 +"['movie_year', 'avg_rating']",lasso_regression,1.5007812500000028,1.8480867957667793,0.6666666666666666,6.033333333333333 +"['movie_year', 'avg_rating']",svr_regression,1.5,2.1360009363293826,1.0,6.033333333333333 +"['movie_year', 'avg_rating']",gradient_boosting,2.0052481525699046,2.692573423919177,1.0,6.033333333333333 +"['movie_year', 'avg_rating']",ridge_regression,1.5289909638554189,1.6544569693334048,1.0,6.033333333333333 +"['movie_year', 'avg_rating']",elastic_net,1.9375,2.143303524935281,1.0,6.033333333333333 +"['movie_year', 'avg_rating']",decision_tree,2.1875,2.481179155159901,1.0,6.033333333333333 +"['movie_year', 'avg_rating']",adaboost,1.8125,2.143303524935281,1.0,6.033333333333333 +"['movie_year', 'avg_rating']",knn_regression,1.9375,2.3251344047172844,1.0,6.033333333333333 +"['movie_year', 'avg_rating']",xgboost,1.6889803409576416,1.897769890072298,1.0,6.033333333333333 +"['movie_year', 
'avg_rating']",lightgbm,2.0,2.328741546432889,1.0,6.033333333333333 +"['movie_year', 'title_length']",linear_regression,1.2499217118997914,1.5122313117996493,1.0,6.033333333333333 +"['movie_year', 'title_length']",random_forest,1.4825,2.073179924656806,1.0,6.033333333333333 +"['movie_year', 'title_length']",lasso_regression,1.8143768982923731,2.26125435323686,1.0,6.033333333333333 +"['movie_year', 'title_length']",svr_regression,1.8125,2.069118169655856,1.0,6.033333333333333 +"['movie_year', 'title_length']",gradient_boosting,1.5,1.7853571071357126,1.0,6.033333333333333 +"['movie_year', 'title_length']",ridge_regression,1.6804379446793671,2.1523028399883026,1.0,6.033333333333333 +"['movie_year', 'title_length']",elastic_net,2.4326635034049033,2.780356782965454,1.0,6.033333333333333 +"['movie_year', 'title_length']",decision_tree,2.125,2.48746859276655,1.0,6.033333333333333 +"['movie_year', 'title_length']",adaboost,1.6875,1.8624580532189174,0.6666666666666666,6.033333333333333 +"['movie_year', 'title_length']",knn_regression,1.9375,2.143303524935281,1.0,6.033333333333333 +"['movie_year', 'title_length']",xgboost,1.687782883644104,1.97009092981722,1.0,6.033333333333333 +"['movie_year', 'title_length']",lightgbm,1.0056818181818183,1.2201556637960094,1.0,6.033333333333333 +"['avg_rating', 'title_length']",linear_regression,1.4443069306930694,1.841000186873917,1.0,6.033333333333333 +"['avg_rating', 'title_length']",random_forest,1.84125,2.1687525216123666,1.0,6.033333333333333 +"['avg_rating', 'title_length']",lasso_regression,2.090909090909091,2.6987141106904358,1.0,6.033333333333333 +"['avg_rating', 'title_length']",svr_regression,2.2179796680695123,2.5951310459929013,1.0,6.033333333333333 +"['avg_rating', 'title_length']",gradient_boosting,1.4999983399125698,2.0184285217460998,1.0,6.033333333333333 +"['avg_rating', 'title_length']",ridge_regression,0.7230706317708891,1.019417759512073,1.0,6.033333333333333 +"['avg_rating', 
'title_length']",elastic_net,1.3125,1.5967021151824468,1.0,6.033333333333333 +"['avg_rating', 'title_length']",decision_tree,0.8181818181818182,1.2374890432773151,1.0,6.033333333333333 +"['avg_rating', 'title_length']",adaboost,1.5,2.0766559657295187,1.0,6.033333333333333 +"['avg_rating', 'title_length']",knn_regression,2.625,2.883140648667699,1.0,6.033333333333333 +"['avg_rating', 'title_length']",xgboost,1.4955596327781677,1.7791506491564508,0.6666666666666666,6.033333333333333 +"['avg_rating', 'title_length']",lightgbm,1.4375,1.704772712123232,1.0,6.033333333333333 +"['genre', 'movie_year', 'avg_rating']",linear_regression,1.9452454483098371,2.1507860231787106,1.0,6.033333333333333 +"['genre', 'movie_year', 'avg_rating']",random_forest,2.375,2.839454172900137,1.0,6.033333333333333 +"['genre', 'movie_year', 'avg_rating']",lasso_regression,2.107954545454546,2.425171460241574,1.0,6.033333333333333 +"['genre', 'movie_year', 'avg_rating']",svr_regression,2.1647727272727275,2.3927943655970645,1.0,6.033333333333333 +"['genre', 'movie_year', 'avg_rating']",gradient_boosting,0.696366266134975,0.9245168688215822,1.0,6.033333333333333 +"['genre', 'movie_year', 'avg_rating']",ridge_regression,1.9375,2.1578345627040085,1.0,6.033333333333333 +"['genre', 'movie_year', 'avg_rating']",elastic_net,1.7329545454545454,1.9673864653728386,1.0,6.033333333333333 +"['genre', 'movie_year', 'avg_rating']",decision_tree,1.625,1.984313483298443,0.5,6.033333333333333 +"['genre', 'movie_year', 'avg_rating']",adaboost,1.0,1.299038105676658,1.0,6.033333333333333 +"['genre', 'movie_year', 'avg_rating']",knn_regression,2.125,2.4325614836835734,1.0,6.033333333333333 +"['genre', 'movie_year', 'avg_rating']",xgboost,0.7976747588677839,1.1996512624250524,1.0,6.033333333333333 +"['genre', 'movie_year', 'avg_rating']",lightgbm,1.8125,2.0077973005261263,1.0,6.033333333333333 +"['genre', 'movie_year', 'title_length']",linear_regression,1.1511554727356577,1.4283269907432299,1.0,6.033333333333333 
+"['genre', 'movie_year', 'title_length']",random_forest,0.9375,1.286953767623375,1.0,6.033333333333333 +"['genre', 'movie_year', 'title_length']",lasso_regression,1.9654476796952167,2.057096575823434,1.0,6.033333333333333 +"['genre', 'movie_year', 'title_length']",svr_regression,1.0626623467862377,1.3111671549042816,1.0,6.033333333333333 +"['genre', 'movie_year', 'title_length']",gradient_boosting,0.9615501326314708,1.063826267065753,1.0,6.033333333333333 +"['genre', 'movie_year', 'title_length']",ridge_regression,1.8955865714529594,2.0412170823459532,1.0,6.033333333333333 +"['genre', 'movie_year', 'title_length']",elastic_net,2.6875,3.0771334062727926,1.0,6.033333333333333 +"['genre', 'movie_year', 'title_length']",decision_tree,1.375,1.6007810593582121,1.0,6.033333333333333 +"['genre', 'movie_year', 'title_length']",adaboost,2.25,2.5495097567963922,1.0,6.033333333333333 +"['genre', 'movie_year', 'title_length']",knn_regression,2.875,3.0516389039334255,1.0,6.033333333333333 +"['genre', 'movie_year', 'title_length']",xgboost,1.375,1.7677669529663689,1.0,6.033333333333333 +"['genre', 'movie_year', 'title_length']",lightgbm,2.104166666666667,2.3884851079944194,0.6666666666666666,6.033333333333333 +"['genre', 'avg_rating', 'title_length']",linear_regression,0.8154928335413847,1.1606281170743262,0.6666666666666666,6.033333333333333 +"['genre', 'avg_rating', 'title_length']",random_forest,1.34875,1.8100362565429458,0.6666666666666666,6.033333333333333 +"['genre', 'avg_rating', 'title_length']",lasso_regression,1.2187500000000002,1.4027874037073473,1.0,6.033333333333333 +"['genre', 'avg_rating', 'title_length']",svr_regression,1.127125408284661,1.4485854886669336,0.5,6.033333333333333 +"['genre', 'avg_rating', 'title_length']",gradient_boosting,1.75,2.03100960115899,0.6666666666666666,6.033333333333333 +"['genre', 'avg_rating', 'title_length']",ridge_regression,1.8125,2.069118169655856,1.0,6.033333333333333 +"['genre', 'avg_rating', 
'title_length']",elastic_net,1.6303571428571428,1.835915775490226,1.0,6.033333333333333 +"['genre', 'avg_rating', 'title_length']",decision_tree,2.4375,2.5805965019322805,1.0,6.033333333333333 +"['genre', 'avg_rating', 'title_length']",adaboost,1.9375,2.2150056433336687,1.0,6.033333333333333 +"['genre', 'avg_rating', 'title_length']",knn_regression,0.9403409090909091,1.1835378659617484,1.0,6.033333333333333 +"['genre', 'avg_rating', 'title_length']",xgboost,2.6875,2.8777161083053344,1.0,6.033333333333333 +"['genre', 'avg_rating', 'title_length']",lightgbm,1.5700757575757576,1.7515652844875222,1.0,6.033333333333333 +"['movie_year', 'avg_rating', 'title_length']",linear_regression,1.7557054258493372,1.9887479913562542,0.6666666666666666,6.033333333333333 +"['movie_year', 'avg_rating', 'title_length']",random_forest,1.600625,1.7266559081067656,1.0,6.033333333333333 +"['movie_year', 'avg_rating', 'title_length']",lasso_regression,0.8366370842413322,1.368810888302608,1.0,6.033333333333333 +"['movie_year', 'avg_rating', 'title_length']",svr_regression,1.5629929403189624,2.039055332090993,1.0,6.033333333333333 +"['movie_year', 'avg_rating', 'title_length']",gradient_boosting,1.9520765758783163,2.3984081852018986,1.0,6.033333333333333 +"['movie_year', 'avg_rating', 'title_length']",ridge_regression,2.1802325581395365,2.563201914841398,1.0,6.033333333333333 +"['movie_year', 'avg_rating', 'title_length']",elastic_net,1.9801136363636365,2.298925705808356,1.0,6.033333333333333 +"['movie_year', 'avg_rating', 'title_length']",decision_tree,1.3977272727272727,1.7712697971271558,1.0,6.033333333333333 +"['movie_year', 'avg_rating', 'title_length']",adaboost,1.6420454545454546,1.9221012305536531,1.0,6.033333333333333 +"['movie_year', 'avg_rating', 'title_length']",knn_regression,2.8125,2.942150573984955,1.0,6.033333333333333 +"['movie_year', 'avg_rating', 'title_length']",xgboost,1.5625,1.704772712123232,1.0,6.033333333333333 +"['movie_year', 'avg_rating', 
'title_length']",lightgbm,1.4005681818181819,1.7874353424126381,1.0,6.033333333333333 +"['genre', 'movie_year', 'avg_rating', 'title_length']",linear_regression,2.2618903144827494,2.3066045260646746,1.0,6.033333333333333 +"['genre', 'movie_year', 'avg_rating', 'title_length']",random_forest,1.9375,2.2980970388562794,1.0,6.033333333333333 +"['genre', 'movie_year', 'avg_rating', 'title_length']",lasso_regression,1.6222469135802469,1.7958401377770798,1.0,6.033333333333333 +"['genre', 'movie_year', 'avg_rating', 'title_length']",svr_regression,1.313013214298001,1.4569356767282977,1.0,6.033333333333333 +"['genre', 'movie_year', 'avg_rating', 'title_length']",gradient_boosting,1.1875,1.5309310892394863,1.0,6.033333333333333 +"['genre', 'movie_year', 'avg_rating', 'title_length']",ridge_regression,1.375,1.9525624189766635,1.0,6.033333333333333 +"['genre', 'movie_year', 'avg_rating', 'title_length']",elastic_net,1.1515325148823656,1.3471236231883315,1.0,6.033333333333333 +"['genre', 'movie_year', 'avg_rating', 'title_length']",decision_tree,1.8125,2.1286732957408003,1.0,6.033333333333333 +"['genre', 'movie_year', 'avg_rating', 'title_length']",adaboost,1.5,1.9364916731037085,1.0,6.033333333333333 +"['genre', 'movie_year', 'avg_rating', 'title_length']",knn_regression,1.8125,2.069118169655856,1.0,6.033333333333333 +"['genre', 'movie_year', 'avg_rating', 'title_length']",xgboost,1.875,2.1937410968480306,1.0,6.033333333333333 +"['genre', 'movie_year', 'avg_rating', 'title_length']",lightgbm,1.8585227272727276,1.9982010245995632,1.0,6.033333333333333 diff --git a/data/tiny/evaluations/evaluation_report_2024-05-22.csv b/data/tiny/evaluations/evaluation_report_2024-05-22.csv new file mode 100644 index 0000000000000000000000000000000000000000..79a775cae4a41aa5e68ebfec95881014ebd0e1e2 --- /dev/null +++ b/data/tiny/evaluations/evaluation_report_2024-05-22.csv @@ -0,0 +1,24 @@ +features_method,regressor_method,mae,rmse,hit_rate,novelty +Unknown features,Unknown 
regressor,1.5649546827794563,1.7774604240073495,0.08411214953271028,99.40560747663551 +Unknown features,Unknown regressor,1.506078052886924,1.8397201791089532,0.0,429.94299065420563 +Unknown features,Unknown regressor,0.8746908115113954,1.0897093001331002,0.06542056074766354,99.40560747663551 +Unknown features,Unknown regressor,0.7138366989023895,0.9313405510567094,0.16822429906542055,61.73271028037383 +"['movie_year', 'avg_rating']",linear_regression,0.8064711208383942,0.983824686370847,0.009345794392523364,675.9383177570094 +"['genre', 'movie_year', 'avg_rating']",gradient_boosting,0.9711461895227331,1.248394836559089,0.028037383177570093,308.4906542056075 +['avg_rating'],gradient_boosting,0.8590351163143807,1.121240295642085,0.102803738317757,182.49252336448598 +['avg_rating'],lasso_regression,0.7125829169303501,0.9338850255246349,0.0,724.6280373831776 +['genre'],random_forest,0.9199064200916992,1.235119018631717,0.056074766355140186,327.0514018691589 +['genre'],lasso_regression,1.0440014586729254,1.239127258979977,0.056074766355140186,99.40560747663551 +"['avg_rating', 'title_length']",ridge_regression,0.8454994573520899,1.0365690571406192,0.0,571.8523364485982 +['avg_rating'],svr_regression,0.9622470020847163,1.286461375966794,0.018691588785046728,290.4084112149533 +"['genre', 'movie_year', 'title_length']",gradient_boosting,0.9829050110032581,1.23532414575894,0.04672897196261682,363.6448598130841 +"['genre', 'title_length']",svr_regression,0.9590937921698368,1.2054205241611384,0.0,527.6280373831776 +"['genre', 'avg_rating', 'title_length']",linear_regression,0.8261378328560118,1.0311965608643556,0.056074766355140186,115.61495327102804 +"['genre', 'avg_rating']",linear_regression,0.9911160053318097,1.266697175630553,0.018691588785046728,119.30934579439253 +"['genre', 'avg_rating', 'title_length']",knn_regression,1.037088167018069,1.3090384914907294,0.09345794392523364,141.0981308411215 +"['genre', 'movie_year', 
'avg_rating']",xgboost,0.9579212706198381,1.2303094916039912,0.08411214953271028,291.59532710280376 +"['genre', 'title_length']",decision_tree,0.9076211919995862,1.1905523032159628,0.0,239.76822429906542 +['title_length'],random_forest,0.9242463275468562,1.1381511361285654,0.018691588785046728,481.17289719626166 +"['genre', 'title_length']",gradient_boosting,1.068975407551646,1.3464061489576002,0.009345794392523364,413.4607476635514 +"['movie_year', 'title_length']",lightgbm,1.1591172193268287,1.4711169690740502,0.06542056074766354,99.40560747663551 +"['avg_rating', 'title_length']",decision_tree,0.7793934429222078,1.0481330138327167,0.06542056074766354,88.16168224299065 diff --git a/evaluator.ipynb b/evaluator.ipynb index 3f594379da37b248b5392890b82736a41a832f09..5c9d6e6c12f67fe890b0b357181eedd0b2c7c3cf 100644 --- a/evaluator.ipynb +++ b/evaluator.ipynb @@ -288,7 +288,7 @@ "- computing metric rmse\n", "Training loo predictions\n", "Training full predictions\n", - "Handling model 2\n", + "Handling model 1\n", "Training split predictions\n", "- computing metric mae\n", "- computing metric rmse\n", @@ -327,50 +327,50 @@ " <tbody>\n", " <tr>\n", " <th>baseline_1</th>\n", - " <td>1.596677</td>\n", - " <td>1.814364</td>\n", - " <td>0.102804</td>\n", - " <td>99.405607</td>\n", + " <td>1.657176</td>\n", + " <td>1.865378</td>\n", + " <td>0.014815</td>\n", + " <td>538.543556</td>\n", " </tr>\n", " <tr>\n", " <th>baseline_2</th>\n", - " <td>1.504517</td>\n", - " <td>1.836713</td>\n", - " <td>0.000000</td>\n", - " <td>429.942991</td>\n", + " <td>1.515764</td>\n", + " <td>1.859210</td>\n", + " <td>0.001481</td>\n", + " <td>4648.247407</td>\n", " </tr>\n", " <tr>\n", " <th>baseline_3</th>\n", - " <td>0.878197</td>\n", - " <td>1.080797</td>\n", - " <td>0.084112</td>\n", - " <td>99.405607</td>\n", + " <td>0.863265</td>\n", + " <td>1.082420</td>\n", + " <td>0.005926</td>\n", + " <td>538.543556</td>\n", " </tr>\n", " <tr>\n", " <th>baseline_4</th>\n", - " <td>0.721185</td>\n", - " 
<td>0.918754</td>\n", - " <td>0.112150</td>\n", - " <td>54.942056</td>\n", + " <td>0.687753</td>\n", + " <td>0.893603</td>\n", + " <td>0.019259</td>\n", + " <td>533.995407</td>\n", " </tr>\n", " <tr>\n", - " <th>2</th>\n", - " <td>0.849388</td>\n", - " <td>1.037533</td>\n", - " <td>0.028037</td>\n", - " <td>453.141121</td>\n", + " <th>1</th>\n", + " <td>0.742927</td>\n", + " <td>0.980717</td>\n", + " <td>0.000000</td>\n", + " <td>6516.658222</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ - " mae rmse hit_rate novelty\n", - "baseline_1 1.596677 1.814364 0.102804 99.405607\n", - "baseline_2 1.504517 1.836713 0.000000 429.942991\n", - "baseline_3 0.878197 1.080797 0.084112 99.405607\n", - "baseline_4 0.721185 0.918754 0.112150 54.942056\n", - "2 0.849388 1.037533 0.028037 453.141121" + " mae rmse hit_rate novelty\n", + "baseline_1 1.657176 1.865378 0.014815 538.543556\n", + "baseline_2 1.515764 1.859210 0.001481 4648.247407\n", + "baseline_3 0.863265 1.082420 0.005926 538.543556\n", + "baseline_4 0.687753 0.893603 0.019259 533.995407\n", + "1 0.742927 0.980717 0.000000 6516.658222" ] }, "execution_count": 52, diff --git a/models.py b/models.py index 66fdfee8dab3004586972c90eaa04fd95a947153..14424b62ddea3a3bd20774c6bf49b90d674a7cd5 100644 --- a/models.py +++ b/models.py @@ -25,6 +25,7 @@ from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, A from sklearn.tree import DecisionTreeRegressor from sklearn.neighbors import KNeighborsRegressor from xgboost import XGBRegressor +from lightgbm import LGBMRegressor # All the dataframes @@ -200,17 +201,18 @@ class ContentBased(AlgoBase): else: regressor_models = { - 'linear_regression': LinearRegression(fit_intercept=True), # Fit intercept might help - 'svr_regression': SVR(kernel='rbf', C=1.0, epsilon=0.1), # Adjusted C and epsilon for better performance - 'gradient_boosting': GradientBoostingRegressor(n_estimators=200, learning_rate=0.05, max_depth=4), # More estimators 
and smaller learning rate - 'random_forest': RandomForestRegressor(n_estimators=200, max_depth=10, min_samples_split=5), # More estimators and added max_depth and min_samples_split - 'lasso_regression': Lasso(alpha=0.01), # Lower alpha for less regularization - 'ridge_regression': Ridge(alpha=0.5), # Lower alpha for less regularization - 'elastic_net': ElasticNet(alpha=0.5, l1_ratio=0.7), # Adjusted l1_ratio for better balance - 'knn_regression': KNeighborsRegressor(n_neighbors=5), # Increased neighbors for better smoothing - 'decision_tree': DecisionTreeRegressor(max_depth=10, min_samples_split=4), # Increased max_depth and added min_samples_split - 'adaboost': AdaBoostRegressor(n_estimators=100, learning_rate=0.1), # More estimators and added learning rate - 'xgboost': XGBRegressor(n_estimators=200, learning_rate=0.05, max_depth=4) # More estimators and smaller learning rate + 'linear_regression': LinearRegression(fit_intercept=False), + 'svr_regression': SVR(kernel='rbf', C=10, epsilon=0.2), + 'gradient_boosting': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3), + 'random_forest': RandomForestRegressor(n_estimators=100), + 'lasso_regression': Lasso(alpha=0.1), + 'ridge_regression': Ridge(alpha=1.0), + 'elastic_net': ElasticNet(alpha=1.0, l1_ratio=0.5), + 'knn_regression': KNeighborsRegressor(n_neighbors=1), + 'decision_tree': DecisionTreeRegressor(max_depth=5), + 'adaboost': AdaBoostRegressor(n_estimators=50), + 'xgboost': XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3), + 'lightgbm': LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=3) } if self.regressor_method not in regressor_models: @@ -255,20 +257,6 @@ class ContentBased(AlgoBase): return self.user_profile_explain[u] else: return None - - def explain2(self, u): - if u in self.user_profile_explain: - user_explanation = self.user_profile_explain[u] - unique_explanation = {} - for feature in self.features_methods: - if feature == "genre": - genre_weights = 
{genre: user_explanation[genre] for genre in self.content_features.columns if genre in user_explanation} - unique_explanation[feature] = genre_weights - else: - unique_explanation[feature] = user_explanation[feature] - return unique_explanation - else: - return None def rmse(self, testset): """Compute RMSE on the testset""" @@ -289,8 +277,7 @@ class ContentBased(AlgoBase): # Example usage: -# cb = ContentBased(["title_length", "movie_year", "tags"], "svr_regression") -# cb = ContentBased(["movie_year","Romance","avg_rating"], "random_forest") +# cb = ContentBased(["title_length", "movie_year","genre","avg_rating"], "ridge_regression") # surprise_data = load_ratings(surprise_format=True) # trainset = surprise_data.build_full_trainset() # testset = trainset.build_anti_testset() @@ -301,19 +288,17 @@ class ContentBased(AlgoBase): # # Example explanations for users: -# print(cb.explain(11)) +# #print(cb.explain(11)) -# print(cb.explain(13)) +# #print(cb.explain(13)) -# # print(cb.explain(17)) -# print(cb.explain2(17)) -# print("-----\n") +# print(cb.explain(17)) -# print(cb.explain(23)) +#print(cb.explain(23)) -# print(cb.explain(27)) +#print(cb.explain(27)) -# print(cb.explain(73)) +#print(cb.explain(73)) diff --git a/recommender.py b/recommender.py index 0d687d75aeccb6d234ae34d68f0bbc770c869fef..0778db72658c9fcc31b65075449092178222285f 100644 --- a/recommender.py +++ b/recommender.py @@ -1,33 +1,31 @@ # Standard library imports -import numpy as np -import pandas as pd -import requests -from collections import defaultdict import heapq import pickle import random as rd +from collections import defaultdict # Third-party imports -from sklearn.metrics import mean_squared_error -from sklearn.metrics.pairwise import pairwise_distances -from sklearn.preprocessing import MultiLabelBinarizer +import numpy as np +import pandas as pd + +from scipy.stats import pearsonr +from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor from 
sklearn.feature_extraction.text import TfidfVectorizer -from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet -from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, BaggingRegressor, AdaBoostRegressor -from sklearn.tree import DecisionTreeRegressor +from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, Ridge +from sklearn.metrics import mean_squared_error + from sklearn.neighbors import KNeighborsRegressor +from sklearn.preprocessing import MultiLabelBinarizer from sklearn.svm import SVR -from surprise import KNNWithMeans, accuracy, AlgoBase, PredictionImpossible, KNNBasic, Reader, Dataset, SVD -from surprise.model_selection import train_test_split -from surprise.similarities import cosine, msd -import xgboost as xgb +from sklearn.tree import DecisionTreeRegressor +from surprise import AlgoBase, KNNWithMeans, accuracy, PredictionImpossible from xgboost import XGBRegressor -from scipy.stats import pearsonr - +from lightgbm import LGBMRegressor # Local imports -from loaders import load_items, load_ratings from constants import Constant as C +from loaders import load_items, load_ratings + ################################################################################################################# ################################################# load the data ################################################# @@ -46,7 +44,7 @@ testset = trainset.build_anti_testset() class UserBased(AlgoBase): - def __init__(self, k=340, min_k=340, sim_options={}, **kwargs): + def __init__(self, k=20, min_k=20, sim_options={}, **kwargs): """ Initialize the UserBased collaborative filtering algorithm. @@ -172,17 +170,20 @@ class UserBased(AlgoBase): self.mean_ratings = mean_ratings - def get_top_10_pred_ub(self, testset, target_user): + def get_top_n_pred_ub(self, testset, target_user, n=10): """ - Get the top 10 predictions for a specific target user. + Get the top N predictions for a specific target user. 
Args: testset (list): List of testset entries containing (user, item, rating). target_user (int): Target user for whom predictions are needed. + n (int): Number of predictions to return (default: 10). + Returns: + list: Top N predictions for the target user. """ - self.min_k = 340 - self.k = 340 + self.min_k = 20 + self.k = 20 # Get the items the target user has already rated rated_items = set([item for item, rating in self.trainset.ur[self.trainset.to_inner_uid(target_user)]]) @@ -202,13 +203,43 @@ class UserBased(AlgoBase): # Sort the predictions by estimated rating in descending order user_based_predictions.sort(key=lambda x: x[1], reverse=True) - top_10_predictions_ub = user_based_predictions[:10] + top_n_predictions_ub = user_based_predictions[:n] - # Print the top 10 predictions for the target user - print(f"Top 10 predictions for user {target_user}:") - for movie_id, pred in top_10_predictions_ub: + # Print the top N predictions for the target user + print(f"Top {n} predictions for user {target_user}:") + for movie_id, pred in top_n_predictions_ub: print(f"MovieId {movie_id}: {pred}") - return top_10_predictions_ub + return top_n_predictions_ub + + def inter_user_diversity(self, top_n_recommendations): + """ + Calculate the inter-user diversity (IUD) of the recommender system. + + Args: + top_n_recommendations (dict): Dictionary containing top N recommendations for each user. + + Returns: + float: Average pairwise Jaccard distance between recommendations to users. 
+ """ + jaccard_distances = [] + + # Convert top_n_recommendations to a list of sets for easier computation + recommendation_sets = [set([item_id for item_id, _ in recommendations]) for recommendations in top_n_recommendations.values()] + + # Calculate Jaccard distance between all pairs of recommendation sets + for i in range(len(recommendation_sets)): + for j in range(i+1, len(recommendation_sets)): + union_size = len(recommendation_sets[i].union(recommendation_sets[j])) + intersection_size = len(recommendation_sets[i].intersection(recommendation_sets[j])) + jaccard_distances.append(1 - (intersection_size / union_size)) + + # Calculate the average pairwise Jaccard distance + if jaccard_distances: + average_distance = sum(jaccard_distances) / len(jaccard_distances) + else: + average_distance = 0.0 + + return average_distance def evaluate_rmse(self, testset): """ @@ -241,20 +272,25 @@ class UserBased(AlgoBase): Calculate catalog coverage based on the top N recommendations. Args: - top_n_recommendations (list): List of top N recommendations for each user. + top_n_recommendations (list or dict): List or dictionary containing top N recommendations for each user. Returns: float: Catalog coverage ratio. 
""" - all_items = set() recommended_items = set() - for user_recommendations in top_n_recommendations.values(): - for item_id, _ in user_recommendations: - all_items.add(item_id) + all_items = set(range(self.trainset.n_items)) + + if isinstance(top_n_recommendations, dict): + for user_recommendations in top_n_recommendations.values(): + for item_id, _ in user_recommendations: + recommended_items.add(item_id) + elif isinstance(top_n_recommendations, list): + for item_id, _ in top_n_recommendations: recommended_items.add(item_id) + coverage = len(recommended_items) / len(all_items) return coverage - + ########################################################################################################################### ####################################################### KNN MODEL ######################################################## @@ -292,11 +328,11 @@ class RecommenderSystem_KNN : sim_options = { 'name': 'msd', # Mean Squared Difference (Mean Square Error) 'user_based': True, # User-based collaborative filtering - 'min_support': 340 # Minimum number of common ratings required + 'min_support': 20 # Minimum number of common ratings required } # Build and train the KNN model - self.model = KNNWithMeans(sim_options=sim_options, k=340, min_k=340) + self.model = KNNWithMeans(sim_options=sim_options, k=20, min_k=20) self.model.fit(self.trainset) # Evaluate the model @@ -345,22 +381,58 @@ class RecommenderSystem_KNN : return top_n.get(userid, []) + + def inter_user_diversity(self, top_n_recommendations): + """ + Calculate the inter-user diversity (IUD) of the recommender system. + + Args: + top_n_recommendations (dict): Dictionary containing top N recommendations for each user. + + Returns: + float: Average pairwise Jaccard distance between recommendations to users. 
+ """ + jaccard_distances = [] + + # Convert top_n_recommendations to a list of sets for easier computation + recommendation_sets = [set([item_id for item_id, _ in recommendations]) for recommendations in top_n_recommendations.values()] + + # Calculate Jaccard distance between all pairs of recommendation sets + for i in range(len(recommendation_sets)): + for j in range(i+1, len(recommendation_sets)): + union_size = len(recommendation_sets[i].union(recommendation_sets[j])) + intersection_size = len(recommendation_sets[i].intersection(recommendation_sets[j])) + jaccard_distances.append(1 - (intersection_size / union_size)) + + # Calculate the average pairwise Jaccard distance + if jaccard_distances: + average_distance = sum(jaccard_distances) / len(jaccard_distances) + else: + average_distance = 0.0 + + return average_distance + def catalog_coverage(self, top_n_recommendations): """ Calculate catalog coverage based on the top N recommendations. Args: - top_n_recommendations (defaultdict(list)): Dictionary containing top N recommendations for each user. + top_n_recommendations (list or dict): List or dictionary containing top N recommendations for each user. Returns: float: Catalog coverage ratio. 
""" - all_items = set() recommended_items = set() - for user_recommendations in top_n_recommendations.values(): - for item_id, _ in user_recommendations: - all_items.add(item_id) + all_items = set(range(self.trainset.n_items)) + + if isinstance(top_n_recommendations, dict): + for user_recommendations in top_n_recommendations.values(): + for item_id, _ in user_recommendations: + recommended_items.add(item_id) + elif isinstance(top_n_recommendations, list): + for item_id, _ in top_n_recommendations: recommended_items.add(item_id) + coverage = len(recommended_items) / len(all_items) return coverage @@ -404,12 +476,16 @@ class OtherUserBased: data = pd.read_csv(csv_file) return data['movieId'].unique() - def get_top_10_predictions_for_user(self, csv_file): + def get_top_n_predictions_for_user(self, csv_file, n=10): """ - Get the top 100 predictions for the user. + Get the top N predictions for all users. Args: csv_file (str): Path to the CSV file containing item data. + n (int): Number of predictions to return for each user (default: 10). + + Returns: + dict: Dictionary containing top N predictions for each user. """ if hasattr(self, 'model') and self.model is not None: all_item_ids = self.get_all_item_ids_from_csv(csv_file) @@ -426,6 +502,7 @@ class OtherUserBased: print(f"Model for user {self.user_id} ({self.user_name}) could not be loaded.") return None + def evaluate_rmse(self): """ Evaluate the RMSE of the model on the test data. @@ -468,6 +545,36 @@ class OtherUserBased: print(f"Model for user {self.user_id} ({self.user_name}) could not be loaded.") return None + def inter_user_diversity(self, top_n_recommendations): + """ + Calculate the inter-user diversity (IUD) of the recommender system. + + Args: + top_n_recommendations (dict): Dictionary containing top N recommendations for each user. + + Returns: + float: Average pairwise Jaccard distance between recommendations to users. 
+ """ + jaccard_distances = [] + + # Convert top_n_recommendations to a list of sets for easier computation + recommendation_sets = [set([item_id for item_id, _ in recommendations]) for recommendations in top_n_recommendations.values()] + + # Calculate Jaccard distance between all pairs of recommendation sets + for i in range(len(recommendation_sets)): + for j in range(i+1, len(recommendation_sets)): + union_size = len(recommendation_sets[i].union(recommendation_sets[j])) + intersection_size = len(recommendation_sets[i].intersection(recommendation_sets[j])) + jaccard_distances.append(1 - (intersection_size / union_size)) + + # Calculate the average pairwise Jaccard distance + if jaccard_distances: + average_distance = sum(jaccard_distances) / len(jaccard_distances) + else: + average_distance = 0.0 + + return average_distance + def catalog_coverage(self, top_n_predictions): """ Calculate catalog coverage based on the top N predictions. @@ -491,7 +598,7 @@ class OtherUserBased: ########################################################################################################################### class CustomUserBased(UserBased): - def __init__(self, k=340, min_k=340, sim_options={}, **kwargs): + def __init__(self, k=20, min_k=20, sim_options={}, **kwargs): """ Initialize the CustomUserBased collaborative filtering algorithm. 
@@ -633,73 +740,127 @@ def compare_similarity_measures(trainset,testset): return results -# # Example usage: +# # # Example usage: # comparison_results = compare_similarity_measures(trainset ,testset) # print(comparison_results) def evaluate_models(trainset, testset, ratings_path, user_name, user_id): # Entraînement et évaluation du modèle UserBased - user_based_model = UserBased(k=340, min_k=340) + user_based_model = UserBased(k=20, min_k=20) user_based_model.fit(trainset) - top_n_recommendations_ub = user_based_model.get_top_10_pred_ub(testset, user_id) - diversity_ub = user_based_model.catalog_coverage(top_n_recommendations_ub) + top_n_predictions_ub = user_based_model.get_top_n_pred_ub(testset, user_id, n=5000) + diversity_ub = user_based_model.catalog_coverage(top_n_predictions_ub) print("Diversity for UserBased model:", diversity_ub) # Entraînement et évaluation du modèle KNN knn_model = RecommenderSystem_KNN(ratings_path) knn_model.train_knn_model() - top_n_recommendations_knn = knn_model.get_top_n_recommendations(userid=user_id, n=10) - diversity_knn = knn_model.catalog_coverage(top_n_recommendations_knn) + all_predictions_knn = knn_model.get_top_n_recommendations(userid=user_id, n=5000) # Modifiez 100 selon vos besoins + diversity_knn = knn_model.catalog_coverage(all_predictions_knn) print("Diversity for KNN model:", diversity_knn) # Entraînement et évaluation du modèle OtherUserBased other_user_based_model = OtherUserBased(user_name, user_id) other_user_based_model.load_model() - top_n_predictions_other = other_user_based_model.get_top_10_predictions_for_user(ratings_path) - diversity_other = other_user_based_model.catalog_coverage(top_n_predictions_other) + top_n_predictions = other_user_based_model.get_top_n_predictions_for_user(ratings_path, n=10) + diversity_other = other_user_based_model.catalog_coverage(top_n_predictions) print("Diversity for OtherUserBased model:", diversity_other) -# Utilisation de la fonction +# # Utilisation de la fonction +# 
evaluate_models(trainset, testset, "data/small/evidence/ratings.csv", "Adrien", -1) + +def evaluate_inter_user_diversity(user_based_model, ratings_path, other_user_based, trainset, testset): + """ + Evaluate the inter-user diversity of different recommender models. + + Args: + user_based_model (UserBased): Instance of the UserBased model. + ratings_path (str): Path to the ratings data. + other_user_based (OtherUserBased): Instance of the OtherUserBased model. + trainset (Trainset): Training dataset containing user-item ratings. + testset (list): List of testset entries containing (user, item, rating). + + Returns: + dict: Dictionary containing inter-user diversity scores for each model. + """ + inter_user_diversity_scores = {} + + # UserBased model + user_based_model.fit(trainset) + all_top_n_recommendations_ub = {} + for user_id in range(user_based_model.trainset.n_users): + try: + trainset_user_id = user_based_model.trainset.to_raw_uid(user_id) + top_n_recommendations_ub = user_based_model.get_top_n_pred_ub(testset, target_user=trainset_user_id, n=10) + all_top_n_recommendations_ub[trainset_user_id] = top_n_recommendations_ub + except ValueError: + print(f"User {trainset_user_id} is not part of the training set for UserBased model. 
Skipping...") + + inter_user_diversity_scores['UserBased'] = user_based_model.inter_user_diversity(all_top_n_recommendations_ub) + + #KNN model + knn_model = RecommenderSystem_KNN(ratings_path) + knn_model.train_knn_model() + knn_top_n_recommendations = knn_model.get_top_n_recommendations(testset, n=10) + inter_user_diversity_scores['KNN'] = knn_model.inter_user_diversity(knn_top_n_recommendations) + + # OtherUserBased model + other_user_based.load_model() + other_top_n_recommendations = other_user_based.get_top_n_predictions_for_user("data/small/evidence/ratings.csv", n=10) + inter_user_diversity_scores['OtherUserBased'] = other_user_based.inter_user_diversity(other_top_n_recommendations) + + return inter_user_diversity_scores + + +# # Example usage: +# user_based_model = UserBased(k=40, min_k=40) +# ratings = "data/small/evidence/ratings.csv" +# other_user_based = OtherUserBased("Adrien", -1) +# other_user_based_2 = OtherUserBased("Audrey", -2) +# other_user_based_3 = OtherUserBased("Nathanael", -3) +# other_user_based_4 = OtherUserBased("Charles", -4) + +# inter_user_diversity_scores = evaluate_inter_user_diversity(user_based_model, ratings, other_user_based, trainset, testset) +# print("Inter-user Diversity Scores:") +# for model_name, score in inter_user_diversity_scores.items(): +# print(f"{model_name}: {score}") + -evaluate_models(trainset, testset, "data/small/evidence/ratings.csv", "Adrien", -1) ########################################################################################################################### ###################################################### CONTENT-BASED MODEL ################################################ ########################################################################################################################### -def get_top_n(predictions, n): - """Return the top-N recommendation for each user from a set of predictions. 
- Source: inspired by https://github.com/NicolasHug/Surprise/blob/master/examples/top_n_recommendations.py - and modified by cvandekerckh for random tie breaking +def get_top_n(predictions, user_id, n=10): + """ + Return the top-N recommendation for a specific user from a set of predictions. Args: - predictions(list of Prediction objects): The list of predictions, as - returned by the test method of an algorithm. - n(int): The number of recommendation to output for each user. Default - is 10. + predictions(list of Prediction objects): The list of predictions, as returned by the test method of an algorithm. + user_id(str): The user ID for which to return recommendations. + n(int): The number of recommendations to output. Default is 10. Returns: - A dict where keys are user (raw) ids and values are lists of tuples: - [(raw item id, rating estimation), ...] of size n. + A list of tuples: [(raw item id, rating estimation), ...] of size n. """ rd.seed(0) - # First map the predictions to each user. - top_n = defaultdict(list) + # First map the predictions to the specified user. + user_ratings = [] for uid, iid, true_r, est, _ in predictions: - top_n[uid].append((iid, est)) + if uid == user_id: + user_ratings.append((iid, est)) - # Then sort the predictions for each user and retrieve the k highest ones. - for uid, user_ratings in top_n.items(): - rd.shuffle(user_ratings) - user_ratings.sort(key=lambda x: x[1], reverse=True) - top_n[uid] = user_ratings[:n] + # Then sort the predictions for the user and retrieve the k highest ones. 
+ rd.shuffle(user_ratings) + user_ratings.sort(key=lambda x: x[1], reverse=True) + top_n = user_ratings[:n] return top_n - +# Define your ContentBased class class ContentBased(AlgoBase): def __init__(self, features_method, regressor_method): AlgoBase.__init__(self) @@ -713,13 +874,11 @@ class ContentBased(AlgoBase): """Content Analyzer""" df_items = load_items() df_ratings = load_ratings() - df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME) - df_features = pd.DataFrame(index=df_items.index) for method in features_methods: if method == "title_length": - df_title_length = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('title_length') + df_title_length = df_items['title'].apply(lambda x: len(x)).to_frame('title_length') df_features = pd.concat([df_features, df_title_length], axis=1) elif method == "movie_year": @@ -739,35 +898,27 @@ class ContentBased(AlgoBase): else: raise NotImplementedError(f'Feature method {method} not yet implemented') - # Handle missing values in df_features df_features.fillna(0, inplace=True) - return df_features def fit(self, trainset): """Profile Learner""" AlgoBase.fit(self, trainset) - - # Preallocate user profiles self.user_profile = {u: None for u in trainset.all_users()} self.user_profile_explain = {} - - epsilon = 1e-10 # Small value to prevent division by zero + epsilon = 1e-10 for u in trainset.all_users(): raw_user_id = trainset.to_raw_uid(u) self.user_profile_explain[raw_user_id] = {} - user_ratings = np.array([rating for (_, rating) in trainset.ur[u]]) item_ids = [iid for (iid, _) in trainset.ur[u]] raw_item_ids = [trainset.to_raw_iid(iid) for iid in item_ids] - feature_values = self.content_features.loc[raw_item_ids].values norms = np.linalg.norm(feature_values, axis=0) + epsilon weighted_features = feature_values / norms feature_importance = weighted_features.T @ user_ratings feature_importance /= np.sum(user_ratings) - self.user_profile_explain[raw_user_id] = dict(zip(self.content_features.columns, 
feature_importance)) if self.regressor_method == 'random_score': @@ -778,19 +929,21 @@ class ContentBased(AlgoBase): for u in self.user_profile: self.user_profile[u] = [rating for (_, rating) in trainset.ur[u]] + else: regressor_models = { - 'linear_regression': LinearRegression(fit_intercept=True), # Fit intercept might help - 'svr_regression': SVR(kernel='rbf', C=1.0, epsilon=0.1), # Adjusted C and epsilon for better performance - 'gradient_boosting': GradientBoostingRegressor(n_estimators=200, learning_rate=0.05, max_depth=4), # More estimators and smaller learning rate - 'random_forest': RandomForestRegressor(n_estimators=200, max_depth=10, min_samples_split=5), # More estimators and added max_depth and min_samples_split - 'lasso_regression': Lasso(alpha=0.01), # Lower alpha for less regularization - 'ridge_regression': Ridge(alpha=0.5), # Lower alpha for less regularization - 'elastic_net': ElasticNet(alpha=0.5, l1_ratio=0.7), # Adjusted l1_ratio for better balance - 'knn_regression': KNeighborsRegressor(n_neighbors=5), # Increased neighbors for better smoothing - 'decision_tree': DecisionTreeRegressor(max_depth=10, min_samples_split=4), # Increased max_depth and added min_samples_split - 'adaboost': AdaBoostRegressor(n_estimators=100, learning_rate=0.1), # More estimators and added learning rate - 'xgboost': XGBRegressor(n_estimators=200, learning_rate=0.05, max_depth=4) # More estimators and smaller learning rate + 'linear_regression': LinearRegression(fit_intercept=False), + 'svr_regression': SVR(kernel='rbf', C=10, epsilon=0.2), + 'gradient_boosting': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3), + 'random_forest': RandomForestRegressor(n_estimators=100), + 'lasso_regression': Lasso(alpha=0.1), + 'ridge_regression': Ridge(alpha=1.0), + 'elastic_net': ElasticNet(alpha=1.0, l1_ratio=0.5), + 'knn_regression': KNeighborsRegressor(n_neighbors=1), + 'decision_tree': DecisionTreeRegressor(max_depth=5), + 'adaboost': 
AdaBoostRegressor(n_estimators=50), + 'xgboost': XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3), + 'lightgbm': LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=3) } if self.regressor_method not in regressor_models: @@ -800,20 +953,15 @@ class ContentBased(AlgoBase): user_ratings = [rating for (_, rating) in trainset.ur[u]] item_ids = [iid for (iid, _) in trainset.ur[u]] raw_item_ids = [trainset.to_raw_iid(iid) for iid in item_ids] - df_user = pd.DataFrame({'item_id': raw_item_ids, 'user_ratings': user_ratings}) df_user = df_user.merge(self.content_features, left_on="item_id", right_index=True, how='left') - X = df_user.drop(columns=['item_id', 'user_ratings']) y = df_user['user_ratings'] - regressor = regressor_models[self.regressor_method] regressor.fit(X, y) - self.user_profile[u] = regressor def estimate(self, u, i): - """Scoring component used for item filtering""" if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)): raise PredictionImpossible('User and/or item is unknown.') @@ -829,26 +977,6 @@ class ContentBased(AlgoBase): regressor = self.user_profile[u] item_features_df = pd.DataFrame(item_features, columns=self.content_features.columns) return regressor.predict(item_features_df)[0] - - def explain(self, u): - if u in self.user_profile_explain: - return self.user_profile_explain[u] - else: - return None - - def explain2(self, u): - if u in self.user_profile_explain: - user_explanation = self.user_profile_explain[u] - unique_explanation = {} - for feature in self.features_methods: - if feature == "genre": - genre_weights = {genre: user_explanation[genre] for genre in self.content_features.columns if genre in user_explanation} - unique_explanation[feature] = genre_weights - else: - unique_explanation[feature] = user_explanation[feature] - return unique_explanation - else: - return None def rmse(self, testset): """Compute RMSE on the testset""" @@ -867,13 +995,48 @@ class ContentBased(AlgoBase): rmse_value = 
np.sqrt(mse) return rmse_value + def explain(self, u): + if u in self.user_profile_explain: + return self.user_profile_explain[u] + else: + return None +def test_contentbased_class(feature_method, regressor_method, user_id=-1, n=10): + """Test the ContentBased class and get top N recommendations.""" + sp_ratings = load_ratings(surprise_format=True) + train_set = sp_ratings.build_full_trainset() + content_algo = ContentBased(feature_method, regressor_method) + content_algo.fit(train_set) + + anti_test_set = train_set.build_anti_testset() + user_anti_test_set = [entry for entry in anti_test_set if entry[0] == user_id] + + predictions = [] + for uid, iid, _ in user_anti_test_set: + prediction = content_algo.predict(uid, iid) + predictions.append(prediction) + # Get the top-N recommendations for each user + top_n_recommendations = get_top_n(predictions, user_id= user_id, n=n) + # Print the top-N recommendations + print(f"Top {n} recommendations for User {user_id}:") + for iid, est in top_n_recommendations: + print(f"Item {iid}: {est:.2f}") + return top_n_recommendations +# Example usage +#test_contentbased_class(["title_length", "movie_year", "genre"], "gradient_boosting", user_id=-1, n=10) +cb = ContentBased(["title_length", "movie_year","genre","avg_rating"], "ridge_regression") +surprise_data = load_ratings(surprise_format=True) +trainset = surprise_data.build_full_trainset() +testset = trainset.build_anti_testset() +cb.fit(trainset) + +print("RMSE: ", cb.rmse(testset)) ########################################################################################################################### @@ -979,9 +1142,11 @@ class LatentFactorModel: -# # Example usage: +# Example usage: + + # # Load the data -# ratings = pd.read_csv('data/small/evidence/ratings.csv') # Make sure your CSV has columns 'userId', 'movieId', 'rating' +# ratings = pd.read_csv('data/small/evidence/ratings.csv') # # Charger les données des films # movies = pd.read_csv('data/small/content/movies.csv') 
@@ -994,7 +1159,7 @@ class LatentFactorModel: # # Predict a rating for a specific user and movie # user_id = -1 -# movie_id = 4306 +# movie_id = 5218 # predicted_rating = lfm.predict(user_id, movie_id) # print(f"Predicted rating for user {user_id} and movie {movie_id}: {predicted_rating}")