diff --git a/recommender.py b/recommender.py index bf407ea688427efd6bbff0a7328c9ae7c0fe0bbc..3e0bdae7326de44f073cdd7d39b7f4259015bf10 100644 --- a/recommender.py +++ b/recommender.py @@ -13,6 +13,9 @@ from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, Rando from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, Ridge from sklearn.metrics import mean_squared_error +from sklearn.metrics import mean_absolute_error +from surprise.similarities import pearson + from sklearn.neighbors import KNeighborsRegressor from sklearn.preprocessing import MultiLabelBinarizer @@ -25,6 +28,7 @@ from lightgbm import LGBMRegressor # Local imports from constants import Constant as C from loaders import load_items, load_ratings +import matplotlib.pyplot as plt ################################################################################################################# @@ -32,12 +36,9 @@ from loaders import load_items, load_ratings ################################################################################################################# surprise_data = load_ratings(surprise_format=True) - trainset = surprise_data.build_full_trainset() - testset = trainset.build_anti_testset() - ########################################################################################################################### #################################################### USER-BASED MODEL ##################################################### ########################################################################################################################### @@ -268,6 +269,35 @@ class UserBased(AlgoBase): return accuracy.mae(predictions, verbose=True) +# # Construire l'ensemble d'entraînement complet et l'ensemble de test +# surprise_data = load_ratings(surprise_format=True) +# trainset = surprise_data.build_full_trainset() +# testset = trainset.build_anti_testset() + + +# # Valeurs de k à tester +# k_values = range(1, 81, 10) +# rmse_values = [] + +# #Évaluer le modèle pour chaque valeur de k +# for k in k_values: +# print(f"Évaluating for k={k}") +# algo = UserBased(k=k, min_k=k) +# algo.fit(trainset) +# rmse = algo.evaluate_rmse(testset) +# rmse_values.append(rmse) +# print(f"k={k}, RMSE={rmse}") + +# # Tracer le graphique de l'évolution du RMSE en fonction de k +# plt.figure(figsize=(10, 6)) +# plt.plot(k_values, rmse_values, marker='o') +# plt.title('Évolution du RMSE en fonction de k') +# plt.xlabel('Nombre de voisins (k)') +# plt.ylabel('RMSE') +# plt.grid(True) +# plt.show() + + ########################################################################################################################### ####################################################### KNN MODEL ######################################################## ########################################################################################################################### @@ -322,8 +352,6 @@ class RecommenderSystem_KNN : Returns: float: MAE of the model. """ - if self.model is None or self.testset is None: - raise ValueError("Model has not been trained. Call train_knn_model() first.") predictions = self.model.test(self.testset) return accuracy.mae(predictions) @@ -339,9 +367,6 @@ class RecommenderSystem_KNN : Returns: list: Top N recommendations as a list of tuples (item_id, prediction). """ - if self.model is None or self.testset is None: - raise ValueError("Model has not been trained. Call train_knn_model() first.") - # Retrieve the top N recommendations top_n_recommendations = [] @@ -400,7 +425,51 @@ class RecommenderSystem_KNN : average_distance = 0.0 return average_distance - + + def evaluate_knn_rmse_for_different_k(self): + """ + Evaluate the RMSE of the KNN model for different values of k. + """ + # Charger les données (par exemple, à partir d'un fichier de test de Surprise) + + # Split data into training and testing sets + surprise_data = load_ratings(surprise_format=True) + self.trainset = surprise_data.build_full_trainset() + self.testset = self.trainset.build_anti_testset() + + # Valeurs de k à tester + k_values = range(1, 81, 10) + rmse_values = [] + + # Évaluer le modèle pour chaque valeur de k + for k in k_values: + print(f"Évaluating for k={k}") + sim_options = { + 'name': 'msd', + 'user_based': True + } + algo = KNNWithMeans(sim_options=sim_options, k=k, min_k=k) + algo.fit(self.trainset) + predictions = algo.test(self.testset) + rmse = accuracy.rmse(predictions, verbose=False) + rmse_values.append(rmse) + print(f"k={k}, RMSE={rmse}") + + # Tracer le graphique de l'évolution du RMSE en fonction de k + plt.figure(figsize=(10, 6)) + plt.plot(k_values, rmse_values, marker='o') + plt.title('Évolution du RMSE en fonction de k') + plt.xlabel('Nombre de voisins (k)') + plt.ylabel('RMSE') + plt.grid(True) + plt.show() + +# # Utilisation de la classe RecommenderSystem_KNN +# recommender = RecommenderSystem_KNN(ratings_path='data/small/evidence/ratings.csv') +# recommender.evaluate_knn_rmse_for_different_k() + + + ########################################################################################################################### ################################################# OTHER USER-BASED MODEL ################################################## ########################################################################################################################### @@ -483,9 +552,24 @@ class OtherUserBased: predictions = self.model.test(testset) rmse = accuracy.rmse(predictions) return rmse - else: - print(f"Model for user {self.user_id} ({self.user_name}) could not be loaded.") - return None + + def evaluate_mae(self): + """ + Evaluate the MAE of the model on the test data. + + Args: + test_data (DataFrame): A pandas DataFrame with columns ['user_id', 'movie_id', 'rating']. + + Returns: + float: The MAE of the model on the test data. + """ + if self.model is not None: + surprise_data = load_ratings(surprise_format=True) + trainset = surprise_data.build_full_trainset() + testset = trainset.build_anti_testset() + predictions = self.model.test(testset) + mae = accuracy.mae(predictions) + return mae def inter_user_diversity(self, top_n_predictions): """ @@ -511,44 +595,6 @@ class OtherUserBased: inter_user_diversity_score = np.mean(similarities) return inter_user_diversity_score -########################################################################################################################### -####################################################### CUSTOM USER BASED ################################################ -########################################################################################################################### - -class CustomUserBased(UserBased): - def __init__(self, k=20, min_k=20, sim_options={}, **kwargs): - """ - Initialize the CustomUserBased collaborative filtering algorithm. - - Args: - k (int): Number of neighbors to consider (default: 3). - min_k (int): Minimum number of neighbors required to make predictions (default: 1). - sim_options (dict): Options for similarity computation (default: {}). - **kwargs: Additional keyword arguments. - """ - super().__init__(k=k, min_k=min_k, sim_options=sim_options, **kwargs) - - def compute_similarity_matrix(self): - """ - Compute the similarity matrix based on user ratings using the custom similarity metric (Pearson correlation). - """ - n_users = self.trainset.n_users - similarity_matrix = np.eye(n_users) - - for i in range(n_users): - for j in range(i + 1, n_users): - # Compute Pearson correlation coefficient between users i and j - ratings_i = self.ratings_matrix[i] - ratings_j = self.ratings_matrix[j] - common_ratings = np.logical_and(~np.isnan(ratings_i), ~np.isnan(ratings_j)) - if np.sum(common_ratings) >= self.min_k: - pearson_corr, _ = pearsonr(ratings_i[common_ratings], ratings_j[common_ratings]) - similarity_matrix[i, j] = pearson_corr - similarity_matrix[j, i] = pearson_corr - - self.sim = similarity_matrix - - ########################################################################################################################### ####################################################### COMPARISON MODEL ################################################## ########################################################################################################################### @@ -560,13 +606,18 @@ def compare_models(): user_based_model.fit(trainset) testset = trainset.build_anti_testset() rmse_user_based = user_based_model.evaluate_rmse(testset) + mae_user_based = user_based_model.evaluate_mae(testset) print(f"RMSE of the UserBased model: {rmse_user_based}") + print(f"MAE of the UserBased model: {mae_user_based}") # Train and evaluate the KNN model recommender_knn = RecommenderSystem_KNN("data/small/evidence/ratings.csv") recommender_knn.train_knn_model() rmse_knn = recommender_knn.rmse + mae_knn = recommender_knn.evaluate_mae() print(f"RMSE of the KNN model: {rmse_knn}") + print(f"MAE of the KNN model: {mae_knn}") + user_data = [ {"user_name": "Audrey", "user_id": -2}, @@ -579,10 +630,12 @@ def compare_models(): user_model = OtherUserBased(user_name=user_info["user_name"], user_id=user_info["user_id"]) user_model.load_model() rmse = user_model.evaluate_rmse() + mae = user_model.evaluate_mae() print(f"RMSE of the {user_info['user_name']} model: {rmse}") + print(f"MAE of the {user_info['user_name']} model: {mae}") # Call the function to compare the models -# compare_models() +#compare_models() def compare_similarity_measures(trainset,testset): @@ -607,8 +660,19 @@ def compare_similarity_measures(trainset,testset): results['KNN_MSD_RMSE'] = rmse_msd results['KNN_MSD_MAE'] = mae_msd + # Train and evaluate KNN model with Pearson correlation similarity + sim_options_pearson = {'name': 'pearson', 'user_based': True} + knn_pearson = KNNWithMeans(sim_options=sim_options_pearson) + knn_pearson.fit(trainset) + predictions_pearson = knn_pearson.test(testset) + rmse_pearson = accuracy.rmse(predictions_pearson) + mae_pearson = accuracy.mae(predictions_pearson) + results['KNN_Pearson_RMSE'] = rmse_pearson + results['KNN_Pearson_MAE'] = mae_pearson + + # Train and evaluate KNN model with Jaccard similarity - sim_options_jaccard = {'name': 'cosine'} + sim_options_jaccard = {'name': 'jaccard','user_based': True} user_based_jaccard = KNNWithMeans(sim_options=sim_options_jaccard) user_based_jaccard.fit(trainset) predictions_jaccard = user_based_jaccard.test(testset) @@ -618,7 +682,7 @@ def compare_similarity_measures(trainset,testset): results['KNN_Jaccard_MAE'] = mae_jaccard # Train and evaluate UserBased model with MSD similarity - user_based_msd = UserBased(sim_options={'name': 'msd'}) + user_based_msd = UserBased(sim_options={'name': 'msd','user_based': True}) user_based_msd.fit(trainset) predictions_user_based_msd = user_based_msd.test(testset) rmse_user_based_msd = accuracy.rmse(predictions_user_based_msd) @@ -627,7 +691,7 @@ def compare_similarity_measures(trainset,testset): results['UserBased_MSD_MAE'] = mae_user_based_msd # Train and evaluate UserBased model with Jaccard similarity - user_based_jaccard = UserBased(sim_options={'name': 'jaccard'}) + user_based_jaccard = UserBased(sim_options={'name': 'jaccard','user_based': True}) user_based_jaccard.fit(trainset) predictions_user_based_jaccard = user_based_jaccard.test(testset) rmse_user_based_jaccard = accuracy.rmse(predictions_user_based_jaccard) @@ -635,15 +699,14 @@ def compare_similarity_measures(trainset,testset): results['UserBased_Jaccard_RMSE'] = rmse_user_based_jaccard results['UserBased_Jaccard_MAE'] = mae_user_based_jaccard - # Train and evaluate CustomUserBased model with Pearson correlation similarity - custom_user_based_model = CustomUserBased() - custom_user_based_model.fit(trainset) - predictions_custom_user_based = [custom_user_based_model.predict(uid, iid).est for (uid, iid, _) in testset] - rmse_custom_user_based = np.sqrt(np.mean((predictions_custom_user_based - np.array([rating for (_, _, rating) in testset]))**2)) - mae_custom_user_based = np.mean(np.abs(predictions_custom_user_based - np.array([rating for (_, _, rating) in testset]))) - results['CustomUserBased_RMSE'] = rmse_custom_user_based - results['CustomUserBased_MAE'] = mae_custom_user_based - + # Train and evaluate UserBased model with Pearson correlation similarity + user_based_pearson = UserBased(sim_options={'name': 'pearson'}) + user_based_pearson.fit(trainset) + predictions_user_based_pearson = user_based_pearson.test(testset) + rmse_user_based_pearson = accuracy.rmse(predictions_user_based_pearson) + mae_user_based_pearson = accuracy.mae(predictions_user_based_pearson) + results['UserBased_Pearson_RMSE'] = rmse_user_based_pearson + results['UserBased_Pearson_MAE'] = mae_user_based_pearson # Train and evaluate OtherUserBased models for user_name, user_id in [('Adrien', -1), ('Audrey', -2), ('Nathanael', -3), ('Charles', -4)]: @@ -657,40 +720,45 @@ def compare_similarity_measures(trainset,testset): return results # # # Example usage: -# comparison_results = compare_similarity_measures(trainset ,testset) -# print(comparison_results) +comparison_results = compare_similarity_measures(trainset ,testset) +print(comparison_results) -def evaluate_inter_user_diversity(user_based_model, ratings_path, other_user_based_models, trainset, testset): +def evaluate_inter_user_diversity(user_based_model, ratings_path, other_user_based_models, trainset, testset, k, min_k): """ - Evaluate the inter-user diversity of different recommender models. + Evaluate the inter-user diversity of different recommender models with given k and min_k. Args: user_based_model (UserBased): Instance of the UserBased model. ratings_path (str): Path to the ratings data. - other_user_based (OtherUserBased): Instance of the OtherUserBased model. + other_user_based_models (list): List of instances of OtherUserBased models. trainset (Trainset): Training dataset containing user-item ratings. testset (list): List of testset entries containing (user, item, rating). + k (int): Number of neighbors for the UserBased model. + min_k (int): Minimum number of neighbors for the UserBased model. Returns: dict: Dictionary containing inter-user diversity scores for each model. """ inter_user_diversity_scores = {} - # # UserBased model - # user_based_model.fit(trainset) - # all_top_n_recommendations_ub = {} - # for user_id in range(user_based_model.trainset.n_users): - # try: - # trainset_user_id = user_based_model.trainset.to_raw_uid(user_id) - # top_n_recommendations_ub = user_based_model.get_top_n_pred_ub(testset, target_user=trainset_user_id, n=10) - # all_top_n_recommendations_ub[trainset_user_id] = top_n_recommendations_ub - # except ValueError: - # print(f"User {trainset_user_id} is not part of the training set for UserBased model. Skipping...") + # UserBased model + user_based_model.k = k + user_based_model.min_k = min_k + user_based_model.fit(trainset) + all_top_n_recommendations_ub = {} + for user_id in range(user_based_model.trainset.n_users): + try: + trainset_user_id = user_based_model.trainset.to_raw_uid(user_id) + top_n_recommendations_ub = user_based_model.get_top_n_pred_ub(testset, target_user=trainset_user_id, n=10) + all_top_n_recommendations_ub[trainset_user_id] = top_n_recommendations_ub + except ValueError: + print(f"User {trainset_user_id} is not part of the training set for UserBased model. Skipping...") + + inter_user_diversity_scores['UserBased'] = user_based_model.inter_user_diversity(all_top_n_recommendations_ub) - # inter_user_diversity_scores['UserBased'] = user_based_model.inter_user_diversity(all_top_n_recommendations_ub) - # #KNN model + # # #KNN model # knn_model = RecommenderSystem_KNN(ratings_path) # knn_model.train_knn_model() # all_top_n_recommendations_knn = {} @@ -704,33 +772,33 @@ def evaluate_inter_user_diversity(user_based_model, ratings_path, other_user_bas # inter_user_diversity_scores['KNN'] = knn_model.inter_user_diversity(all_top_n_recommendations_knn) - # Other user-based models - for other_model in other_user_based_models: - other_model.load_model() - all_top_n_recommendations_other = {} + # # Other user-based models + # for other_model in other_user_based_models: + # other_model.load_model() + # all_top_n_recommendations_other = {} - # Get predictions for all users in the test set - all_user_ids = set(user for user, _, _ in testset) - for user_id in all_user_ids: - other_model.user_id = user_id # Update the user ID for the model - top_n_predictions = other_model.get_top_n_predictions_for_user(ratings_path, n=10) - all_top_n_recommendations_other[user_id] = top_n_predictions + # # Get predictions for all users in the test set + # all_user_ids = set(user for user, _, _ in testset) + # for user_id in all_user_ids: + # other_model.user_id = user_id # Update the user ID for the model + # top_n_predictions = other_model.get_top_n_predictions_for_user(ratings_path, n=10) + # all_top_n_recommendations_other[user_id] = top_n_predictions - inter_user_diversity_scores[f'Other_{other_model.user_name}'] = other_model.inter_user_diversity(all_top_n_recommendations_other) + # inter_user_diversity_scores[f'Other_{other_model.user_name}'] = other_model.inter_user_diversity(all_top_n_recommendations_other) return inter_user_diversity_scores -# user_based_model = UserBased(k=40, min_k=40) -# ratings = "data/small/evidence/ratings.csv" -# other_user_based_models = [ -# OtherUserBased("Adrien", -1), -# OtherUserBased("Audrey", -2), -# OtherUserBased("Nathanael", -3), -# OtherUserBased("Charles", -4) -# ] +#user_based_model = UserBased(k=10, min_k=10) +ratings = "data/small/evidence/ratings.csv" +other_user_based_models = [ + OtherUserBased("Adrien", -1), + OtherUserBased("Audrey", -2), + OtherUserBased("Nathanael", -3), + OtherUserBased("Charles", -4) +] # inter_user_diversity_scores = evaluate_inter_user_diversity(user_based_model, ratings, other_user_based_models, trainset, testset) @@ -740,7 +808,53 @@ def evaluate_inter_user_diversity(user_based_model, ratings_path, other_user_bas # for model_name, diversity_score in inter_user_diversity_scores.items(): # print(f"{model_name}: {diversity_score}") +# k_values = [10, 20, 30,40,50,60] +# results = [] + +# for k in k_values: +# user_based_model = UserBased(k=k, min_k=k) +# scores = evaluate_inter_user_diversity(user_based_model, ratings, other_user_based_models, trainset, testset, k, k) + +# # Calculate RMSE +# rmse = user_based_model.evaluate_rmse(testset) # Assuming you have a function to calculate RMSE +# scores['RMSE'] = rmse + +# results.append((k, k, scores)) +# # Plotting +# fig, ax1 = plt.subplots(figsize=(10, 6)) + +# # Create lists to store data for plotting lines +# k_values_plt = [] +# user_based_values = [] +# rmse_values = [] + +# # Collect data for plotting +# for result in results: +# k_value, min_k_value, scores = result +# k_values_plt.append(k_value) +# user_based_values.append(scores['UserBased']) +# rmse_values.append(scores['RMSE']) + +# # Plot Inter-user Diversity +# ax1.set_xlabel('k') +# ax1.set_ylabel('Inter-user Diversity', color='tab:blue') +# ax1.set_title('Inter-user Diversity and RMSE vs k') +# ax1.plot(k_values_plt, user_based_values, marker='o', linestyle='-', label='Inter-user Diversity', color='tab:blue') + +# ax1.tick_params(axis='y', labelcolor='tab:blue') +# ax1.legend(loc='upper left') + +# # Create a second y-axis for RMSE +# ax2 = ax1.twinx() +# ax2.set_ylabel('RMSE', color='tab:red') +# ax2.plot(k_values_plt, rmse_values, marker='x', linestyle='--', label='RMSE', color='tab:red') + +# ax2.tick_params(axis='y', labelcolor='tab:red') +# ax2.legend(loc='upper right') + +# plt.grid(True) +# plt.show() ########################################################################################################################### @@ -940,6 +1054,27 @@ def test_contentbased_class(feature_method, regressor_method, user_id=-1, n=10): print(f"Item {iid}: {est:.2f}") return top_n_recommendations +# top_n_recommendations = test_contentbased_class(["genre"], "linear_regression", user_id=-1, n=10) + +# # Load the test ratings +# sp_ratings = load_ratings(surprise_format=True) +# # Calculate RMSE +# content_algo = ContentBased(["genre"],"linear_regression") + +# trainset = surprise_data.build_full_trainset() + +# testset = trainset.build_anti_testset() +# content_algo.fit(trainset) +# rmse = content_algo.rmse(testset) +# print("RMSE:", rmse) + +# # Calculate MAE +# test_ratings = "data/tiny/evidence/ratings.csv" +# predictions = [rating for _, rating in top_n_recommendations] +# true_ratings = [rating for _, rating in test_ratings[test_ratings['userId'] == 1]['rating']] +# mae = mean_absolute_error(true_ratings, predictions) +# print("MAE:", mae) + ########################################################################################################################### ###################################################### LATENT FACTOR MODEL ############################################### @@ -1041,6 +1176,21 @@ class LatentFactorModel: return np.sqrt(mean_squared_error(y_true, y_pred)) + def calculate_mae(self, ratings): + y_true = [] + y_pred = [] + + for _, row in ratings.iterrows(): + user_id, movie_id, rating = row['userId'], row['movieId'], row['rating'] + prediction = self.predict(user_id, movie_id) + if prediction is not None: + y_true.append(rating) + y_pred.append(prediction) + + return mean_absolute_error(y_true, y_pred) + + + # # Load the data # ratings = pd.read_csv('data/small/evidence/ratings.csv') @@ -1057,6 +1207,8 @@ class LatentFactorModel: # final_rmse = lfm.calculate_rmse(ratings) # print(f"Final RMSE after training: {final_rmse:.4f}") +# final_mae = lfm.calculate_mae(ratings) +# print("MAE:", final_mae) # # Predict a rating for a specific user and movie # user_id = -1