From 3500e6cfc9a75e3672078858aa8c865f45751938 Mon Sep 17 00:00:00 2001
From: Adrien <adrien.payen@student.uclouvain.be>
Date: Fri, 24 May 2024 17:31:09 +0200
Subject: [PATCH] update files

---
 Home.py                                    |   6 +-
 .../content_based.ipynb                    |   0
 evaluator.ipynb => backend/evaluator.ipynb |   0
 evaluator.py => backend/evaluator.py       |   0
 website.py => backend/website.py           |   0
 content_based.py                           | 412 ------------------
 6 files changed, 3 insertions(+), 415 deletions(-)
 rename content_based.ipynb => backend/content_based.ipynb (100%)
 rename evaluator.ipynb => backend/evaluator.ipynb (100%)
 rename evaluator.py => backend/evaluator.py (100%)
 rename website.py => backend/website.py (100%)
 delete mode 100644 content_based.py

diff --git a/Home.py b/Home.py
index ded0143d..c70b755e 100644
--- a/Home.py
+++ b/Home.py
@@ -26,7 +26,7 @@ def display_user_movies(df, title, column_name):
 
     # Select the number of movies to display depending on the column
     if column_name == 'top_10':
-        filtered_df = filtered_df.sort_values(by=column_name, ascending=False).head(10)
+        filtered_df = filtered_df.sort_values(by=column_name, ascending=False).head(15)
     else:
         filtered_df = filtered_df.sort_values(by=column_name, ascending=False)
 
@@ -84,7 +84,7 @@ def display_recommendations_ub(user_name, user_id):
     testset = trainset.build_anti_testset()
 
     # Create an instance of the user-based collaborative filtering algorithm
-    recommender = UserBased(k=340, min_k=340)
+    recommender = UserBased(k=60, min_k=60)
     recommender.fit(trainset)
 
     top_10_predictions = recommender.get_top_n_pred_ub(testset, user_id)
@@ -178,7 +178,7 @@ def display_recommendations_latent_factor(user_name, user_id):
 
 
 
-def display_content_based_recommendations(user_name, user_id=-1, n=10):
+def display_content_based_recommendations(user_name, user_id=-1, n=15):
     cols_html = ""
 
     # Call the test_contentbased_class function to get top N recommendations
diff --git a/content_based.ipynb b/backend/content_based.ipynb
similarity index 100%
rename from content_based.ipynb
rename to backend/content_based.ipynb
diff --git a/evaluator.ipynb b/backend/evaluator.ipynb
similarity index 100%
rename from evaluator.ipynb
rename to backend/evaluator.ipynb
diff --git a/evaluator.py b/backend/evaluator.py
similarity index 100%
rename from evaluator.py
rename to backend/evaluator.py
diff --git a/website.py b/backend/website.py
similarity index 100%
rename from website.py
rename to backend/website.py
diff --git a/content_based.py b/content_based.py
deleted file mode 100644
index 18822949..00000000
--- a/content_based.py
+++ /dev/null
@@ -1,412 +0,0 @@
-
-import numpy as np
-import pandas as pd
-import random as rd
-from surprise import AlgoBase
-from surprise.prediction_algorithms.predictions import PredictionImpossible
-
-from loaders import load_ratings
-from loaders import load_items
-from constants import Constant as C
-
-from sklearn.linear_model import LinearRegression
-from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
-from sklearn.svm import SVR
-from sklearn.feature_extraction.text import TfidfVectorizer
-
-
-# All the dataframes
-df_items = load_items()
-df_ratings = load_ratings()
-df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME)
-
-df_features = df_tag[C.TAG]
-
-class ContentBased(AlgoBase):
-    def __init__(self, features_method, regressor_method):
-        AlgoBase.__init__(self)
-        self.regressor_method = regressor_method
-        self.content_features = self.create_content_features(features_method)
-        self.user_profile_explain = {}
-
-    def create_content_features(self, features_method):
-        """Content Analyzer"""
-        df_items = load_items()
-        df_ratings = load_ratings()
-        df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME)
-        df_genome_score = pd.read_csv("data/hackathon/content/genome-scores.csv")
-        df_genome_tag = pd.read_csv("data/hackathon/content/genome-tags.csv")
-
-        if features_method is None:
-            df_features = None
-
-        elif features_method == "relevance" :
-            df_features = df_genome_score.groupby('movieId')["relevance"].transform('mean').to_frame('avg_relevance')
-
-        elif features_method == "title_length": # a naive method that creates only 1 feature based on title length
-            df_features = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('n_character_title')
-
-        elif features_method == "movie_year" :
-            df_features = df_items['movie_year'] = df_items['title'].str.extract(r'\((\d{4})\)', expand=False).to_frame('movie_year')
-
-        elif features_method == "genres" :
-            genres_list = df_items['genres'].str.split('|').explode().unique()
-            for genre in genres_list:
-                df_features = df_items['genres'].str.contains(genre).astype(int).to_frame('genres')
-
-        elif features_method == "combination" :
-            genres_list = df_items['genres'].str.split('|').explode().unique()
-            for genre in genres_list:
-                df_features = df_items['genres'].str.contains(genre).astype(int).to_frame('genres')
-
-            df_features = df_items['movie_year'] = df_items['title'].str.extract(r'\((\d{4})\)', expand=False).to_frame('movie_year')
-
-        elif features_method == "rating" :
-            df_features = df_ratings.groupby('movieId')['rating'].transform('mean').to_frame('avg_rating')
-
-        elif features_method == "tags" :
-            df_features = df_tag['tag'].apply(lambda x: len(x.split(',')) if isinstance(x, str) else 0).to_frame('tags')
-
-        elif features_method == "tags_length" :
-            df_features = df_tag['tag'].apply(lambda x: sum(len(tag) for tag in x.split(',')) if isinstance(x, str) else 0).to_frame('n_character_tags')
-
-        else: # (implement other feature creations here)
-            raise NotImplementedError(f'Feature method {features_method} not yet implemented')
-
-        # Handle missing values in df_features
-        if df_features is not None:
-            df_features.fillna(0, inplace=True)
-
-        return df_features
-
-
-    def fit(self, trainset):
-        """Profile Learner"""
-        AlgoBase.fit(self, trainset)
-
-        # Preallocate user profiles
-        self.user_profile = {u: None for u in trainset.all_users()}
-
-        self.user_profile_explain = {}
-
-        # Loop over all internal user IDs in the trainset
-        for u in trainset.all_users():
-            # Convert internal user ID to raw user ID
-            raw_user_id = trainset.to_raw_uid(u)
-
-            # Initialize feature importance dictionary for the raw user ID
-            self.user_profile_explain[raw_user_id] = {}
-
-            # Extract user ratings for the current user
-            user_ratings = np.array([rating for _, rating in trainset.ur[u]])
-
-            # Compute feature importance based on content features and user ratings
-            feature_values = self.content_features.values.astype(int)
-            weighted_features = feature_values / np.linalg.norm(feature_values)
-            feature_importance = weighted_features / np.sum(user_ratings)
-
-            # Map feature importance scores to feature names and store in user_profile_explain
-            self.user_profile_explain[raw_user_id] = dict(zip(self.content_features.columns, feature_importance))
-
-
-        if self.regressor_method == 'random_score':
-            for u in self.user_profile :
-                self.user_profile[u] = rd.uniform(0.5,5)
-
-        elif self.regressor_method == 'random_sample':
-            for u in self.user_profile:
-                self.user_profile[u] = [rating for _, rating in self.trainset.ur[u]]
-
-        elif self.regressor_method == 'linear_regression' :
-            for u in self.user_profile:
-
-                user_ratings = [rating for _, rating in trainset.ur[u]]
-                item_ids = [iid for iid, _ in trainset.ur[u]]
-
-                df_user = pd.DataFrame({'item_id': item_ids, 'user_ratings': user_ratings})
-
-                df_user["item_id"] = df_user["item_id"].map(trainset.to_raw_iid)
-
-                df_user = df_user.merge(self.content_features, left_on = "item_id", right_index = True, how = 'left')
-
-                if 'n_character_title' in df_user.columns:
-                    # If 'n_character_title' is available as a feature
-                    X = df_user['n_character_title'].values.reshape(-1, 1)
-
-                elif 'avg_relevance' in df_user.columns:
-                    # If 'avg_relevance' is available as a feature
-                    X = df_user['avg_relevance'].values.reshape(-1, 1)
-
-                elif 'movie_year' in df_user.columns:
-                    # If 'movie_year' is available as a feature
-                    X = df_user['movie_year'].values.reshape(-1, 1)
-
-                elif 'combination' in df_user.columns :
-                    X = df_user[['movie_year', 'genres']].values
-
-                elif 'genres' in df_user.columns:
-                    # If 'genres' is available as a feature
-                    X = df_user['genres'].values.reshape(-1, 1)
-
-                elif 'avg_rating' in df_user.columns:
-                    # If 'avg_rating' is available as a feature
-                    X = df_user['avg_rating'].values.reshape(-1, 1)
-
-                elif 'tags' in df_user.columns:
-                    # If the 'tags' feature is available
-                    X = df_user['tags'].values.reshape(-1, 1)
-
-                elif 'n_character_tags' in df_user.columns:
-                    # If the 'n_character_tags' feature is available
-                    X = df_user['n_character_tags'].values.reshape(-1, 1)
-
-                else:
-                    # If no suitable feature is available
-                    continue  # Or handle the error/exception case here
-
-                y = df_user['user_ratings'].values
-
-                linear_regressor = LinearRegression(fit_intercept = False)
-
-                linear_regressor.fit(X,y)
-
-                # Store the computed user profile
-                self.user_profile[u] = linear_regressor
-
-        elif self.regressor_method == 'svr_regression':
-            for u in self.user_profile:
-
-                user_ratings = [rating for _, rating in trainset.ur[u]]
-                item_ids = [iid for iid, _ in trainset.ur[u]]
-
-                df_user = pd.DataFrame({'item_id': item_ids, 'user_ratings': user_ratings})
-
-                df_user["item_id"] = df_user["item_id"].map(trainset.to_raw_iid)
-
-                df_user = df_user.merge(self.content_features, left_on = "item_id", right_index = True, how = 'left')
-
-                if 'n_character_title' in df_user.columns:
-                    # If 'n_character_title' is available as a feature
-                    X = df_user['n_character_title'].values.reshape(-1, 1)
-
-                elif 'avg_relevance' in df_user.columns:
-                    # If 'avg_relevance' is available as a feature
-                    X = df_user['avg_relevance'].values.reshape(-1, 1)
-
-                elif 'movie_year' in df_user.columns:
-                    # If 'movie_year' is available as a feature
-                    X = df_user['movie_year'].values.reshape(-1, 1)
-
-                elif 'genres' in df_user.columns:
-                    # If 'genres' is available as a feature
-                    X = df_user['genres'].values.reshape(-1, 1)
-
-                elif 'avg_rating' in df_user.columns:
-                    # If 'avg_rating' is available as a feature
-                    X = df_user['avg_rating'].values.reshape(-1, 1)
-
-                elif 'tags' in df_user.columns:
-                    # If the 'tags' feature is available
-                    X = df_user['tags'].values.reshape(-1, 1)
-
-                elif 'n_character_tags' in df_user.columns:
-                    # If the 'n_character_tags' feature is available
-                    X = df_user['n_character_tags'].values.reshape(-1, 1)
-
-                else:
-                    # If no suitable feature is available
-                    continue  # Or handle the error/exception case here
-
-                y = df_user['user_ratings'].values
-                svr_regressor = SVR(kernel='rbf', C=10, epsilon=0.2)
-                svr_regressor.fit(X, y)
-                self.user_profile[u] = svr_regressor
-
-        elif self.regressor_method == 'gradient_boosting':
-            for u in self.user_profile:
-
-                user_ratings = [rating for _, rating in trainset.ur[u]]
-                item_ids = [iid for iid, _ in trainset.ur[u]]
-
-                df_user = pd.DataFrame({'item_id': item_ids, 'user_ratings': user_ratings})
-
-                df_user["item_id"] = df_user["item_id"].map(trainset.to_raw_iid)
-
-                df_user = df_user.merge(self.content_features, left_on = "item_id", right_index = True, how = 'left')
-
-                if 'n_character_title' in df_user.columns:
-                    # If 'n_character_title' is available as a feature
-                    X = df_user['n_character_title'].values.reshape(-1, 1)
-
-                elif 'avg_relevance' in df_user.columns:
-                    # If 'avg_relevance' is available as a feature
-                    X = df_user['avg_relevance'].values.reshape(-1, 1)
-
-                elif 'movie_year' in df_user.columns:
-                    # If 'movie_year' is available as a feature
-                    X = df_user['movie_year'].values.reshape(-1, 1)
-
-                elif 'genres' in df_user.columns:
-                    # If 'genres' is available as a feature
-                    X = df_user['genres'].values.reshape(-1, 1)
-
-                elif 'avg_rating' in df_user.columns:
-                    # If 'avg_rating' is available as a feature
-                    X = df_user['avg_rating'].values.reshape(-1, 1)
-
-                elif 'tags' in df_user.columns:
-                    # If the 'tags' feature is available
-                    X = df_user['tags'].values.reshape(-1, 1)
-
-                elif 'n_character_tags' in df_user.columns:
-                    # If the 'n_character_tags' feature is available
-                    X = df_user['n_character_tags'].values.reshape(-1, 1)
-
-                else:
-                    # If no suitable feature is available
-                    continue  # Or handle the error/exception case here
-
-                y = df_user['user_ratings'].values
-                gb_regressor = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)
-                gb_regressor.fit(X, y)
-                self.user_profile[u] = gb_regressor
-
-
-        elif self.regressor_method == 'random_forest':
-            for u in self.user_profile:
-
-                user_ratings = [rating for _, rating in trainset.ur[u]]
-                item_ids = [iid for iid, _ in trainset.ur[u]]
-
-                df_user = pd.DataFrame({'item_id': item_ids, 'user_ratings': user_ratings})
-
-                df_user["item_id"] = df_user["item_id"].map(trainset.to_raw_iid)
-
-                df_user = df_user.merge(self.content_features, left_on = "item_id", right_index = True, how = 'left')
-
-                if 'n_character_title' in df_user.columns:
-                    # If 'n_character_title' is available as a feature
-                    X = df_user['n_character_title'].values.reshape(-1, 1)
-
-                elif 'avg_relevance' in df_user.columns:
-                    # If 'avg_relevance' is available as a feature
-                    X = df_user['avg_relevance'].values.reshape(-1, 1)
-
-                elif 'movie_year' in df_user.columns:
-                    # If 'movie_year' is available as a feature
-                    X = df_user['movie_year'].values.reshape(-1, 1)
-
-                elif 'genres' in df_user.columns:
-                    # If 'genres' is available as a feature
-                    X = df_user['genres'].values.reshape(-1, 1)
-
-                elif 'avg_rating' in df_user.columns:
-                    # If 'avg_rating' is available as a feature
-                    X = df_user['avg_rating'].values.reshape(-1, 1)
-
-                elif 'tags' in df_user.columns:
-                    # If the 'tags' feature is available
-                    X = df_user['tags'].values.reshape(-1, 1)
-
-                elif 'n_character_tags' in df_user.columns:
-                    # If the 'n_character_tags' feature is available
-                    X = df_user['n_character_tags'].values.reshape(-1, 1)
-
-                else:
-                    # If no suitable feature is available
-                    continue  # Or handle the error/exception case here
-
-                y = df_user['user_ratings'].values
-                rf_regressor = RandomForestRegressor(n_estimators=100)
-                rf_regressor.fit(X, y)
-                self.user_profile[u] = rf_regressor
-
-        else :
-            pass
-
-        # (implement here the regressor fitting)
-
-    def estimate(self, u, i):
-        """Scoring component used for item filtering"""
-        # First, handle cases for unknown users and items
-        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
-            raise PredictionImpossible('User and/or item is unknown.')
-
-
-        if self.regressor_method == 'random_score':
-            rd.seed()
-            score = rd.uniform(0.5,5)
-
-        elif self.regressor_method == 'random_sample':
-            rd.seed()
-            score = rd.choice(self.user_profile[u])
-
-        elif self.regressor_method == 'linear_regression':
-
-            raw_item_id = self.trainset.to_raw_iid(i)
-
-            item_features = self.content_features.loc[raw_item_id:raw_item_id, :].values
-
-            linear_regressor = self.user_profile[u]
-
-            score = linear_regressor.predict(item_features)[0]
-
-        elif self.regressor_method == 'svr_regression':
-
-            raw_item_id = self.trainset.to_raw_iid(i)
-
-            item_features = self.content_features.loc[raw_item_id:raw_item_id, :].values
-
-            svr_regressor = self.user_profile[u]
-            score = svr_regressor.predict(item_features)[0]
-
-        elif self.regressor_method == 'gradient_boosting':
-
-            raw_item_id = self.trainset.to_raw_iid(i)
-
-            item_features = self.content_features.loc[raw_item_id:raw_item_id, :].values
-
-            gradient_boosting = self.user_profile[u]
-            score = gradient_boosting.predict(item_features)[0]
-
-        elif self.regressor_method == 'random_forest':
-
-            raw_item_id = self.trainset.to_raw_iid(i)
-
-            item_features = self.content_features.loc[raw_item_id:raw_item_id, :].values
-
-            randomforest = self.user_profile[u]
-            score = randomforest.predict(item_features)[0]
-
-        else :
-            score = None
-
-        # (implement here the regressor prediction)
-
-        return score
-
-    def explain(self, u) :
-        if u in self.user_profile_explain :
-            return self.user_profile_explain[u]
-        else :
-            return None
-
-
-cb = ContentBased("combination", "")
-sp_ratings = load_ratings(surprise_format=True)
-train_set = sp_ratings.build_full_trainset()
-cb.fit(train_set)
-
-print(cb.explain(11))
-
-print(cb.explain(13))
-
-print(cb.explain(17))
-
-print(cb.explain(23))
-
-print(cb.explain(27))
-
-print(cb.explain(73))
\ No newline at end of file
-- 
GitLab