diff --git a/content.py b/content.py
index 0e09f81ffc0e5fdf77c438bbdcdd6a2faaecba2d..b8b97a3681212818e0fd82f9acb19aa20610d570 100644
--- a/content.py
+++ b/content.py
@@ -134,6 +134,10 @@ def merge_csv_files(file1, file2, file3, file4):
# Exemple d'utilisation de la fonction merge_csv_files
merged_df = merge_csv_files("data/small/content/movies.csv", 'data/small/content/links.csv', 'data/small/content/tags.csv', 'data/small/evidence/ratings.csv')
+def get_top_movies_by_genre(df, genre, displayed_movies, num_movies=10):
+ genre_df = df[df['genres'].apply(lambda x: genre in x) & ~df['tmdbId'].isin(displayed_movies)]
+    top_movies = genre_df.sort_values(by='rating', ascending=False).drop_duplicates(subset='tmdbId').head(num_movies)
+ return top_movies
def calculate_year_range(df):
# Extracting the year from the 'title' column
diff --git a/pages/Discover.py b/pages/Discover.py
index 0914a80ef850104933b4f0d5c216507380841770..2727224b82feabe6ccfc84d135ad1d9c507ec45f 100644
--- a/pages/Discover.py
+++ b/pages/Discover.py
@@ -1,60 +1,91 @@
import streamlit as st
import pandas as pd
-from content import genres, calculate_year_range, tag, merged_df, fetch_movie_info, df_links
+from content import genres, calculate_year_range, tag, merged_df, fetch_movie_info, df_links, get_top_movies_by_genre
-def filter_movies_by_genre(df, selected_genres):
- if not selected_genres:
- return df
- return df[df['genres'].apply(lambda x: any(genre in x for genre in selected_genres))]
-def filter_movies_by_year(df, selected_year):
- return df[(df['annee'] >= selected_year[0]) & (df['annee'] <= selected_year[1])]
-def filter_movies_by_tag(df, selected_tags):
- if not selected_tags:
- return df
- df['tag'] = df['tag'].apply(lambda x: str(x).split('|') if pd.notna(x) else [])
- filtered_df = df[df['tag'].apply(lambda tags: any(tag in tags for tag in selected_tags))]
- return filtered_df
+
+def display_top_movies_by_genre(genre, displayed_movies, num_columns=4):
+ st.subheader(f"Top 10 Movies in {genre}")
+ top_movies = get_top_movies_by_genre(merged_df, genre, displayed_movies)
+ unique_tmdbIds = top_movies['tmdbId'].tolist()
+ displayed_movies.update(unique_tmdbIds)
+ display_movies(unique_tmdbIds, num_columns)
+
+
def display_movies(unique_tmdbIds, num_columns):
- st.write("OUR SELECTION")
- for i in range(0, len(unique_tmdbIds), num_columns):
- row_tmdbIds = unique_tmdbIds[i:i+num_columns]
- cols = st.columns(num_columns)
- for idx, tmdbId in enumerate(row_tmdbIds):
+ if unique_tmdbIds:
+ cols_html = ""
+ for tmdbId in unique_tmdbIds:
title, poster_url = fetch_movie_info(tmdbId)
+ movie_title = title.get("title", "Unknown Title") if isinstance(title, dict) else title
if poster_url:
- title = title.get("title", "Unknown Title")
- html_file_url = f"http://localhost:8501/{title.replace(' ', '_')}.html"
- cols[idx].markdown(f'<div style="margin-right: 20px; margin-top: 20px; margin-bottom: 5px;"><a href="{html_file_url}" target="_blank"><img src="{poster_url}" alt="{title}" style="width:150px;height:225px;"></a></div>', unsafe_allow_html=True)
- cols[idx].markdown(f'<div style="margin-right: 20px; margin-bottom: 5px; text-align: center; max-width: 150px;"><a href="{html_file_url}" target="_blank" style="color: white; text-decoration: none; font-size: 14px; word-wrap: break-word;"><b>{title}</b></a></div>', unsafe_allow_html=True)
+ html_file_url = f"http://localhost:8501/{movie_title.replace(' ', '_')}.html"
+                cols_html += f'<div style="display: inline-block; margin-right: 20px;"><a href="{html_file_url}" target="_blank"><img src="{poster_url}" alt="{movie_title}" style="width:150px;height:225px;"><div style="color: white; text-decoration: none; font-size: 14px; text-align: center; max-width: 150px; word-wrap: break-word; white-space: normal;"><b>{movie_title}</b></div></a></div>'
+ else:
+ cols_html += f"<p>{movie_title}</p>"
+
+ st.markdown(f"""
+ <div style="overflow-x: scroll; white-space: nowrap; height: 300px; display: flex; flex-direction: row;">
+ {cols_html}
+ </div>
+ """, unsafe_allow_html=True)
+ else:
+ st.write("No recommendations found.")
+
+
+
def main():
- st.title('NAAC') # Next Access All Content
+    st.title('Discover')  # Discovery page: browse the catalog and per-genre top picks
global df_links
num_columns = 4
- # Existing filtering and display
+ # Sidebar for genre and year selection
selected_genres = st.sidebar.multiselect('Select Genre(s)', genres(merged_df))
min_range, max_range = calculate_year_range(merged_df)
selected_year = st.sidebar.slider('Select Year Range', min_value=min_range, max_value=max_range, value=(min_range, max_range))
filtered_movies = merged_df.copy()
- filtered_movies = filter_movies_by_genre(filtered_movies, selected_genres)
- filtered_movies = filter_movies_by_year(filtered_movies, selected_year)
+ # Here you can add filtering by selected genres and years if needed
+ # filtered_movies = filter_movies_by_genre(filtered_movies, selected_genres)
+ # filtered_movies = filter_movies_by_year(filtered_movies, selected_year)
unique_tags_list = tag(merged_df)
- tag_input = st.sidebar.selectbox("Search by Tag", [""] + sorted(unique_tags_list))
- if tag_input:
- filtered_movies = filter_movies_by_tag(filtered_movies, [tag_input])
+ # tag_input = st.sidebar.selectbox("Search by Tag", [""] + sorted(unique_tags_list))
+ # if tag_input:
+ # filtered_movies = filter_movies_by_tag(filtered_movies, [tag_input])
unique_tmdbIds = filtered_movies.drop_duplicates(subset='tmdbId')['tmdbId'].tolist()
display_movies(unique_tmdbIds, num_columns)
+ # Display top movies for all genres without repetition
+ displayed_movies = set()
+ all_genres = genres(merged_df)
+ for genre in all_genres:
+ display_top_movies_by_genre(genre, displayed_movies, num_columns)
+
if __name__ == "__main__":
main()
+
+
+
+# def filter_movies_by_genre(df, selected_genres):
+# if not selected_genres:
+# return df
+# return df[df['genres'].apply(lambda x: any(genre in x for genre in selected_genres))]
+
+# def filter_movies_by_year(df, selected_year):
+# return df[(df['annee'] >= selected_year[0]) & (df['annee'] <= selected_year[1])]
+
+# def filter_movies_by_tag(df, selected_tags):
+# if not selected_tags:
+# return df
+# df['tag'] = df['tag'].apply(lambda x: str(x).split('|') if pd.notna(x) else [])
+# filtered_df = df[df['tag'].apply(lambda tags: any(tag in tags for tag in selected_tags))]
+# return filtered_df
\ No newline at end of file
diff --git a/recommender.py b/recommender.py
index 0778db72658c9fcc31b65075449092178222285f..bf407ea688427efd6bbff0a7328c9ae7c0fe0bbc 100644
--- a/recommender.py
+++ b/recommender.py
@@ -267,30 +267,6 @@ class UserBased(AlgoBase):
predictions = [self.predict(uid, iid, r_ui=rating) for (uid, iid, rating) in testset]
return accuracy.mae(predictions, verbose=True)
- def catalog_coverage(self, top_n_recommendations):
- """
- Calculate catalog coverage based on the top N recommendations.
-
- Args:
- top_n_recommendations (list or dict): List or dictionary containing top N recommendations for each user.
-
- Returns:
- float: Catalog coverage ratio.
- """
- recommended_items = set()
- all_items = set(range(self.trainset.n_items))
-
- if isinstance(top_n_recommendations, dict):
- for user_recommendations in top_n_recommendations.values():
- for item_id, _ in user_recommendations:
- recommended_items.add(item_id)
- elif isinstance(top_n_recommendations, list):
- for item_id, _ in top_n_recommendations:
- recommended_items.add(item_id)
-
- coverage = len(recommended_items) / len(all_items)
- return coverage
-
###########################################################################################################################
####################################################### KNN MODEL ########################################################
@@ -367,19 +343,32 @@ class RecommenderSystem_KNN :
raise ValueError("Model has not been trained. Call train_knn_model() first.")
# Retrieve the top N recommendations
- top_n = defaultdict(list)
- rated_items = set(self.trainset.ur[userid])
+ top_n_recommendations = []
+
+ # Get the items the target user has already rated
+ rated_items = set([item for item, rating in self.trainset.ur[self.trainset.to_inner_uid(userid)]])
+
+ for iid in range(self.trainset.n_items): # Iterate over all items
+ if iid not in rated_items: # Skip items already rated by the user
+ try:
+                    pred = self.model.predict(userid, self.trainset.to_raw_iid(iid))  # predict() expects the raw movieId, not the inner iid
+ prediction_rating = pred.est # Get estimated rating from prediction object
+ movie_id = self.trainset.to_raw_iid(iid) # Convert iid to movieId
+ top_n_recommendations.append((movie_id, prediction_rating))
+ except PredictionImpossible:
+ # Handle cases where prediction is not possible (user or item unknown)
+ pass
- for uid, iid, true_r, est, _ in self.model.test(self.testset):
- if iid not in rated_items: # Check if item has not been rated by user
- top_n[uid].append((iid, est))
+ # Sort the recommendations by estimated rating in descending order
+ top_n_recommendations.sort(key=lambda x: x[1], reverse=True)
+ top_n_recommendations = top_n_recommendations[:n]
- # Sort the predictions for each user and keep only the top N
- for uid, user_ratings in top_n.items():
- user_ratings.sort(key=lambda x: x[1], reverse=True)
- top_n[uid] = user_ratings[:n]
+ # Optionally print the top N predictions
+ print(f"Top {n} recommendations for user {userid}:")
+ for movie_id, pred in top_n_recommendations:
+ print(f"MovieId {movie_id}: {pred}")
- return top_n.get(userid, [])
+ return top_n_recommendations
def inter_user_diversity(self, top_n_recommendations):
@@ -412,30 +401,6 @@ class RecommenderSystem_KNN :
return average_distance
- def catalog_coverage(self, top_n_recommendations):
- """
- Calculate catalog coverage based on the top N recommendations.
-
- Args:
- top_n_recommendations (list or dict): List or dictionary containing top N recommendations for each user.
-
- Returns:
- float: Catalog coverage ratio.
- """
- recommended_items = set()
- all_items = set(range(self.trainset.n_items))
-
- if isinstance(top_n_recommendations, dict):
- for user_recommendations in top_n_recommendations.values():
- for item_id, _ in user_recommendations:
- recommended_items.add(item_id)
- elif isinstance(top_n_recommendations, list):
- for item_id, _ in top_n_recommendations:
- recommended_items.add(item_id)
-
- coverage = len(recommended_items) / len(all_items)
- return coverage
-
###########################################################################################################################
################################################# OTHER USER-BASED MODEL ##################################################
###########################################################################################################################
@@ -453,6 +418,7 @@ class OtherUserBased:
"""
self.user_name = user_name
self.user_id = user_id
+ self.model = None
# You might initialize any required attributes here
def load_model(self):
@@ -487,7 +453,7 @@ class OtherUserBased:
Returns:
dict: Dictionary containing top N predictions for each user.
"""
- if hasattr(self, 'model') and self.model is not None:
+ if self.model is not None:
all_item_ids = self.get_all_item_ids_from_csv(csv_file)
predictions = []
for item_id in all_item_ids:
@@ -498,9 +464,6 @@ class OtherUserBased:
for item_id, prediction in top_10_predictions:
print(f"Item {item_id} : {prediction}")
return top_10_predictions # Return the predictions here
- else:
- print(f"Model for user {self.user_id} ({self.user_name}) could not be loaded.")
- return None
def evaluate_rmse(self):
@@ -524,74 +487,29 @@ class OtherUserBased:
print(f"Model for user {self.user_id} ({self.user_name}) could not be loaded.")
return None
- def evaluate_mae(self):
- """
- Evaluate the MAE of the model on the test data.
-
- Args:
- test_data (DataFrame): A pandas DataFrame with columns ['user_id', 'movie_id', 'rating'].
-
- Returns:
- float: The MAE of the model on the test data.
- """
- if self.model is not None:
- surprise_data = load_ratings(surprise_format=True)
- trainset = surprise_data.build_full_trainset()
- testset = trainset.build_anti_testset()
- predictions = self.model.test(testset)
- mae = accuracy.mae(predictions)
- return mae
- else:
- print(f"Model for user {self.user_id} ({self.user_name}) could not be loaded.")
- return None
-
- def inter_user_diversity(self, top_n_recommendations):
+ def inter_user_diversity(self, top_n_predictions):
"""
- Calculate the inter-user diversity (IUD) of the recommender system.
+ Calculate the inter-user diversity based on the top N predictions for users.
Args:
- top_n_recommendations (dict): Dictionary containing top N recommendations for each user.
+ top_n_predictions (dict): Dictionary containing top N predictions for each user.
Returns:
- float: Average pairwise Jaccard distance between recommendations to users.
+ float: Inter-user diversity score.
"""
- jaccard_distances = []
-
- # Convert top_n_recommendations to a list of sets for easier computation
- recommendation_sets = [set([item_id for item_id, _ in recommendations]) for recommendations in top_n_recommendations.values()]
-
- # Calculate Jaccard distance between all pairs of recommendation sets
- for i in range(len(recommendation_sets)):
- for j in range(i+1, len(recommendation_sets)):
- union_size = len(recommendation_sets[i].union(recommendation_sets[j]))
- intersection_size = len(recommendation_sets[i].intersection(recommendation_sets[j]))
- jaccard_distances.append(1 - (intersection_size / union_size))
-
- # Calculate the average pairwise Jaccard distance
- if jaccard_distances:
- average_distance = sum(jaccard_distances) / len(jaccard_distances)
- else:
- average_distance = 0.0
-
- return average_distance
-
- def catalog_coverage(self, top_n_predictions):
- """
- Calculate catalog coverage based on the top N predictions.
-
- Args:
- top_n_predictions (list): List of top N predictions for the user.
-
- Returns:
- float: Catalog coverage ratio.
- """
- all_items = set()
- recommended_items = set()
- for item_id, _ in top_n_predictions:
- all_items.add(item_id)
- recommended_items.add(item_id)
- coverage = len(recommended_items) / len(all_items)
- return coverage
+ similarities = []
+ users = list(top_n_predictions.keys())
+ num_users = len(users)
+
+ for i in range(num_users):
+ for j in range(i + 1, num_users):
+ user1_predictions = set(item_id for item_id, _ in top_n_predictions[users[i]])
+ user2_predictions = set(item_id for item_id, _ in top_n_predictions[users[j]])
+ similarity = len(user1_predictions.intersection(user2_predictions)) / float(len(user1_predictions.union(user2_predictions)))
+ similarities.append(similarity)
+
+        inter_user_diversity_score = 1.0 - np.mean(similarities) if similarities else 0.0  # diversity = mean Jaccard distance; 0.0 when <2 users
+ return inter_user_diversity_score
###########################################################################################################################
####################################################### CUSTOM USER BASED ################################################
@@ -631,8 +549,6 @@ class CustomUserBased(UserBased):
self.sim = similarity_matrix
-
-
###########################################################################################################################
####################################################### COMPARISON MODEL ##################################################
###########################################################################################################################
@@ -666,7 +582,7 @@ def compare_models():
print(f"RMSE of the {user_info['user_name']} model: {rmse}")
# Call the function to compare the models
-#compare_models()
+# compare_models()
def compare_similarity_measures(trainset,testset):
@@ -745,32 +661,7 @@ def compare_similarity_measures(trainset,testset):
# print(comparison_results)
-def evaluate_models(trainset, testset, ratings_path, user_name, user_id):
- # Entraînement et évaluation du modèle UserBased
- user_based_model = UserBased(k=20, min_k=20)
- user_based_model.fit(trainset)
- top_n_predictions_ub = user_based_model.get_top_n_pred_ub(testset, user_id, n=5000)
- diversity_ub = user_based_model.catalog_coverage(top_n_predictions_ub)
- print("Diversity for UserBased model:", diversity_ub)
-
- # Entraînement et évaluation du modèle KNN
- knn_model = RecommenderSystem_KNN(ratings_path)
- knn_model.train_knn_model()
- all_predictions_knn = knn_model.get_top_n_recommendations(userid=user_id, n=5000) # Modifiez 100 selon vos besoins
- diversity_knn = knn_model.catalog_coverage(all_predictions_knn)
- print("Diversity for KNN model:", diversity_knn)
-
- # Entraînement et évaluation du modèle OtherUserBased
- other_user_based_model = OtherUserBased(user_name, user_id)
- other_user_based_model.load_model()
- top_n_predictions = other_user_based_model.get_top_n_predictions_for_user(ratings_path, n=10)
- diversity_other = other_user_based_model.catalog_coverage(top_n_predictions)
- print("Diversity for OtherUserBased model:", diversity_other)
-
-# # Utilisation de la fonction
-# evaluate_models(trainset, testset, "data/small/evidence/ratings.csv", "Adrien", -1)
-
-def evaluate_inter_user_diversity(user_based_model, ratings_path, other_user_based, trainset, testset):
+def evaluate_inter_user_diversity(user_based_model, ratings_path, other_user_based_models, trainset, testset):
"""
Evaluate the inter-user diversity of different recommender models.
@@ -786,45 +677,68 @@ def evaluate_inter_user_diversity(user_based_model, ratings_path, other_user_bas
"""
inter_user_diversity_scores = {}
- # UserBased model
- user_based_model.fit(trainset)
- all_top_n_recommendations_ub = {}
- for user_id in range(user_based_model.trainset.n_users):
- try:
- trainset_user_id = user_based_model.trainset.to_raw_uid(user_id)
- top_n_recommendations_ub = user_based_model.get_top_n_pred_ub(testset, target_user=trainset_user_id, n=10)
- all_top_n_recommendations_ub[trainset_user_id] = top_n_recommendations_ub
- except ValueError:
- print(f"User {trainset_user_id} is not part of the training set for UserBased model. Skipping...")
-
- inter_user_diversity_scores['UserBased'] = user_based_model.inter_user_diversity(all_top_n_recommendations_ub)
-
- #KNN model
- knn_model = RecommenderSystem_KNN(ratings_path)
- knn_model.train_knn_model()
- knn_top_n_recommendations = knn_model.get_top_n_recommendations(testset, n=10)
- inter_user_diversity_scores['KNN'] = knn_model.inter_user_diversity(knn_top_n_recommendations)
-
- # OtherUserBased model
- other_user_based.load_model()
- other_top_n_recommendations = other_user_based.get_top_n_predictions_for_user("data/small/evidence/ratings.csv", n=10)
- inter_user_diversity_scores['OtherUserBased'] = other_user_based.inter_user_diversity(other_top_n_recommendations)
+ # # UserBased model
+ # user_based_model.fit(trainset)
+ # all_top_n_recommendations_ub = {}
+ # for user_id in range(user_based_model.trainset.n_users):
+ # try:
+ # trainset_user_id = user_based_model.trainset.to_raw_uid(user_id)
+ # top_n_recommendations_ub = user_based_model.get_top_n_pred_ub(testset, target_user=trainset_user_id, n=10)
+ # all_top_n_recommendations_ub[trainset_user_id] = top_n_recommendations_ub
+ # except ValueError:
+ # print(f"User {trainset_user_id} is not part of the training set for UserBased model. Skipping...")
+
+ # inter_user_diversity_scores['UserBased'] = user_based_model.inter_user_diversity(all_top_n_recommendations_ub)
+
+ # #KNN model
+ # knn_model = RecommenderSystem_KNN(ratings_path)
+ # knn_model.train_knn_model()
+ # all_top_n_recommendations_knn = {}
+ # for user_id in range(knn_model.trainset.n_users):
+ # try:
+ # trainset_user_id = knn_model.trainset.to_raw_uid(user_id)
+ # top_n_recommendations_knn = knn_model.get_top_n_recommendations(trainset_user_id, n=10)
+ # all_top_n_recommendations_knn[trainset_user_id] = top_n_recommendations_knn
+ # except ValueError:
+ # print(f"User {trainset_user_id} is not part of the training set for KNN model. Skipping...")
+
+ # inter_user_diversity_scores['KNN'] = knn_model.inter_user_diversity(all_top_n_recommendations_knn)
+
+ # Other user-based models
+ for other_model in other_user_based_models:
+ other_model.load_model()
+ all_top_n_recommendations_other = {}
+
+ # Get predictions for all users in the test set
+ all_user_ids = set(user for user, _, _ in testset)
+ for user_id in all_user_ids:
+ other_model.user_id = user_id # Update the user ID for the model
+ top_n_predictions = other_model.get_top_n_predictions_for_user(ratings_path, n=10)
+ all_top_n_recommendations_other[user_id] = top_n_predictions
+
+ inter_user_diversity_scores[f'Other_{other_model.user_name}'] = other_model.inter_user_diversity(all_top_n_recommendations_other)
+
return inter_user_diversity_scores
-# # Example usage:
+
# user_based_model = UserBased(k=40, min_k=40)
# ratings = "data/small/evidence/ratings.csv"
-# other_user_based = OtherUserBased("Adrien", -1)
-# other_user_based_2 = OtherUserBased("Audrey", -2)
-# other_user_based_3 = OtherUserBased("Nathanael", -3)
-# other_user_based_4 = OtherUserBased("Charles", -4)
+# other_user_based_models = [
+# OtherUserBased("Adrien", -1),
+# OtherUserBased("Audrey", -2),
+# OtherUserBased("Nathanael", -3),
+# OtherUserBased("Charles", -4)
+# ]
+
-# inter_user_diversity_scores = evaluate_inter_user_diversity(user_based_model, ratings, other_user_based, trainset, testset)
+# inter_user_diversity_scores = evaluate_inter_user_diversity(user_based_model, ratings, other_user_based_models, trainset, testset)
+
+# # Affichage des scores de diversité inter-utilisateurs
# print("Inter-user Diversity Scores:")
-# for model_name, score in inter_user_diversity_scores.items():
-# print(f"{model_name}: {score}")
+# for model_name, diversity_score in inter_user_diversity_scores.items():
+# print(f"{model_name}: {diversity_score}")
@@ -1026,18 +940,6 @@ def test_contentbased_class(feature_method, regressor_method, user_id=-1, n=10):
print(f"Item {iid}: {est:.2f}")
return top_n_recommendations
-# Example usage
-#test_contentbased_class(["title_length", "movie_year", "genre"], "gradient_boosting", user_id=-1, n=10)
-
-cb = ContentBased(["title_length", "movie_year","genre","avg_rating"], "ridge_regression")
-
-surprise_data = load_ratings(surprise_format=True)
-trainset = surprise_data.build_full_trainset()
-testset = trainset.build_anti_testset()
-cb.fit(trainset)
-
-print("RMSE: ", cb.rmse(testset))
-
###########################################################################################################################
###################################################### LATENT FACTOR MODEL ###############################################
@@ -1138,11 +1040,6 @@ class LatentFactorModel:
y_pred.append(prediction)
return np.sqrt(mean_squared_error(y_true, y_pred))
-
-
-
-
-# Example usage:
# # Load the data
@@ -1151,12 +1048,16 @@ class LatentFactorModel:
# movies = pd.read_csv('data/small/content/movies.csv')
-
# # Initialize and train the model
# user_name = "Adrien"
# lfm = LatentFactorModel(num_factors=10, learning_rate=0.01, regularization=0.1, num_epochs=20, user_name = user_name)
# lfm.fit(ratings,movies)
+# # Print RMSE
+# final_rmse = lfm.calculate_rmse(ratings)
+# print(f"Final RMSE after training: {final_rmse:.4f}")
+
+
# # Predict a rating for a specific user and movie
# user_id = -1
# movie_id = 5218
@@ -1165,5 +1066,5 @@ class LatentFactorModel:
# # Get the top 10 ratings for a specific user
-# top_ratings = lfm.top_ratings_for_user(user_id)
-# print(f"Top 10 ratings for user {user_id}: {top_ratings}")
\ No newline at end of file
+# top_ratings = lfm.top_ratings_for_user(user_id, ratings)
+# print(f"Top 10 ratings for user {user_id}: {top_ratings}")