diff --git a/App.py b/App.py
new file mode 100644
index 0000000000000000000000000000000000000000..0927a84b6995b0599901f17e935d9d85aec6e6af
--- /dev/null
+++ b/App.py
@@ -0,0 +1,28 @@
+import streamlit as st
+
+# Start a local server:
+#cd /Users/adrien/vscodeworkspace/recomsys/sitehtml
+#python3 -m http.server 8501
+
+# If you have stopped the local server, here are the steps to restart it
+#lsof -i :8501
+#kill 12345  # replace 12345 with the PID found
+#python3 -m http.server 8501  # restart the local server
+
+
+# Logo in the sidebar
+st.sidebar.image("path/to/your/logo.png", use_column_width=True)
+
+# Page selector in the sidebar
+page = st.sidebar.radio("Navigation", ["Home", "Discover", "Search"])
+
+# Application content
+if page == "Home":
+    st.title("Home")
+    st.write("Home page content...")
+elif page == "Discover":
+    st.title("Discover")
+    st.write("Discover page content...")
+elif page == "Search":
+    st.title("Search")
+    st.write("Search page content...")
\ No newline at end of file
diff --git a/Home.py b/Home.py
index fc42a2fca243e92a0246ec1aefa28d44f00df0a2..a461c88bc4cbe5eeec7b73f65536efaa987b1542 100644
--- a/Home.py
+++ b/Home.py
@@ -3,7 +3,6 @@ import time
 import pandas as pd
 from content import fetch_movie_info, df_links
 from content import df_audrey, df_adrien, df_nathanael, df_charles
-from surprise import Dataset, Reader
 from recommender import OtherUserBased, UserBased, RecommenderSystem_KNN, LatentFactorModel,test_contentbased_class # Importer la classe OtherUserBased
 from loaders import load_ratings
 
@@ -52,7 +51,7 @@ def display_recommendations_tm(user_name, user_id, csv_file):
     top_10_predictions = recommender.get_top_n_predictions_for_user(csv_file)
 
     if top_10_predictions is not None:
-        st.subheader(f"Top 10 Recommendations for {user_name}")
+        st.subheader(f"Explore New Favorites")
         cols_html = ""
         for prediction in top_10_predictions:
             item_id, pred_value = prediction  # Access each element of the tuple directly
@@ -88,7 +87,7 @@ def display_recommendations_ub(user_name, user_id):
     top_10_predictions = recommender.get_top_n_pred_ub(testset, user_id)
 
     if top_10_predictions is not None:
-        st.subheader(f"Top 10 based on similar users of {user_name}")
+        st.subheader(f"User-Approved Recommendations")
         cols_html = ""
         for prediction in top_10_predictions:
             iid, pred = prediction  # Access each element of the tuple directly
@@ -110,7 +109,7 @@ def display_recommendations_ub(user_name, user_id):
         st.write("No recommendations found.")
 
 def display_recommendations_knn(user_name, user_id):
-    st.subheader(f"Top 10 Recommendations for {user_name}")
+    st.subheader(f"Recommended for You")
     cols_html = ""
 
     # Create an instance of the RecommenderSystem_KNN
@@ -141,7 +140,7 @@ def display_recommendations_knn(user_name, user_id):
 
 
 def display_recommendations_latent_factor(user_name, user_id):
-    st.subheader(f"Top 10 Recommendations for {user_name}")
+    st.subheader(f"Unlock Your Next Adventure")
     cols_html = ""
 
     # Load data
@@ -184,7 +183,7 @@ def display_content_based_recommendations(user_name, user_id=-1, n=10):
 
 
     if top_n_recommendations:
-        st.write(f"Top {n} recommendations for User {user_name}:")
+        st.subheader(f"Discover Great Content")
         for iid, est in top_n_recommendations:
             tmdbId = df_links.loc[df_links['movieId'] == iid, 'tmdbId'].values[0]
             title_dict, poster_url = fetch_movie_info(tmdbId)
diff --git a/content.py b/content.py
index b8b97a3681212818e0fd82f9aa20610d570..73da7516e41cf402378a43b60274184fa9c35375 100644
--- a/content.py
+++ b/content.py
@@ -12,7 +12,6 @@ import requests
 
 # Local imports
 from constants import Constant as C
-
 
 ############### IMPORT THE DATAFRAMES AND TRAINSETS ###############
 # import all the dataframe
@@ -22,16 +21,13 @@ df_tags = pd.read_csv('data/small/content/tags.csv')
 df_ratings = pd.read_csv('data/small/evidence/ratings.csv')
 
+# Data set from each user
 df_audrey = pd.read_csv('data/small/users/library_audrey.csv', dtype={'movieId': str}) # Supprimer les points de la colonne 'movieId' et convertir les valeurs en entiers
 df_audrey['movieId'] = df_audrey['movieId'].str.replace('.', '').astype(int) # Afficher les premières lignes du DataFrame pour vérifier les modifications
-
 df_adrien = pd.read_csv('data/small/users/library_adrien.csv', dtype={'movieId': str}) # Supprimer les points de la colonne 'movieId' et convertir les valeurs en entiers
 df_adrien['movieId'] = df_adrien['movieId'].str.replace('.', '').astype(int) # Afficher les premières lignes du DataFrame pour vérifier les modifications
-
-
 df_nathanael = pd.read_csv('data/small/users/library_nathanael.csv', dtype={'movieId': str}) # Supprimer les points de la colonne 'movieId' et convertir les valeurs en entiers
 df_nathanael['movieId'] = df_nathanael['movieId'].str.replace('.', '').astype(int) # Afficher les premières lignes du DataFrame pour vérifier les modifications
-
 df_charles = pd.read_csv('data/small/users/library_charles.csv', dtype={'movieId': str}) # Supprimer les points de la colonne 'movieId' et convertir les valeurs en entiers
 df_charles['movieId'] = df_charles['movieId'].str.replace(' ', '').astype(int) # Afficher les premières lignes du DataFrame pour vérifier les modifications
@@ -134,10 +130,7 @@ def merge_csv_files(file1, file2, file3, file4):
 
 # Exemple d'utilisation de la fonction merge_csv_files
 merged_df = merge_csv_files("data/small/content/movies.csv", 'data/small/content/links.csv', 'data/small/content/tags.csv', 'data/small/evidence/ratings.csv')
 
-def get_top_movies_by_genre(df, genre, displayed_movies, num_movies=10):
-    genre_df = df[df['genres'].apply(lambda x: genre in x) & ~df['tmdbId'].isin(displayed_movies)]
-    top_movies = genre_df.sort_values(by='rating', ascending=False).head(num_movies)
-    return top_movies
+
 
 def calculate_year_range(df):
     # Extracting the year from the 'title' column
@@ -157,6 +150,10 @@ print("Nombre de films uniques :", nombre_films_uniques)
 
 min_range, max_range = calculate_year_range(merged_df)
 
+def get_top_movies_by_genre(df, genre, displayed_movies, num_movies=10):
+    genre_df = df[df['genres'].apply(lambda x: genre in x) & ~df['tmdbId'].isin(displayed_movies)]
+    top_movies = genre_df.drop_duplicates(subset='tmdbId').sort_values(by='rating', ascending=False).head(num_movies)['tmdbId']
+    return top_movies
 
 # Year
 def year(merged_df, ascending=True):
@@ -175,6 +172,12 @@ def unrated_movies_count(df_ratings, df_movies):
 
 unrated_movies_count(df_ratings, df_movies)
 
+def unrated_movies_count_merged_df(df_ratings, merged_df):
+    rated_movies = df_ratings['movieId'].unique() if 'movieId' in df_ratings.columns else []
+    unrated_movies_count = merged_df[~merged_df['movieId'].isin(rated_movies)].shape[0]
+    print("Number of movies that have not yet been rated:", unrated_movies_count)
+
+unrated_movies_count_merged_df(df_ratings, merged_df)
 
 
 # LIST OF MOVIE GENRES
@@ -191,7 +194,7 @@ def genres(merged_df):
     merged_df['genres'] = merged_df['genres'].apply(convert_to_str)
     merged_df['genres'] = merged_df['genres'].str.split('|')
     merged_df = merged_df.explode('genres')
-    unique_genres = sorted(merged_df['genres'].unique())
+    unique_genres = sorted(merged_df[merged_df['genres'] != '(no genres listed)']['genres'].unique())
     return unique_genres
 
 
diff --git a/data/Logo_NAAC-removebg-preview.png b/data/Logo_NAAC-removebg-preview.png
new file mode 100644
index 0000000000000000000000000000000000000000..d9ed208b2905d78ab9cffd5294db73d74e83f38d
Binary files /dev/null and b/data/Logo_NAAC-removebg-preview.png differ
diff --git a/pages/Discover.py b/pages/Discover.py
index 2727224b82feabe6ccfc84d135ad1d9c507ec45f..d145f820782522eebf95c804ba085956679508c3 100644
--- a/pages/Discover.py
+++ b/pages/Discover.py
@@ -1,91 +1,94 @@
 import streamlit as st
 import pandas as pd
+from content import df_movies, df_ratings
 from content import genres, calculate_year_range, tag, merged_df, fetch_movie_info, df_links, get_top_movies_by_genre
-
-
-
-def display_top_movies_by_genre(genre, displayed_movies, num_columns=4):
-    st.subheader(f"Top 10 Movies in {genre}")
-    top_movies = get_top_movies_by_genre(merged_df, genre, displayed_movies)
-    unique_tmdbIds = top_movies['tmdbId'].tolist()
-    displayed_movies.update(unique_tmdbIds)
-    display_movies(unique_tmdbIds, num_columns)
-
-
-
-def display_movies(unique_tmdbIds, num_columns):
-    if unique_tmdbIds:
+def display_unrated_movies(df_ratings, merged_df):
+    st.header(f"Hidden Gems")
+    rated_movies = df_ratings['movieId'].unique() if 'movieId' in df_ratings.columns else []
+    unrated_movies_df = merged_df[~merged_df['movieId'].isin(rated_movies)]
+    unique_unrated_movies_df = unrated_movies_df.drop_duplicates(subset='tmdbId')
+    sorted_movies_df = unique_unrated_movies_df.sort_values(by='tmdbId', ascending=False).head(10)
+    cols_html = ""
+    for idx, row in sorted_movies_df.iterrows():
+        title_dict, poster_url = fetch_movie_info(row['tmdbId'])
+        movie_title = title_dict.get("title", "Unknown Title") if isinstance(title_dict, dict) else title_dict
+        if poster_url:
+            html_file_url = f"http://localhost:8501/{movie_title.replace(' ', '_')}.html"
+            cols_html += f'<div style="display: inline-block; margin-right: 20px;"><a href="{html_file_url}" target="_blank"><img src="{poster_url}" alt="{movie_title}" style="width:150px;height:225px;"></a><div style="color: white; text-decoration: none; font-size: 14px; text-align: center; max-width: 150px; word-wrap: break-word; white-space: normal;"><b>{movie_title}</b></div></a></div>'
+        else:
+            cols_html += f"<p>{movie_title}</p>"
+
+    st.markdown(f"""
+        <div style="overflow-x: scroll; white-space: nowrap; height: 300px; display: flex; flex-direction: row;">
+            {cols_html}
+        </div>
+        """, unsafe_allow_html=True)
+
+def display_movies_year(merged_df):
+    st.header(f"New this year")
+    cols_html = ""
+    unique_movies_df = merged_df.drop_duplicates(subset='tmdbId')  # Remove duplicates based on tmdbId
+    sorted_movies_df = unique_movies_df.sort_values(by='annee', ascending=False)  # Sort by release year in descending order
+    top_10_tmdbIds = sorted_movies_df['tmdbId'].head(10)  # Take the first 10 movies by TMDB ID
+    for tmdbId in top_10_tmdbIds:
+        title_dict, poster_url = fetch_movie_info(tmdbId)
+        movie_title = title_dict.get("title", "Unknown Title") if isinstance(title_dict, dict) else title_dict
+        if poster_url:
+            html_file_url = f"http://localhost:8501/{movie_title.replace(' ', '_')}.html"
+            cols_html += f'<div style="display: inline-block; margin-right: 20px;"><a href="{html_file_url}" target="_blank"><img src="{poster_url}" alt="{movie_title}" style="width:150px;height:225px;"></a><div style="color:
white; text-decoration: none; font-size: 14px; text-align: center; max-width: 150px; word-wrap: break-word; white-space: normal;"><b>{movie_title}</b></div></a></div>' + else: + cols_html += f"<p>{movie_title}</p>" + st.markdown(f""" + <div style="overflow-x: scroll; white-space: nowrap; height: 300px; display: flex; flex-direction: row;"> + {cols_html} + </div> + """, unsafe_allow_html=True) + + +def display_movies_by_genre(df, num_movies=10): + unique_genres = genres(df) + for genre in unique_genres: + st.header(f"Top Movies in {genre}") cols_html = "" - for tmdbId in unique_tmdbIds: - title, poster_url = fetch_movie_info(tmdbId) - movie_title = title.get("title", "Unknown Title") if isinstance(title, dict) else title + displayed_movies = [] # List of already displayed movies, initially empty + top_movies_ids = get_top_movies_by_genre(df, genre, displayed_movies, num_movies) + for tmdbId in top_movies_ids: + title_dict, poster_url = fetch_movie_info(tmdbId) + movie_title = title_dict.get("title", "Unknown Title") if isinstance(title_dict, dict) else title_dict if poster_url: html_file_url = f"http://localhost:8501/{movie_title.replace(' ', '_')}.html" cols_html += f'<div style="display: inline-block; margin-right: 20px;"><a href="{html_file_url}" target="_blank"><img src="{poster_url}" alt="{movie_title}" style="width:150px;height:225px;"></a><div style="color: white; text-decoration: none; font-size: 14px; text-align: center; max-width: 150px; word-wrap: break-word; white-space: normal;"><b>{movie_title}</b></div></a></div>' else: cols_html += f"<p>{movie_title}</p>" - + st.markdown(f""" <div style="overflow-x: scroll; white-space: nowrap; height: 300px; display: flex; flex-direction: row;"> {cols_html} </div> """, unsafe_allow_html=True) - else: - st.write("No recommendations found.") - - - def main(): - st.title('Discover') # Next Access All Content - global df_links + st.sidebar.image("data/Logo_NAAC-removebg-preview.png", use_column_width=True) + st.title('Discover') # Next Access All Content + global df_links + global df_ratings + global df_movies num_columns = 4 - # Sidebar for genre and year selection - selected_genres = st.sidebar.multiselect('Select Genre(s)', genres(merged_df)) - min_range, max_range = calculate_year_range(merged_df) - selected_year = st.sidebar.slider('Select Year Range', min_value=min_range, max_value=max_range, value=(min_range, max_range)) - + # Existing filtering and display filtered_movies = merged_df.copy() - # Here you can add filtering by selected genres and years if needed - # filtered_movies = filter_movies_by_genre(filtered_movies, selected_genres) - # filtered_movies = filter_movies_by_year(filtered_movies, selected_year) - - unique_tags_list = tag(merged_df) - # tag_input = st.sidebar.selectbox("Search by Tag", [""] + sorted(unique_tags_list)) - # if tag_input: - # filtered_movies = filter_movies_by_tag(filtered_movies, [tag_input]) unique_tmdbIds = filtered_movies.drop_duplicates(subset='tmdbId')['tmdbId'].tolist() - display_movies(unique_tmdbIds, num_columns) - - # Display top movies for all genres without repetition - displayed_movies = set() - all_genres = genres(merged_df) - for genre in all_genres: - display_top_movies_by_genre(genre, displayed_movies, num_columns) - -if __name__ == "__main__": - main() - + display_movies_year(filtered_movies) + display_unrated_movies(df_ratings, filtered_movies) -# def filter_movies_by_genre(df, selected_genres): -# if not selected_genres: -# return df -# return df[df['genres'].apply(lambda x: any(genre in 
x for genre in selected_genres))] + display_movies_by_genre(filtered_movies, num_movies=10) -# def filter_movies_by_year(df, selected_year): -# return df[(df['annee'] >= selected_year[0]) & (df['annee'] <= selected_year[1])] - -# def filter_movies_by_tag(df, selected_tags): -# if not selected_tags: -# return df -# df['tag'] = df['tag'].apply(lambda x: str(x).split('|') if pd.notna(x) else []) -# filtered_df = df[df['tag'].apply(lambda tags: any(tag in tags for tag in selected_tags))] -# return filtered_df \ No newline at end of file +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/pages/Search.py b/pages/Search.py new file mode 100644 index 0000000000000000000000000000000000000000..3bba56f47662c4806b1806f9aa5cb0e23771cab7 --- /dev/null +++ b/pages/Search.py @@ -0,0 +1,69 @@ +import streamlit as st +import pandas as pd +from content import genres, calculate_year_range, tag, merged_df, fetch_movie_info, df_links + +def filter_movies_by_title(df, search_title): + if not search_title: + return df + return df[df['title'].str.contains(search_title, case=False)] + + +def filter_movies_by_genre(df, selected_genres): + if not selected_genres: + return df + return df[df['genres'].apply(lambda x: any(genre in x for genre in selected_genres))] + +def filter_movies_by_year(df, selected_year): + return df[(df['annee'] >= selected_year[0]) & (df['annee'] <= selected_year[1])] + +def filter_movies_by_tag(df, selected_tags): + if not selected_tags: + return df + df['tag'] = df['tag'].apply(lambda x: str(x).split('|') if pd.notna(x) else []) + filtered_df = df[df['tag'].apply(lambda tags: any(tag in tags for tag in selected_tags))] + return filtered_df + +def display_movies(unique_tmdbIds, num_columns): + for i in range(0, len(unique_tmdbIds), num_columns): + row_tmdbIds = unique_tmdbIds[i:i+num_columns] + cols = st.columns(num_columns) + for idx, tmdbId in enumerate(row_tmdbIds): + title, poster_url = fetch_movie_info(tmdbId) + if poster_url: + title = title.get("title", "Unknown Title") + html_file_url = f"http://localhost:8501/{title.replace(' ', '_')}.html" + cols[idx].markdown(f'<div style="margin-right: 20px; margin-top: 20px; margin-bottom: 5px;"><a href="{html_file_url}" target="_blank"><img src="{poster_url}" alt="{title}" style="width:150px;height:225px;"></a></div>', unsafe_allow_html=True) + cols[idx].markdown(f'<div style="margin-right: 20px; margin-bottom: 5px; text-align: center; max-width: 150px;"><a href="{html_file_url}" target="_blank" style="color: white; text-decoration: none; font-size: 14px; word-wrap: break-word;"><b>{title}</b></a></div>', unsafe_allow_html=True) + + + +def main(): + st.title('Movie Base') + global df_links + + num_columns = 4 + + # Search by movie title + search_title = st.sidebar.text_input("Search by Movie Title", value="", placeholder="Type to search...", key="search_title") + + # Load filters after title search + selected_genres = st.sidebar.multiselect('Select Genre(s)', genres(merged_df)) + unique_tags_list = tag(merged_df) + tag_input = st.sidebar.selectbox("Search by Tag", [""] + sorted(unique_tags_list)) + min_range, max_range = calculate_year_range(merged_df) + selected_year = st.sidebar.slider('Select Year Range', min_value=min_range, max_value=max_range, value=(min_range, max_range)) + + # Apply filters + filtered_movies = merged_df.copy() + filtered_movies = filter_movies_by_title(filtered_movies, search_title) + filtered_movies = filter_movies_by_genre(filtered_movies, selected_genres) + if tag_input: + filtered_movies = 
filter_movies_by_tag(filtered_movies, [tag_input]) + filtered_movies = filter_movies_by_year(filtered_movies, selected_year) + + unique_tmdbIds = filtered_movies.drop_duplicates(subset='tmdbId')['tmdbId'].tolist() + + display_movies(unique_tmdbIds, num_columns) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/recommender.py b/recommender.py index b4c16f5a814393c6228366c2d91c08de8b3a816f..41aaa708c2c5c3e599a6f61a896b1fd6181c5eb2 100644 --- a/recommender.py +++ b/recommender.py @@ -15,7 +15,6 @@ from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, Ridge from sklearn.metrics import mean_squared_error from sklearn.metrics import mean_absolute_error -from surprise.similarities import Similarity from surprise.prediction_algorithms.knns import KNNWithMeans from sklearn.metrics.pairwise import cosine_similarity @@ -155,24 +154,6 @@ class UserBased(AlgoBase): similarity_matrix[j, i] = similarity self.sim = similarity_matrix - - def compute_cosine(self): - """ - Compute the similarity matrix based on user ratings using cosine similarity. - """ - n_users = self.trainset.n_users - similarity_matrix = np.zeros((n_users, n_users)) - - for i in range(n_users): - for j in range(i + 1, n_users): - support = np.sum(~np.isnan(self.ratings_matrix[i]) & ~np.isnan(self.ratings_matrix[j])) - if support >= self.min_k: - # Calculate cosine similarity - similarity = cosine_similarity([self.ratings_matrix[i]], [self.ratings_matrix[j]])[0, 0] - similarity_matrix[i, j] = similarity - similarity_matrix[j, i] = similarity - - self.sim = similarity_matrix def compute_mean_ratings(self): """ @@ -459,15 +440,6 @@ class RecommenderSystem_KNN : return average_distance - def train_knn_model(self): - """ - Train the KNN model on the ratings data and evaluate its RMSE. - """ - # Compute similarity matrix using cosine similarity - sim_matrix = cosine_similarity(self.trainset._raw2inner_id_items) - self.model.sim = sim_matrix - - def evaluate_knn_rmse_for_different_k(self): """ @@ -638,49 +610,6 @@ class OtherUserBased: inter_user_diversity_score = np.mean(similarities) return inter_user_diversity_score - -########################################################################################################################### -####################################################### CUSTOM METRICS #################################################### -########################################################################################################################### - - -class CustomKNNWithMeans(KNNWithMeans): - def __init__(self, k=40, min_k=1, sim_options={}, **kwargs): - sim_options['user_based'] = True - sim_options['name'] = 'custom' # Nom de la mesure de similarité personnalisée - super().__init__(k=k, min_k=min_k, sim_options=sim_options, **kwargs) - - def fit(self, trainset): - # Ici, vous devez implémenter votre propre calcul de similarité personnalisée - # Par exemple, vous pouvez utiliser une fonction définie par l'utilisateur pour calculer la similarité - self.sim = Similarity() # Remplacez Similarity par votre propre calcul de similarité - super().fit(trainset) - - - -class CustomUserBased(UserBased): - def __init__(self, k=20, min_k=20, sim_options={}, **kwargs): - sim_options['name'] = 'custom' # Nom de la mesure de similarité personnalisée - super().__init__(k=k, min_k=min_k, sim_options=sim_options, **kwargs) - - def compute_similarity_matrix(self): - """ - Calcule la matrice de similarité basée sur les évaluations des utilisateurs. 
- """ - n_users = self.trainset.n_users - similarity_matrix = np.eye(n_users) - - for i in range(n_users): - for j in range(i + 1, n_users): - support = np.sum(~np.isnan(self.ratings_matrix[i]) & ~np.isnan(self.ratings_matrix[j])) - if support >= self.min_k: - intersection = np.sum(~np.isnan(self.ratings_matrix[i]) & ~np.isnan(self.ratings_matrix[j])) - union = np.sum(~np.isnan(self.ratings_matrix[i]) | ~np.isnan(self.ratings_matrix[j])) - similarity = intersection / union - similarity_matrix[i, j] = similarity - similarity_matrix[j, i] = similarity - - self.sim = similarity_matrix ########################################################################################################################### ####################################################### COMPARISON MODEL ################################################## ########################################################################################################################### @@ -789,8 +718,8 @@ def compare_similarity_measures(trainset,testset): return results # # # Example usage: -comparison_results = compare_similarity_measures(trainset ,testset) -print(comparison_results) +# comparison_results = compare_similarity_measures(trainset ,testset) +# print(comparison_results) def evaluate_inter_user_diversity(user_based_model, ratings_path, other_user_based_models, trainset, testset, k, min_k):