diff --git a/Home.py b/Home.py index c70b755e04bdbf471166f607503abeb413215f98..d9754e7c1295b2d599d5ca3fe10b7432f3481afb 100644 --- a/Home.py +++ b/Home.py @@ -1,121 +1,136 @@ -import streamlit as st -import time -import pandas as pd +import streamlit as st # Import Streamlit for building the web application +import pandas as pd # Import pandas for data manipulation + +# Import necessary functions and dataframes from local modules from content import fetch_movie_info, df_links from content import df_audrey, df_adrien, df_nathanael, df_charles -from recommender import OtherUserBased, UserBased, RecommenderSystem_KNN, LatentFactorModel,test_contentbased_class # Importer la classe OtherUserBased +from recommender import OtherUserBased, UserBased, RecommenderSystem_KNN, LatentFactorModel, test_contentbased_class from loaders import load_ratings -import utils as ut - -# Lancer un serveur local : -#cd /Users/adrien/vscodeworkspace/recomsys/sitehtml -#python3 -m http.server 8501 - -# Si vous avez arrété le serveur local, voici les étapes pour le relancer -#lsof -i :8501 -#kill 12345 # en remplaçant 12345 par le PID trouvé -#python3 -m http.server 8501 # relance le serveur local - +import utils as ut # Import custom utilities + +# Instructions for starting a local server: +# Navigate to the project directory +# cd /Users/adrien/vscodeworkspace/recomsys/sitehtml +# Start the local server +# python3 -m http.server 8501 + +# Steps to restart the local server if it was stopped: +# List open files and their PIDs for port 8501 +# lsof -i :8501 +# Kill the process using its PID +# kill 12345 # replace 12345 with the PID found +# Restart the local server +# python3 -m http.server 8501 + +# Function to add a logo to the Streamlit app ut.add_logo() - +# Function to display user's movies in different sections def display_user_movies(df, title, column_name): - st.subheader(title) - # Group by 'tmdbId' and aggregate using max() for each column + st.subheader(title) # Display section title + # Filter and aggregate movies based on the specified column filtered_df = df[df[column_name] > 0].groupby('tmdbId').agg('max').reset_index() - # Sélectionner le nombre de films à afficher en fonction de la colonne + # Select number of movies to display based on the column if column_name == 'top_10': filtered_df = filtered_df.sort_values(by=column_name, ascending=False).head(15) else: filtered_df = filtered_df.sort_values(by=column_name, ascending=False) - tmdbIds = filtered_df['tmdbId'].tolist() + tmdbIds = filtered_df['tmdbId'].tolist() # Get list of tmdbIds cols_html = "" for tmdbId in tmdbIds: - title_dict, poster_url = fetch_movie_info(tmdbId) + title_dict, poster_url = fetch_movie_info(tmdbId) # Fetch movie info using tmdbId movie_title = title_dict.get("title", "Unknown Title") if isinstance(title_dict, dict) else title_dict if poster_url: + # Generate HTML for each movie html_file_url = f"http://localhost:8501/{movie_title.replace(' ', '_')}.html" cols_html += f'<div style="display: inline-block; margin-right: 20px;"><a href="{html_file_url}" target="_blank"><img src="{poster_url}" alt="{movie_title}" style="width:150px;height:225px;"></a><div style="color: white; text-decoration: none; font-size: 14px; text-align: center; max-width: 150px; word-wrap: break-word; white-space: normal;"><b>{movie_title}</b></div></a></div>' else: cols_html += f"<p>{movie_title}</p>" + # Display the movies in a horizontally scrollable div st.markdown(f""" <div style="overflow-x: scroll; white-space: nowrap; height: 300px; display: flex; flex-direction: row;"> {cols_html} </div> """, unsafe_allow_html=True) +# Function to display recommendations using the OtherUserBased algorithm def display_recommendations_tm(user_name, user_id, csv_file): - recommender = OtherUserBased(user_name, user_id) - recommender.load_model() - top_10_predictions = recommender.get_top_n_predictions_for_user(csv_file) + recommender = OtherUserBased(user_name, user_id) # Initialize recommender system + recommender.load_model() # Load the pre-trained model + top_10_predictions = recommender.get_top_n_predictions_for_user(csv_file) # Get top 10 predictions if top_10_predictions is not None: - st.subheader(f"Explore New Favorites") + st.subheader(f"Explore New Favorites") # Display section title cols_html = "" for prediction in top_10_predictions: - item_id, pred_value = prediction # Access each element of the tuple directly + item_id, pred_value = prediction # Unpack prediction tuple tmdbId = df_links.loc[df_links['movieId'] == item_id, 'tmdbId'].values[0] - title_dict, poster_url = fetch_movie_info(tmdbId) + title_dict, poster_url = fetch_movie_info(tmdbId) # Fetch movie info movie_title = title_dict.get("title", "Unknown Title") if isinstance(title_dict, dict) else title_dict if poster_url: + # Generate HTML for each movie html_file_url = f"http://localhost:8501/{movie_title.replace(' ', '_')}.html" cols_html += f'<div style="display: inline-block; margin-right: 20px;"><a href="{html_file_url}" target="_blank"><img src="{poster_url}" alt="{movie_title}" style="width:150px;height:225px;"></a><div style="color: white; text-decoration: none; font-size: 14px; text-align: center; max-width: 150px; word-wrap: break-word; white-space: normal;"><b>{movie_title}</b></div></a></div>' else: cols_html += f"<p>{movie_title}</p>" + # Display the recommendations in a horizontally scrollable div st.markdown(f""" <div style="overflow-x: scroll; white-space: nowrap; height: 300px; display: flex; flex-direction: row;"> {cols_html} </div> """, unsafe_allow_html=True) else: - st.write("No recommendations found.") - + st.write("No recommendations found.") # Display message if no recommendations +# Function to display recommendations using User-Based Collaborative Filtering def display_recommendations_ub(user_name, user_id): - # Charger les données et préparer l'ensemble de données pour l'entraînement et le test + # Load and prepare the dataset for training and testing data = load_ratings(surprise_format=True) trainset = data.build_full_trainset() testset = trainset.build_anti_testset() - # Créer une instance de l'algorithme de filtrage collaboratif basé sur les utilisateurs + # Create and train the User-Based Collaborative Filtering model recommender = UserBased(k=60, min_k=60) recommender.fit(trainset) - top_10_predictions = recommender.get_top_n_pred_ub(testset, user_id) + top_10_predictions = recommender.get_top_n_pred_ub(testset, user_id) # Get top 10 predictions if top_10_predictions is not None: - st.subheader(f"User-Approved Recommendations") + st.subheader(f"User-Approved Recommendations") # Display section title cols_html = "" for prediction in top_10_predictions: - iid, pred = prediction # Access each element of the tuple directly + iid, pred = prediction # Unpack prediction tuple tmdbId = df_links.loc[df_links['movieId'] == iid, 'tmdbId'].values[0] - title_dict, poster_url = fetch_movie_info(tmdbId) + title_dict, poster_url = fetch_movie_info(tmdbId) # Fetch movie info movie_title = title_dict.get("title", "Unknown Title") if isinstance(title_dict, dict) else title_dict if poster_url: + # Generate HTML for each movie html_file_url = f"http://localhost:8501/{movie_title.replace(' ', '_')}.html" cols_html += f'<div style="display: inline-block; margin-right: 20px;"><a href="{html_file_url}" target="_blank"><img src="{poster_url}" alt="{movie_title}" style="width:150px;height:225px;"></a><div style="color: white; text-decoration: none; font-size: 14px; text-align: center; max-width: 150px; word-wrap: break-word; white-space: normal;"><b>{movie_title}</b></div></a></div>' else: cols_html += f"<p>{movie_title}</p>" + # Display the recommendations in a horizontally scrollable div st.markdown(f""" <div style="overflow-x: scroll; white-space: nowrap; height: 300px; display: flex; flex-direction: row;"> {cols_html} </div> """, unsafe_allow_html=True) else: - st.write("No recommendations found.") + st.write("No recommendations found.") # Display message if no recommendations +# Function to display recommendations using the KNN algorithm def display_recommendations_knn(user_name, user_id): - st.subheader(f"Recommended for You") + st.subheader(f"Recommended for You") # Display section title cols_html = "" - # Create an instance of the RecommenderSystem_KNN + # Create and train the KNN-based recommender system recommender = RecommenderSystem_KNN("data/small/evidence/ratings.csv") recommender.train_knn_model() @@ -125,99 +140,104 @@ def display_recommendations_knn(user_name, user_id): if top_10_predictions: for item_id, pred_value in top_10_predictions: tmdbId = df_links.loc[df_links['movieId'] == item_id, 'tmdbId'].values[0] - title_dict, poster_url = fetch_movie_info(tmdbId) + title_dict, poster_url = fetch_movie_info(tmdbId) # Fetch movie info movie_title = title_dict.get("title", "Unknown Title") if isinstance(title_dict, dict) else title_dict if poster_url: + # Generate HTML for each movie html_file_url = f"http://localhost:8501/{movie_title.replace(' ', '_')}.html" cols_html += f'<div style="display: inline-block; margin-right: 20px;"><a href="{html_file_url}" target="_blank"><img src="{poster_url}" alt="{movie_title}" style="width:150px;height:225px;"></a><div style="color: white; text-decoration: none; font-size: 14px; text-align: center; max-width: 150px; word-wrap: break-word; white-space: normal;"><b>{movie_title}</b></div></a></div>' else: cols_html += f"<p>{movie_title}</p>" + # Display the recommendations in a horizontally scrollable div st.markdown(f""" <div style="overflow-x: scroll; white-space: nowrap; height: 300px; display: flex; flex-direction: row;"> {cols_html} </div> """, unsafe_allow_html=True) else: - st.write("No recommendations found.") - + st.write("No recommendations found.") # Display message if no recommendations +# Function to display recommendations using a Latent Factor Model def display_recommendations_latent_factor(user_name, user_id): - st.subheader(f"Unlock Your Next Adventure") + st.subheader(f"Unlock Your Next Adventure") # Display section title cols_html = "" - # Load data + # Load the ratings and movies data ratings = pd.read_csv("data/small/evidence/ratings.csv") movies = pd.read_csv("data/small/content/movies.csv") - # Create an instance of the LatentFactorModel - recommender = LatentFactorModel(num_factors=10, learning_rate=0.01, regularization=0.1, num_epochs=20, user_name = user_name) + # Create and train the Latent Factor Model + recommender = LatentFactorModel(num_factors=10, learning_rate=0.01, regularization=0.1, num_epochs=20, user_name=user_name) recommender.fit(ratings, movies) # Get top 10 recommendations for the user - top_10_predictions = recommender.top_ratings_for_user(user_id,ratings) + top_10_predictions = recommender.top_ratings_for_user(user_id, ratings) if top_10_predictions: for item_id, pred_value in top_10_predictions: tmdbId = df_links.loc[df_links['movieId'] == item_id, 'tmdbId'].values[0] - title_dict, poster_url = fetch_movie_info(tmdbId) + title_dict, poster_url = fetch_movie_info(tmdbId) # Fetch movie info movie_title = title_dict.get("title", "Unknown Title") if isinstance(title_dict, dict) else title_dict if poster_url: + # Generate HTML for each movie html_file_url = f"http://localhost:8501/{movie_title.replace(' ', '_')}.html" cols_html += f'<div style="display: inline-block; margin-right: 20px;"><a href="{html_file_url}" target="_blank"><img src="{poster_url}" alt="{movie_title}" style="width:150px;height:225px;"></a><div style="color: white; text-decoration: none; font-size: 14px; text-align: center; max-width: 150px; word-wrap: break-word; white-space: normal;"><b>{movie_title}</b></div></a></div>' else: cols_html += f"<p>{movie_title}</p>" + # Display the recommendations in a horizontally scrollable div st.markdown(f""" <div style="overflow-x: scroll; white-space: nowrap; height: 300px; display: flex; flex-direction: row;"> {cols_html} </div> """, unsafe_allow_html=True) else: - st.write("No recommendations found.") - - + st.write("No recommendations found.") # Display message if no recommendations +# Function to display content-based recommendations def display_content_based_recommendations(user_name, user_id=-1, n=15): cols_html = "" - # Call the test_contentbased_class function to get top N recommendations + # Get top N recommendations using content-based filtering top_n_recommendations = test_contentbased_class(["title_length", "movie_year", "genre", "avg_rating"], "ridge_regression", user_id=-1, n=10) - if top_n_recommendations: - st.subheader(f"Discover Great Content") + st.subheader(f"Discover Great Content") # Display section title for iid, est in top_n_recommendations: tmdbId = df_links.loc[df_links['movieId'] == iid, 'tmdbId'].values[0] - title_dict, poster_url = fetch_movie_info(tmdbId) + title_dict, poster_url = fetch_movie_info(tmdbId) # Fetch movie info movie_title = title_dict.get("title", "Unknown Title") if isinstance(title_dict, dict) else title_dict if poster_url: + # Generate HTML for each movie html_file_url = f"http://localhost:8501/{movie_title.replace(' ', '_')}.html" cols_html += f'<div style="display: inline-block; margin-right: 20px;"><a href="{html_file_url}" target="_blank"><img src="{poster_url}" alt="{movie_title}" style="width:150px;height:225px;"></a><div style="color: white; text-decoration: none; font-size: 14px; text-align: center; max-width: 150px; word-wrap: break-word; white-space: normal;"><b>{movie_title}</b></div></a></div>' else: cols_html += f"<p>{movie_title}</p>" + # Display the recommendations in a horizontally scrollable div st.markdown(f""" <div style="overflow-x: scroll; white-space: nowrap; height: 300px; display: flex; flex-direction: row;"> {cols_html} </div> """, unsafe_allow_html=True) else: - st.write("No recommendations found.") - + st.write("No recommendations found.") # Display message if no recommendations +# Main function to run the Streamlit app def main(): - global df_links + global df_links # Access the global df_links dataframe - num_columns = 4 + num_columns = 4 # Number of columns to display # Default user selection selected_user = "Adrien" - # User selection with default value + # User selection with default value using a dropdown menu selected_user = st.sidebar.selectbox("Select User", ["Audrey", "Adrien", "Nathanael", "Charles"], index=1) if selected_user != "": + # Map selected user to their respective dataframes user_options = { "Audrey": df_audrey, "Adrien": df_adrien, @@ -225,6 +245,7 @@ def main(): "Charles": df_charles } + # Map selected user to their respective user IDs user_id_options = { "Audrey": -2, "Adrien": -1, @@ -232,29 +253,28 @@ def main(): "Charles": -4 } - st.title(f'{selected_user}') + st.title(f'{selected_user}') # Display the selected user's name as the title - # Chemin vers le fichier CSV contenant les données des films - csv_file = "data/small/evidence/ratings.csv" + # Path to the CSV file containing movie data + csv_file = "data/small/evidence/ratings.csv" if selected_user: - user_df = user_options[selected_user] + user_df = user_options[selected_user] # Get the user's dataframe + # Display content-based recommendations display_content_based_recommendations(selected_user, user_id_options[selected_user], n=10) - + # Display latent factor model recommendations display_recommendations_latent_factor(selected_user, user_id_options[selected_user]) - # Afficher les recommandations basées sur l'algorithme OtherUserBased - + # Display OtherUserBased recommendations display_recommendations_tm(selected_user, user_id_options[selected_user], csv_file) - # Afficher les recommandations basées sur l'algorithme UserBased + # Display UserBased recommendations display_recommendations_ub(selected_user, user_id_options[selected_user]) - + # Display KNN-based recommendations display_recommendations_knn(selected_user, user_id_options[selected_user]) - + # Prepare the dataframe for displaying user-specific sections dataframe_links = df_links.copy() user_df['movieId'] = user_df['movieId'].astype(int) - user_df_sorted = user_df.sort_values(by='movieId') dataframe_links['movieId'] = dataframe_links['movieId'].astype(int) user_columns = ['movieId', 'top_10', 'n_watched', 'wishlist', 'recent'] @@ -265,13 +285,12 @@ def main(): dataframe_links[col] = 0 dataframe_links[col] = dataframe_links[col].fillna(0).astype(int) - # Afficher les sections + # Display different sections for the user's movies display_user_movies(dataframe_links, "Top 10", 'top_10') display_user_movies(dataframe_links, "Most Viewed", 'n_watched') display_user_movies(dataframe_links, "WishList", 'wishlist') display_user_movies(dataframe_links, "See Again", 'recent') - +# Entry point to run the main function when the script is executed if __name__ == "__main__": - - main() \ No newline at end of file + main() diff --git a/backend/evaluator.py b/backend/evaluator.py deleted file mode 100644 index ed795c4eea7cf54ee3b2c87e5020eedde3a8c7b9..0000000000000000000000000000000000000000 --- a/backend/evaluator.py +++ /dev/null @@ -1,209 +0,0 @@ -# imports -import numpy as np -import pandas as pd - -# local imports -from configs import EvalConfig -from constants import Constant as C -from loaders import export_evaluation_report -from loaders import load_ratings - -# New imports -from surprise.model_selection import train_test_split -from surprise import accuracy -from surprise.model_selection import LeaveOneOut - - -# -- implement the function generate_split_predictions -- -def generate_split_predictions(algo, ratings_dataset, eval_config): - """Generate predictions on a random test set specified in eval_config""" - - # Spliting the data into train and test sets - trainset, testset = train_test_split(ratings_dataset, test_size=eval_config.test_size) - - # Training the algorithm on the train data set - algo.fit(trainset) - - # Predict ratings for the testset - predictions = algo.test(testset) - - return predictions - -# -- implement the function generate_loo_top_n -- -def generate_loo_top_n(algo, ratings_dataset, eval_config): - """Generate top-n recommendations for each user on a random Leave-one-out split (LOO)""" - - # Create a LeaveOneOut split - loo = LeaveOneOut(n_splits=1) - - for trainset, testset in loo.split(ratings_dataset): - algo.fit(trainset) # Train the algorithm on the training set - anti_testset = trainset.build_anti_testset() # Build the anti test-set - predictions = algo.test(anti_testset) # Get predictions on the anti test-set - top_n = {} - for uid, iid, _, est, _ in predictions: - if uid not in top_n: - top_n[uid] = [] - top_n[uid].append((iid, est)) - for uid, user_ratings in top_n.items(): - user_ratings.sort(key=lambda x: x[1], reverse=True) - top_n[uid] = user_ratings[:eval_config.top_n_value] # Get top-N recommendations - anti_testset_top_n = top_n - return anti_testset_top_n, testset - -def generate_full_top_n(algo, ratings_dataset, eval_config): - """Generate top-n recommendations for each user with full training set (LOO)""" - - full_trainset = ratings_dataset.build_full_trainset() # Build the full training set - algo.fit(full_trainset) # Train the algorithm on the full training set - anti_testset = full_trainset.build_anti_testset() # Build the anti test-set - predictions = algo.test(anti_testset) # Get predictions on the anti test-set - top_n = {} - for uid, iid, _, est, _ in predictions: - if uid not in top_n: - top_n[uid] = [] - top_n[uid].append((iid, est)) - for uid, user_ratings in top_n.items(): - user_ratings.sort(key=lambda x: x[1], reverse=True) - top_n[uid] = user_ratings[:eval_config.top_n_value] # Get top-N recommendations - anti_testset_top_n = top_n - return anti_testset_top_n - -def precomputed_information(movie_data): - - """ Returns a dictionary that precomputes relevant information for evaluating in full mode - - Dictionary keys: - - precomputed_dict["item_to_rank"] : contains a dictionary mapping movie ids to rankings - - (-- for your project, add other relevant information here -- ) - """ - - # Initialize an empty dictionary to store item_id to rank mapping - item_to_rank = {} - - # Calculate popularity rank for each movie - ratings_count = movie_data.groupby('movieId').size().sort_values(ascending=False) - - # Assign ranks to movies based on their popularity - for rank, (movie_id, _) in enumerate(ratings_count.items(), start=1): - item_to_rank[movie_id] = rank - - # Create the precomputed dictionary - precomputed_dict = {} - precomputed_dict["item_to_rank"] = item_to_rank - - return precomputed_dict - -def create_evaluation_report(eval_config, sp_ratings, precomputed_dict, available_metrics): - """ Create a DataFrame evaluating various models on metrics specified in an evaluation config. """ - - evaluation_dict = {} - for model_name, model, arguments in eval_config.models: - features_method = arguments.get("features_method", "Unknown features") - regressor_method = arguments.get("regressor_method", "Unknown regressor") - print(f'\nHandling model {model_name} with features: {features_method} and regressor: {regressor_method}') - algo = model(**arguments) - evaluation_dict[model_name] = { - "features_method": features_method, - "regressor_method": regressor_method - } - - # Type 1 : split evaluations - if len(eval_config.split_metrics) > 0: - print('Training split predictions') - predictions = generate_split_predictions(algo, sp_ratings, eval_config) - for metric in eval_config.split_metrics: - print(f'- computing metric {metric}') - assert metric in available_metrics['split'] - evaluation_function, parameters = available_metrics["split"][metric] - evaluation_dict[model_name][metric] = evaluation_function(predictions, **parameters) - - # Type 2 : loo evaluations - if len(eval_config.loo_metrics) > 0: - print('Training loo predictions') - anti_testset_top_n, testset = generate_loo_top_n(algo, sp_ratings, eval_config) - for metric in eval_config.loo_metrics: - assert metric in available_metrics['loo'] - evaluation_function, parameters = available_metrics["loo"][metric] - evaluation_dict[model_name][metric] = evaluation_function(anti_testset_top_n, testset, **parameters) - - # Type 3 : full evaluations - if len(eval_config.full_metrics) > 0: - print('Training full predictions') - anti_testset_top_n = generate_full_top_n(algo, sp_ratings, eval_config) - for metric in eval_config.full_metrics: - assert metric in available_metrics['full'] - evaluation_function, parameters = available_metrics["full"][metric] - evaluation_dict[model_name][metric] = evaluation_function( - anti_testset_top_n, - **precomputed_dict, - **parameters - ) - - # Convert to DataFrame - evaluation_df = pd.DataFrame.from_dict(evaluation_dict).T - - # Reorganize columns - method_columns = ['features_method', 'regressor_method'] - metric_columns = [col for col in evaluation_df.columns if col not in method_columns] - evaluation_df = evaluation_df[method_columns + metric_columns] - - return evaluation_df - - -# -- implement the function get_hit_rate -- -def get_hit_rate(anti_testset_top_n, testset): - - """Compute the average hit over the users (loo metric) - - A hit (1) happens when the movie in the testset has been picked by the top-n recommender - A fail (0) happens when the movie in the testset has not been picked by the top-n recommender - """ - - hits = 0 - total_users = len(testset) - for uid, true_iid, _ in testset: - if uid in anti_testset_top_n and true_iid in {iid for iid, _ in anti_testset_top_n[uid]}: - hits += 1 - hit_rate = hits / total_users - - return hit_rate - -# -- implement the function get_novelty -- -def get_novelty(anti_testset_top_n, item_to_rank): - - """Compute the average novelty of the top-n recommendation over the users (full metric) - - The novelty is defined as the average ranking of the movies recommended - """ - - total_rank_sum = 0 - total_recommendations = 0 - for uid, recommendations in anti_testset_top_n.items(): - for iid, _ in recommendations: - if iid in item_to_rank: - total_rank_sum += item_to_rank[iid] - total_recommendations += 1 - if total_recommendations == 0: - return 0 # Avoid division by zero - average_rank_sum = total_rank_sum / total_recommendations - - return average_rank_sum - -AVAILABLE_METRICS = { - "split": { - "mae": (accuracy.mae, {'verbose': False}), - "rmse": (accuracy.rmse, {'verbose': False}) - }, - "loo": { - "hit_rate": (get_hit_rate, {}), - }, - "full": { - "novelty": (get_novelty, {}), - } -} - -sp_ratings = load_ratings(surprise_format=True) -precomputed_dict = precomputed_information(load_ratings(False)) -evaluation_report = create_evaluation_report(EvalConfig, sp_ratings, precomputed_dict, AVAILABLE_METRICS) -export_evaluation_report(evaluation_report) \ No newline at end of file diff --git a/backend/website.py b/backend/website.py index 2467f4727ad9cb31047f3404b2d1f2b2243a6a41..ee84ba86fb9ec45009b4331c9d27eb0db534d181 100644 --- a/backend/website.py +++ b/backend/website.py @@ -2,26 +2,33 @@ import pandas as pd import requests import streamlit as st -# Charger les données depuis les fichiers CSV +# Load movie data from CSV files movies_data = pd.read_csv("data/small/content/movies.csv") links_data = pd.read_csv("data/small/content/links.csv") -# Fusionner les données en fonction du movieId +# Merge movie data with links data based on movieId merged_data = pd.merge(movies_data, links_data, on='movieId') def fetch_movie_info(tmdbId): - api_key = '430f4a0d247540e52815bb6639b3cb04' # Remplace par ta clé d'API TMDB + """ + Fetch movie information from The Movie Database (TMDB) using the TMDB ID. + + Args: + tmdbId (int): The TMDB ID of the movie. + + Returns: + dict: A dictionary containing the movie's information, or None if an error occurs. + """ + api_key = '430f4a0d247540e52815bb6639b3cb04' # Replace with your TMDB API key base_url = "https://api.themoviedb.org/3" - # URL pour les détails du film + # URLs for movie details, credits, and keywords movie_url = f"{base_url}/movie/{tmdbId}?api_key={api_key}" - # URL pour les crédits (acteurs) credits_url = f"{base_url}/movie/{tmdbId}/credits?api_key={api_key}" - # URL pour les mots-clés keywords_url = f"{base_url}/movie/{tmdbId}/keywords?api_key={api_key}" try: - # Requête pour les détails du film + # Fetch movie details movie_response = requests.get(movie_url) if movie_response.status_code == 200: movie_data = movie_response.json() @@ -30,39 +37,36 @@ def fetch_movie_info(tmdbId): overview = movie_data.get('overview') genres = [genre['name'] for genre in movie_data.get('genres', [])] poster_path = movie_data.get('poster_path') - if poster_path: - poster_url = f"https://image.tmdb.org/t/p/original{poster_path}" - else: - poster_url = None + poster_url = f"https://image.tmdb.org/t/p/original{poster_path}" if poster_path else None runtime = movie_data.get('runtime') hours = runtime // 60 minutes = runtime % 60 - user_rating = int(movie_data.get('vote_average') * 10) # Conversion en pourcentage + user_rating = int(movie_data.get('vote_average') * 10) # Convert to percentage release_year = release_date.split("-")[0] if release_date else "N/A" else: - print(f"Erreur : statut de réponse {movie_response.status_code}") + print(f"Error: Response status {movie_response.status_code}") return None - # Requête pour les crédits (acteurs) + # Fetch movie credits (actors) credits_response = requests.get(credits_url) if credits_response.status_code == 200: credits_data = credits_response.json() - actors = [actor['name'] for actor in credits_data.get('cast', [])[:10]] # Limité aux 10 premiers acteurs + actors = [actor['name'] for actor in credits_data.get('cast', [])[:10]] # Limit to top 10 actors else: - print(f"Erreur : statut de réponse {credits_response.status_code}") + print(f"Error: Response status {credits_response.status_code}") return None - # Requête pour les mots-clés + # Fetch movie keywords keywords_response = requests.get(keywords_url) if keywords_response.status_code == 200: keywords_data = keywords_response.json() keywords = [keyword['name'] for keyword in keywords_data.get('keywords', [])] else: - print(f"Erreur : statut de réponse {keywords_response.status_code}") + print(f"Error: Response status {keywords_response.status_code}") return None - # Retourner les informations combinées + # Return combined movie information return { "title": title, "release_date": release_date, @@ -76,22 +80,20 @@ def fetch_movie_info(tmdbId): "keywords": keywords } except requests.exceptions.RequestException as e: - print(f"Erreur de connexion : {e}") + print(f"Connection error: {e}") return None - -# Parcourir les identifiants movieId +# Loop through movie IDs in the merged data for movieId in merged_data['movieId']: - # Récupérer l'ID TMDB correspondant au movieId + # Get the corresponding TMDB ID for the movieId tmdbId_row = merged_data.loc[merged_data['movieId'] == movieId] tmdbId = tmdbId_row.iloc[0]['tmdbId'] - # Essayer de récupérer les informations du film + # Try to fetch the movie information try: - # Récupérer les informations complètes du film movie_info = fetch_movie_info(tmdbId) - # Si les informations du film sont récupérées avec succès + # If movie information is successfully retrieved if movie_info: title = movie_info["title"] release_date = movie_info["release_date"] @@ -104,10 +106,10 @@ for movieId in merged_data['movieId']: actors = ", ".join(movie_info["actors"]) keywords = ", ".join(movie_info["keywords"]) - # Remplacer les espaces par des underscores dans le titre + # Replace spaces with underscores in the title for safe file naming safe_title = title.replace(" ", "_") - # Générer le contenu HTML pour chaque film + # Generate HTML content for each movie film_detail_html = f""" <!DOCTYPE html> <html lang="fr"> @@ -119,19 +121,19 @@ for movieId in merged_data['movieId']: <style> body {{ font-family: 'Roboto', sans-serif; - background-color: #121212; /* Couleur de fond sombre */ - color: #fff; /* Couleur du texte blanc */ + background-color: #121212; /* Dark background color */ + color: #fff; /* White text color */ margin: 0; padding: 20px; }} h1 {{ - color: #2980b9; /* Titre en blanc */ + color: #2980b9; /* Title color */ border-bottom: 2px solid #ffffff; padding-bottom: 10px; text-align: center; }} .movie-container {{ - background-color: #2b2b2b; /* Couleur de fond du conteneur */ + background-color: #2b2b2b; /* Container background color */ padding: 20px; border-radius: 8px; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); @@ -149,7 +151,7 @@ for movieId in merged_data['movieId']: margin-top: 20px; }} .movie-details h2 {{ - color: #2980b9; /* Couleur du titre */ + color: #2980b9; /* Section title color */ margin-bottom: 10px; border-bottom: 1px solid #eee; padding-bottom: 5px; @@ -193,19 +195,19 @@ for movieId in merged_data['movieId']: <h2>Genres</h2> <p class="genres">{genres}</p> - <h2>Durée</h2> + <h2>Runtime</h2> <p class="runtime">{runtime}</p> - <h2>Année de sortie</h2> + <h2>Release Year</h2> <p class="release_year">{release_year}</p> - <h2>Évaluation des utilisateurs</h2> + <h2>User Rating</h2> <p class="user_rating">{user_rating}</p> - <h2>Acteurs</h2> + <h2>Actors</h2> <p class="actors">{actors}</p> - <h2>Mots-clés</h2> + <h2>Keywords</h2> <p class="keywords">{keywords}</p> </div> </div> @@ -213,13 +215,13 @@ for movieId in merged_data['movieId']: </html> """ - # Écrire le contenu HTML dans un fichier + # Write the HTML content to a file with open(f"sitehtml/{safe_title}.html", "w") as file: file.write(film_detail_html) else: - # Si les informations du film ne sont pas trouvées, continuer avec le prochain movieId + # If movie information is not found, continue with the next movieId continue except Exception as e: - st.error(f"Erreur lors du traitement du film avec ID {movieId}: {e}") + st.error(f"Error processing movie with ID {movieId}: {e}") continue diff --git a/configs.py b/configs.py index f9de902be256cac165fb82a3b3598454af185830..d9fef32ed494a44483ffabd1faa5d969dc00f6ad 100644 --- a/configs.py +++ b/configs.py @@ -1,37 +1,39 @@ from models import * from itertools import combinations -# Méthodes de caractéristiques disponibles +# Available feature extraction methods features_methods = [ 'genre', 'movie_year', 'avg_rating', 'title_length' ] -# Méthodes de régression disponibles +# Available regression methods regressor_methods = [ 'linear_regression','random_forest', 'lasso_regression','gradient_boosting', 'ridge_regression', 'svr_regression' ] -# Générer toutes les combinaisons possibles de méthodes de caractéristiques +# Generate all possible combinations of feature extraction methods feature_combinations = [] for r in range(1, len(features_methods) + 1): feature_combinations.extend(combinations(features_methods, r)) -# Générer toutes les combinaisons possibles de méthodes de régression et de caractéristiques +# Generate all possible combinations of regression methods and feature extraction methods model_combinations = [] for feature_set in feature_combinations: for regressor in regressor_methods: + # Create a unique model name for the combination model_name = f"combination_{regressor}_{'_'.join(feature_set)}" + # Define the arguments to be passed to the model arguments = { "features_method": list(feature_set), "regressor_method": regressor } + # Append the combination to the list model_combinations.append((model_name, ContentBased, arguments)) class EvalConfig: - """Configuration settings for evaluation.""" # List of models to evaluate, each tuple containing model_name, model class, and model parameters (dict) @@ -40,33 +42,14 @@ class EvalConfig: ("baseline_2", ModelBaseline2, {}), ("baseline_3", ModelBaseline3, {}), ("baseline_4", ModelBaseline4, {}), - ("1", ContentBased, {"features_method": ['movie_year', 'avg_rating'], "regressor_method": 'linear_regression'}), - # ("2", ContentBased, {"features_method": ['genre', 'movie_year', 'avg_rating'], "regressor_method": 'gradient_boosting'}), - # ("3", ContentBased, {"features_method": ['avg_rating'], "regressor_method": 'gradient_boosting'}), - # ("4", ContentBased, {"features_method": ['avg_rating'], "regressor_method": 'lasso_regression'}), - # ("5", ContentBased, {"features_method": ['genre'], "regressor_method": 'random_forest'}), - # ("6", ContentBased, {"features_method": ['genre'], "regressor_method": 'lasso_regression'}), - # ("7", ContentBased, {"features_method": ['avg_rating', 'title_length'], "regressor_method": 'ridge_regression'}), - # ("8", ContentBased, {"features_method": ['avg_rating'], "regressor_method": 'svr_regression'}), - # ("9", ContentBased, {"features_method": ['genre', 'movie_year', 'title_length'], "regressor_method": 'gradient_boosting'}), - # ("10", ContentBased, {"features_method": ['genre', 'title_length'], "regressor_method": 'svr_regression'}), - # ("11", ContentBased, {"features_method": ['genre', 'avg_rating', 'title_length'], "regressor_method": 'linear_regression'}), - # ("12", ContentBased, {"features_method": ['genre', 'avg_rating'], "regressor_method": 'linear_regression'}), - # ("13", ContentBased, {"features_method": ['genre', 'avg_rating', 'title_length'], "regressor_method": 'knn_regression'}), - # ("14", ContentBased, {"features_method": ['genre', 'movie_year', 'avg_rating'], "regressor_method": 'xgboost'}), - # ("15", ContentBased, {"features_method": ['genre', 'title_length'], "regressor_method": 'decision_tree'}), - # ("16", ContentBased, {"features_method": ['title_length'], "regressor_method": 'random_forest'}), - # ("17", ContentBased, {"features_method": ['genre', 'title_length'], "regressor_method": 'gradient_boosting'}), - # ("18", ContentBased, {"features_method": ['movie_year', 'title_length'], "regressor_method": 'lightgbm'}), - # ("19", ContentBased, {"features_method": ['avg_rating', 'title_length'], "regressor_method": 'decision_tree'}) ] - # # Ajouter les combinaisons de ContentBased à la liste des modèles - # models.extend(model_combinations) + # Add the combinations of ContentBased models to the list of models + models.extend(model_combinations) - # # Affichage des modèles pour vérification - # for model in models: - # print(model) + # Print the models for verification + for model in models: + print(model) # Metrics to compute for split evaluation split_metrics = ["mae", "rmse"] @@ -78,29 +61,7 @@ class EvalConfig: full_metrics = ["novelty"] # Split parameters - test_size = 0.25 # -- configure the test_size (from 0 to 1) -- + test_size = 0.25 # Configure the test size (from 0 to 1) # Loo parameters - top_n_value = 10 # -- configure the numer of recommendations (> 1) -- - - - - # #("1", ContentBased, {"features_method": ['movie_year', 'avg_rating', 'genre'], "regressor_method":'linear_regression'}), - # ("2", ContentBased, {"features_method": ['movie_year', 'avg_rating', 'genre'], "regressor_method":'random_forest'}) - - # #("3", ContentBased, {"features_method": ['movie_year', 'avg_rating', 'genre'], "regressor_method":'lasso_regression'}), - # #("4", ContentBased, {"features_method": ['movie_year', 'avg_rating', 'genre'], "regressor_method":'elastic_net'}), - # # ("2", ContentBased, {"features_method": ['genre', 'avg_rating'], "regressor_method":'ridge_regression'}), - # # ("3", ContentBased, {"features_method":['movie_year', 'avg_rating', 'title_length'], "regressor_method":'lasso_regression'}), - # # ("4", ContentBased, {"features_method":['title_length'], "regressor_method":'random_forest'}), - # # ("5", ContentBased, {"features_method":['genre', 'title_length'], "regressor_method":'lasso_regression'}), - # # ("6", ContentBased, {"features_method":['genre', 'title_length'], "regressor_method":'linear_regression'}), - # # ("7", ContentBased, {"features_method":['genre', 'avg_rating'], "regressor_method":'lasso_regression'}), - # # ("8", ContentBased, {"features_method":['avg_rating', 'title_length'], "regressor_method":'adaboost'}), - # # ("9", ContentBased, {"features_method":['genre', 'movie_year', 'avg_rating'], "regressor_method":'decision_tree'}), - # # ("10", ContentBased, {"features_method":['genre', 'movie_year'], "regressor_method":'decision_tree'}), - # # ("11", ContentBased, {"features_method":['genre', 'movie_year', 'avg_rating'], "regressor_method":'elastic_net'}), - # # ("12", ContentBased, {"features_method":['movie_year', 'avg_rating', 'title_length'], "regressor_method":'elastic_net'}) - - - + top_n_value = 10 # Configure the number of recommendations (> 1) diff --git a/constants.py b/constants.py index 6fc4c069e4d21aa0045c18dc69f927a690cf87a4..78aae412f22acb5b5f50ef7702240fdf21941e7b 100644 --- a/constants.py +++ b/constants.py @@ -1,4 +1,4 @@ -# third parties imports +# imports from pathlib import Path diff --git a/content.py b/content.py index 73da7516e41cf402378a43b60274184fa9c35375..e35004ff9ef1d23cdb1d2f971bb2bd71a8154eb3 100644 --- a/content.py +++ b/content.py @@ -1,56 +1,43 @@ -############### ALL THE IMPORT ############### +############### ALL THE IMPORTS ############### -# Import import pandas as pd +import requests # Import requests for making HTTP requests -# Standard library imports -import numpy as np -import pandas as pd -import requests - - -# Local imports -from constants import Constant as C - -############### IMPORT THE DATAFRAMES AND TRAINSETS ############### - -# import all the dataframe -df_links = pd.read_csv("data/small/content/links.csv") -df_movies = pd.read_csv("data/small/content/movies.csv") -df_tags = pd.read_csv('data/small/content/tags.csv') -df_ratings = pd.read_csv('data/small/evidence/ratings.csv') +############### IMPORT THE DATAFRAMES AND TRAINSETS ############### -# Data set from each user -df_audrey = pd.read_csv('data/small/users/library_audrey.csv', dtype={'movieId': str}) # Supprimer les points de la colonne 'movieId' et convertir les valeurs en entiers -df_audrey['movieId'] = df_audrey['movieId'].str.replace('.', '').astype(int) # Afficher les premières lignes du DataFrame pour vérifier les modifications -df_adrien = pd.read_csv('data/small/users/library_adrien.csv', dtype={'movieId': str}) # Supprimer les points de la colonne 'movieId' et convertir les valeurs en entiers -df_adrien['movieId'] = df_adrien['movieId'].str.replace('.', '').astype(int) # Afficher les premières lignes du DataFrame pour vérifier les modifications -df_nathanael = pd.read_csv('data/small/users/library_nathanael.csv', dtype={'movieId': str}) # Supprimer les points de la colonne 'movieId' et convertir les valeurs en entiers -df_nathanael['movieId'] = df_nathanael['movieId'].str.replace('.', '').astype(int) # Afficher les premières lignes du DataFrame pour vérifier les modifications -df_charles = pd.read_csv('data/small/users/library_charles.csv', dtype={'movieId': str}) # Supprimer les points de la colonne 'movieId' et convertir les valeurs en entiers -df_charles['movieId'] = df_charles['movieId'].str.replace(' ', '').astype(int) # Afficher les premières lignes du DataFrame pour vérifier les modifications +# Import all the dataframes +df_links = pd.read_csv("data/small/content/links.csv") # Load links data +df_movies = pd.read_csv("data/small/content/movies.csv") # Load movies data +df_tags = pd.read_csv('data/small/content/tags.csv') # Load tags data +df_ratings = pd.read_csv('data/small/evidence/ratings.csv') # Load ratings data - -# Fonction pour récupérer l'URL du poster d'un film à partir de son tmdbId +# Import datasets for each user and clean 'movieId' column +df_audrey = pd.read_csv('data/small/users/library_audrey.csv', dtype={'movieId': str}) +df_audrey['movieId'] = df_audrey['movieId'].str.replace('.', '').astype(int) +df_adrien = pd.read_csv('data/small/users/library_adrien.csv', dtype={'movieId': str}) +df_adrien['movieId'] = df_adrien['movieId'].str.replace('.', '').astype(int) +df_nathanael = pd.read_csv('data/small/users/library_nathanael.csv', dtype={'movieId': str}) +df_nathanael['movieId'] = df_nathanael['movieId'].str.replace('.', '').astype(int) +df_charles = pd.read_csv('data/small/users/library_charles.csv', dtype={'movieId': str}) +df_charles['movieId'] = df_charles['movieId'].str.replace(' ', '').astype(int) ############### ACCESS API ############### -# Fonction pour récupérer l'URL du poster d'un film à partir de son tmdbId - +# Function to fetch movie info from TMDB API using tmdbId def fetch_movie_info(tmdbId): - api_key = '430f4a0d247540e52815bb6639b3cb04' # Remplace par ta clé d'API TMDB + api_key = '430f4a0d247540e52815bb6639b3cb04' # Replace with your TMDB API key base_url = "https://api.themoviedb.org/3" - # URL pour les détails du film + # URL for movie details movie_url = f"{base_url}/movie/{tmdbId}?api_key={api_key}" - # URL pour les crédits (acteurs) + # URL for movie credits (actors) credits_url = f"{base_url}/movie/{tmdbId}/credits?api_key={api_key}" - # URL pour les mots-clés + # URL for movie keywords keywords_url = f"{base_url}/movie/{tmdbId}/keywords?api_key={api_key}" try: - # Requête pour les détails du film + # Request movie details movie_response = requests.get(movie_url) if movie_response.status_code == 200: movie_data = movie_response.json() @@ -59,39 +46,35 @@ def fetch_movie_info(tmdbId): overview = movie_data.get('overview') genres = [genre['name'] for genre in movie_data.get('genres', [])] poster_path = movie_data.get('poster_path') - if poster_path: - poster_url = f"https://image.tmdb.org/t/p/original{poster_path}" - else: - poster_url = None - + poster_url = f"https://image.tmdb.org/t/p/original{poster_path}" if poster_path else None runtime = movie_data.get('runtime') hours = runtime // 60 minutes = runtime % 60 - user_rating = int(movie_data.get('vote_average') * 10) # Conversion en pourcentage + user_rating = int(movie_data.get('vote_average') * 10) # Convert to percentage release_year = release_date.split("-")[0] if release_date else "N/A" else: - print(f"Erreur : statut de réponse {movie_response.status_code}") - return None, None # Modification pour renvoyer None, None en cas d'erreur + print(f"Error: Response status {movie_response.status_code}") + return None, None - # Requête pour les crédits (acteurs) + # Request movie credits credits_response = requests.get(credits_url) if credits_response.status_code == 200: credits_data = credits_response.json() - actors = [actor['name'] for actor in credits_data.get('cast', [])[:10]] # Limité aux 10 premiers acteurs + actors = [actor['name'] for actor in credits_data.get('cast', [])[:10]] # Limit to top 10 actors else: - print(f"Erreur : statut de réponse {credits_response.status_code}") - return None, None # Modification pour renvoyer None, None en cas d'erreur + print(f"Error: Response status {credits_response.status_code}") + return None, None - # Requête pour les mots-clés + # Request movie keywords keywords_response = requests.get(keywords_url) if keywords_response.status_code == 200: keywords_data = keywords_response.json() keywords = [keyword['name'] for keyword in keywords_data.get('keywords', [])] else: - print(f"Erreur : statut de réponse {keywords_response.status_code}") - return None, None # Modification pour renvoyer None, None en cas d'erreur + print(f"Error: Response status {keywords_response.status_code}") + return None, None - # Retourner les informations combinées + # Return combined movie info return { "title": title, "release_date": release_date, @@ -103,85 +86,61 @@ def fetch_movie_info(tmdbId): "poster_url": poster_url, "actors": actors, "keywords": keywords - }, poster_url # Modification pour renvoyer également l'URL du poster + }, poster_url except requests.exceptions.RequestException as e: - print(f"Erreur de connexion : {e}") - return None, None # Modification pour renvoyer None, None en cas d'erreur + print(f"Connection error: {e}") + return None, None +############### EXTRACT THE CONTENT ############### - -############### EXTRACT THE CONTENT ############### -# merge all the files in one +# Function to merge multiple CSV files into one DataFrame def merge_csv_files(file1, file2, file3, file4): - # Charger les fichiers CSV dans des DataFrames + # Load the CSV files into DataFrames df1 = pd.read_csv(file1) df2 = pd.read_csv(file2) df3 = pd.read_csv(file3) df4 = pd.read_csv(file4) - # Fusionner les DataFrames en utilisant la méthode merge de pandas + # Merge the DataFrames using the 'movieId' column merged_df = pd.merge(df1, df2, on='movieId', how='outer') merged_df = pd.merge(merged_df, df3, on='movieId', how='outer') merged_df = pd.merge(merged_df, df4, on='movieId', how='outer') - # Écrire le résultat fusionné dans un nouveau fichier CSV + # Return the merged DataFrame return merged_df -# Exemple d'utilisation de la fonction merge_csv_files +# Example of using the merge_csv_files function merged_df = merge_csv_files("data/small/content/movies.csv", 'data/small/content/links.csv', 'data/small/content/tags.csv', 'data/small/evidence/ratings.csv') - - +# Function to calculate the range of years in the dataset def calculate_year_range(df): - # Extracting the year from the 'title' column + # Extract the year from the 'title' column df['annee'] = df['title'].str.extract(r'\((.{4})\)') df['annee'] = pd.to_numeric(df['annee'], errors='coerce') - # Calculating the minimum and maximum year range + # Calculate the minimum and maximum year range min_range = int(df['annee'].min()) max_range = int(df['annee'].max()) return min_range, max_range +df_movies = pd.read_csv("data/small/content/movies.csv") # Reload movies data +nombre_films_uniques = merged_df['movieId'].nunique() # Calculate the number of unique movies -df_movies = pd.read_csv("data/small/content/movies.csv") -nombre_films_uniques = merged_df['movieId'].nunique() -print("Nombre de films uniques :", nombre_films_uniques) - - -min_range, max_range = calculate_year_range(merged_df) +min_range, max_range = calculate_year_range(merged_df) # Calculate the year range +# Function to get the top movies by genre def get_top_movies_by_genre(df, genre, displayed_movies, num_movies=10): genre_df = df[df['genres'].apply(lambda x: genre in x) & ~df['tmdbId'].isin(displayed_movies)] top_movies = genre_df.drop_duplicates(subset='tmdbId').sort_values(by='rating', ascending=False).head(num_movies)['tmdbId'] return top_movies -# Year +# Function to extract year from movie titles and sort def year(merged_df, ascending=True): - # Extraire l'année du titre du film à l'aide d'une expression régulière merged_df['annee'] = merged_df['title'].str.extract(r'\((.{4})\)') - - # Trier les années en fonction de l'ordre spécifié par 'ascending' merged_df = merged_df.sort_values(by='annee', ascending=ascending) return merged_df['annee'] -# Movies Rnrated -def unrated_movies_count(df_ratings, df_movies): - rated_movies = df_ratings['movieId'].unique() if 'movieId' in df_ratings.columns else [] - unrated_movies_count = df_movies[~df_movies.index.isin(rated_movies)].shape[0] - print("Number of movies that were not rated at all:", unrated_movies_count) - -unrated_movies_count(df_ratings, df_movies) - -def unrated_movies_count_merged_df(df_ratings, merged_df): - rated_movies = df_ratings['movieId'].unique() if 'movieId' in df_ratings.columns else [] - unrated_movies_count = merged_df[~merged_df['movieId'].isin(rated_movies)].shape[0] - print("Nombre de films qui n'ont pas encore été évalués :", unrated_movies_count) - -unrated_movies_count_merged_df(df_ratings, merged_df) - - -# LIST OF MOVIE GENRES -# Fonction pour nettoyer les genres +# Function to list unique movie genres def genres(merged_df): def convert_to_str(x): if isinstance(x, list): @@ -197,9 +156,9 @@ def genres(merged_df): unique_genres = sorted(merged_df[merged_df['genres'] != '(no genres listed)']['genres'].unique()) return unique_genres - +# Function to list unique tags def tag(merged_df): - # Concaténer tous les tags en une seule chaîne + # Concatenate all tags into a single string all_tags_str = ",".join(merged_df['tag'].astype(str)) # Split the string by comma to get individual tags diff --git a/loaders.py b/loaders.py index 2ee119c8a933784f5f4cd4ebabd90d0e505b86a8..9c2ad30abbdf1e017f48c7b92d3ee692e2e9f335 100644 --- a/loaders.py +++ b/loaders.py @@ -1,26 +1,26 @@ # Third-party imports -import pandas as pd -import os -import pickle - +import pandas as pd # Import pandas for data manipulation +import os # Import os for system-related operations +import pickle # Import pickle for object serialization # Local imports -from constants import Constant as C -from surprise import Reader, Dataset +from constants import Constant as C # Import Constant class from constants module +from surprise import Reader, Dataset # Import Reader and Dataset from surprise module for loading ratings data in Surprise format + def load_model(user_name): """ - Function to load a trained model based on user ID. - :param user_id: Identifier for the user to load the specific model. + Function to load a trained model based on the user's name. + :param user_name: Name of the user to load the specific model. :return: Loaded model. """ - model_path = f'data/small/recs/{user_name}_model.p' - if os.path.exists(model_path): + model_path = f'data/small/recs/{user_name}_model.p' # Define the path to the model file + if os.path.exists(model_path): # Check if the model file exists with open(model_path, 'rb') as file: - model = pickle.load(file) - return model + model = pickle.load(file) # Load the model from the file + return model # Return the loaded model else: - print(f"No model found for user {user_name}") + print(f"No model found for user {user_name}") # Print a message if no model is found for the user return None @@ -33,13 +33,13 @@ def load_ratings(surprise_format=False): Returns: DataFrame or surprise_data: Ratings data. """ - df_ratings = pd.read_csv(C.EVIDENCE_PATH / C.RATINGS_FILENAME) + df_ratings = pd.read_csv(C.EVIDENCE_PATH / C.RATINGS_FILENAME) # Load ratings data from a CSV file if surprise_format: - reader = Reader(rating_scale=C.RATINGS_SCALE) # on met 0.5 pcq c'est la plus petite note. - surprise_data = Dataset.load_from_df(df_ratings[['userId', 'movieId', 'rating']], reader) - return surprise_data + reader = Reader(rating_scale=C.RATINGS_SCALE) # Create a Reader object with the specified rating scale + surprise_data = Dataset.load_from_df(df_ratings[['userId', 'movieId', 'rating']], reader) # Load data in Surprise format + return surprise_data # Return Surprise data else: - return df_ratings + return df_ratings # Return ratings data as a DataFrame def load_items(): @@ -48,9 +48,10 @@ def load_items(): Returns: DataFrame: Items data. """ - df_items = pd.read_csv(C.CONTENT_PATH / C.ITEMS_FILENAME) # ce qui se trouve dans le movie csv - df_items = df_items.set_index(C.ITEM_ID_COL) # movie id - return df_items + df_items = pd.read_csv(C.CONTENT_PATH / C.ITEMS_FILENAME) # Load items data from a CSV file + df_items = df_items.set_index(C.ITEM_ID_COL) # Set the index of the DataFrame to the item ID column + return df_items # Return items data + def export_evaluation_report(report): """Exports evaluation report. @@ -61,8 +62,8 @@ def export_evaluation_report(report): Returns: DataFrame: Merged ratings and items data. """ - report_name = f"evaluation_report_{pd.Timestamp.now().strftime('%Y-%m-%d')}.csv" - export_path = os.path.join(C.EVALUATION_PATH, report_name) - report.to_csv(export_path, index=False) - print("The data has been exported to the evaluation report") - return report \ No newline at end of file + report_name = f"evaluation_report_{pd.Timestamp.now().strftime('%Y-%m-%d')}.csv" # Generate a name for the report file + export_path = os.path.join(C.EVALUATION_PATH, report_name) # Define the export path for the report file + report.to_csv(export_path, index=False) # Export the evaluation report to a CSV file + print("The data has been exported to the evaluation report") # Print a message indicating successful export + return report # Return the evaluation report diff --git a/models.py b/models.py index 14424b62ddea3a3bd20774c6bf49b90d674a7cd5..bdf711e02b590a61474d8bfbc8d68ae2daa3ed8e 100644 --- a/models.py +++ b/models.py @@ -1,24 +1,14 @@ -# standard library imports +# Standard library imports from collections import defaultdict -# third parties imports +# Third-party imports import pandas as pd import numpy as np import random as rd from surprise import AlgoBase, SVD from surprise import PredictionImpossible from sklearn.metrics import mean_squared_error - -# import local from sklearn.feature_extraction.text import TfidfVectorizer -from loaders import load_items, load_ratings -from constants import Constant as C -from sklearn.linear_model import LinearRegression -from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor -from sklearn.ensemble import BaggingRegressor - -from sklearn.svm import SVR - from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet from sklearn.svm import SVR from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, AdaBoostRegressor @@ -27,22 +17,14 @@ from sklearn.neighbors import KNeighborsRegressor from xgboost import XGBRegressor from lightgbm import LGBMRegressor +# Local imports +from loaders import load_items, load_ratings +from constants import Constant as C + -# All the dataframes +# Load all necessary dataframes df_items = load_items() df_ratings = load_ratings() -df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME) - - -# Example 1 : create title_length features -df_features = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('n_character_title') -df_features = df_tag[C.TAG] - - - -df_genome_score = pd.read_csv("data/hackathon/content/genome-scores.csv") -df_genome_tag = pd.read_csv("data/hackathon/content/genome-tags.csv") - def get_top_n(predictions, n): @@ -56,7 +38,7 @@ def get_top_n(predictions, n): n(int): The number of recommendation to output for each user. Default is 10. Returns: - A dict where keys are user (raw) ids and values are lists of tuples: + A dict where keys are user (raw) ids and values are lists of tuples: [(raw item id, rating estimation), ...] of size n. """ @@ -97,6 +79,7 @@ class ModelBaseline2(AlgoBase): def estimate(self, u, i): return rd.uniform(self.trainset.rating_scale[0], self.trainset.rating_scale[1]) + # Third algorithm class ModelBaseline3(AlgoBase): def __init__(self): @@ -118,7 +101,7 @@ class ModelBaseline4(SVD): SVD.__init__(self, n_factors=100) -# ContetnBased +# ContentBased class ContentBased(AlgoBase): def __init__(self, features_method, regressor_method): AlgoBase.__init__(self) @@ -130,11 +113,9 @@ class ContentBased(AlgoBase): def create_content_features(self, features_methods): """Content Analyzer""" + # Loading dataframes needed for feature creation df_items = load_items() df_ratings = load_ratings() - df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME) - df_genome_score = pd.read_csv("data/hackathon/content/genome-scores.csv") - df_genome_tag = pd.read_csv("data/hackathon/content/genome-tags.csv") df_features = pd.DataFrame(index=df_items.index) @@ -274,39 +255,3 @@ class ContentBased(AlgoBase): mse = mean_squared_error(true_ratings, predictions) rmse_value = np.sqrt(mse) return rmse_value - - -# Example usage: -# cb = ContentBased(["title_length", "movie_year","genre","avg_rating"], "ridge_regression") -# surprise_data = load_ratings(surprise_format=True) -# trainset = surprise_data.build_full_trainset() -# testset = trainset.build_anti_testset() -# cb.fit(trainset) - - -# print("RMSE: ", cb.rmse(testset)) - - -# # Example explanations for users: -# #print(cb.explain(11)) - -# #print(cb.explain(13)) - -# print(cb.explain(17)) - -#print(cb.explain(23)) - -#print(cb.explain(27)) - -#print(cb.explain(73)) - - - -# # Obtenir les meilleures recommandations pour chaque utilisateur -# top_n_recommendations = get_top_n(predictions, n=10) - -# # Afficher les recommandations pour quelques utilisateurs spécifiques -# for user_id, user_recommendations in top_n_recommendations.items(): -# print(f"Utilisateur {user_id}:") -# for item_id, rating in user_recommendations: -# print(f" - Item {item_id}, estimation de note : {rating}") diff --git a/pages/Discover.py b/pages/Discover.py index 4613c26cbf223b6e795bd253618945142e2b0489..009d1791a062bd51f5645588456b0417018a2e8d 100644 --- a/pages/Discover.py +++ b/pages/Discover.py @@ -1,18 +1,30 @@ import streamlit as st -import pandas as pd from content import df_movies, df_ratings -from content import genres, calculate_year_range, tag, merged_df, fetch_movie_info, df_links, get_top_movies_by_genre +from content import genres, merged_df, fetch_movie_info, df_links, get_top_movies_by_genre import utils as ut +# Add the logo to the Streamlit sidebar using a utility function from the 'utils' module ut.add_logo() def display_unrated_movies(df_ratings, merged_df): - st.header(f"Hidden Gems") + """ + Display a section for unrated movies, referred to as "Hidden Gems". + + Args: + df_ratings (DataFrame): DataFrame containing movie ratings. + merged_df (DataFrame): DataFrame containing merged movie information. + """ + st.header("Hidden Gems") + # Extract the movie IDs of rated movies rated_movies = df_ratings['movieId'].unique() if 'movieId' in df_ratings.columns else [] + # Filter out rated movies to get unrated movies unrated_movies_df = merged_df[~merged_df['movieId'].isin(rated_movies)] + # Remove duplicate movies based on 'tmdbId' unique_unrated_movies_df = unrated_movies_df.drop_duplicates(subset='tmdbId') + # Sort movies by 'tmdbId' in descending order and select the top 10 sorted_movies_df = unique_unrated_movies_df.sort_values(by='tmdbId', ascending=False).head(10) cols_html = "" + # Generate HTML for displaying each unrated movie for idx, row in sorted_movies_df.iterrows(): title_dict, poster_url = fetch_movie_info(row['tmdbId']) movie_title = title_dict.get("title", "Unknown Title") if isinstance(title_dict, dict) else title_dict @@ -21,7 +33,7 @@ def display_unrated_movies(df_ratings, merged_df): cols_html += f'<div style="display: inline-block; margin-right: 20px;"><a href="{html_file_url}" target="_blank"><img src="{poster_url}" alt="{movie_title}" style="width:150px;height:225px;"></a><div style="color: white; text-decoration: none; font-size: 14px; text-align: center; max-width: 150px; word-wrap: break-word; white-space: normal;"><b>{movie_title}</b></div></a></div>' else: cols_html += f"<p>{movie_title}</p>" - + # Display the movies in a horizontally scrollable div st.markdown(f""" <div style="overflow-x: scroll; white-space: nowrap; height: 300px; display: flex; flex-direction: row;"> {cols_html} @@ -29,11 +41,21 @@ def display_unrated_movies(df_ratings, merged_df): """, unsafe_allow_html=True) def display_movies_year(merged_df): - st.header(f"New this year") + """ + Display a section for new movies released this year. + + Args: + merged_df (DataFrame): DataFrame containing merged movie information. + """ + st.header("New this year") cols_html = "" - unique_movies_df = merged_df.drop_duplicates(subset='tmdbId') # Supprimer les doublons basés sur tmdbId - sorted_movies_df = unique_movies_df.sort_values(by='annee', ascending=False) # Trier par année de sortie de manière décroissante - top_10_tmdbIds = sorted_movies_df['tmdbId'].head(10) # Prend les 10 premiers films en termes de TMDB ID + # Remove duplicate movies based on 'tmdbId' + unique_movies_df = merged_df.drop_duplicates(subset='tmdbId') + # Sort movies by 'annee' (year of release) in descending order + sorted_movies_df = unique_movies_df.sort_values(by='annee', ascending=False) + # Select the top 10 movies based on TMDB ID + top_10_tmdbIds = sorted_movies_df['tmdbId'].head(10) + # Generate HTML for displaying each new movie for tmdbId in top_10_tmdbIds: title_dict, poster_url = fetch_movie_info(tmdbId) movie_title = title_dict.get("title", "Unknown Title") if isinstance(title_dict, dict) else title_dict @@ -42,20 +64,30 @@ def display_movies_year(merged_df): cols_html += f'<div style="display: inline-block; margin-right: 20px;"><a href="{html_file_url}" target="_blank"><img src="{poster_url}" alt="{movie_title}" style="width:150px;height:225px;"></a><div style="color: white; text-decoration: none; font-size: 14px; text-align: center; max-width: 150px; word-wrap: break-word; white-space: normal;"><b>{movie_title}</b></div></a></div>' else: cols_html += f"<p>{movie_title}</p>" + # Display the movies in a horizontally scrollable div st.markdown(f""" <div style="overflow-x: scroll; white-space: nowrap; height: 300px; display: flex; flex-direction: row;"> {cols_html} </div> """, unsafe_allow_html=True) - def display_movies_by_genre(df, num_movies=10): + """ + Display sections for top movies by genre. + + Args: + df (DataFrame): DataFrame containing movie information. + num_movies (int): Number of top movies to display for each genre. + """ + # Get a list of unique genres from the DataFrame unique_genres = genres(df) for genre in unique_genres: st.header(f"Top Movies in {genre}") cols_html = "" displayed_movies = [] # List of already displayed movies, initially empty + # Get top movies IDs by genre top_movies_ids = get_top_movies_by_genre(df, genre, displayed_movies, num_movies) + # Generate HTML for displaying each top movie in the genre for tmdbId in top_movies_ids: title_dict, poster_url = fetch_movie_info(tmdbId) movie_title = title_dict.get("title", "Unknown Title") if isinstance(title_dict, dict) else title_dict @@ -64,32 +96,31 @@ def display_movies_by_genre(df, num_movies=10): cols_html += f'<div style="display: inline-block; margin-right: 20px;"><a href="{html_file_url}" target="_blank"><img src="{poster_url}" alt="{movie_title}" style="width:150px;height:225px;"></a><div style="color: white; text-decoration: none; font-size: 14px; text-align: center; max-width: 150px; word-wrap: break-word; white-space: normal;"><b>{movie_title}</b></div></a></div>' else: cols_html += f"<p>{movie_title}</p>" - + # Display the movies in a horizontally scrollable div st.markdown(f""" <div style="overflow-x: scroll; white-space: nowrap; height: 300px; display: flex; flex-direction: row;"> {cols_html} </div> """, unsafe_allow_html=True) - def main(): - st.title('Discover') # Next Access All Content + """ + Main function to display different sections of the app. + """ + st.title('Discover') # Set the title of the Streamlit app global df_links global df_ratings global df_movies - num_columns = 4 - # Existing filtering and display + # Copy the merged DataFrame for filtering and displaying filtered_movies = merged_df.copy() - unique_tmdbIds = filtered_movies.drop_duplicates(subset='tmdbId')['tmdbId'].tolist() - + # Display sections for new movies, unrated movies, and top movies by genre display_movies_year(filtered_movies) - display_unrated_movies(df_ratings, filtered_movies) - display_movies_by_genre(filtered_movies, num_movies=10) +# Entry point for the script if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/pages/Search.py b/pages/Search.py index de4209141170a6c46efa0663e34a746c812d7fe1..b11c2c31430a9e010eab8cb487a5ff366e6813fd 100644 --- a/pages/Search.py +++ b/pages/Search.py @@ -3,26 +3,66 @@ import pandas as pd from content import genres, calculate_year_range, tag, merged_df, fetch_movie_info, df_links import utils as ut - +# Add the logo to the Streamlit sidebar using a utility function from the 'utils' module ut.add_logo() def filter_movies_by_title(df, search_title): + """ + Filter movies by their title based on the search query. + + Args: + df (DataFrame): DataFrame containing movie information. + search_title (str): Search query for movie titles. + + Returns: + DataFrame: Filtered DataFrame containing movies that match the search query. + """ if not search_title: return df return df[df['title'].str.contains(search_title, case=False)] def filter_movies_by_genre(df, selected_genres): + """ + Filter movies by their genres. + + Args: + df (DataFrame): DataFrame containing movie information. + selected_genres (list): List of selected genres to filter by. + + Returns: + DataFrame: Filtered DataFrame containing movies that match the selected genres. + """ if not selected_genres: return df return df[df['genres'].apply(lambda x: any(genre in x for genre in selected_genres))] def filter_movies_by_year(df, selected_year): + """ + Filter movies by their release year within a specified range. + + Args: + df (DataFrame): DataFrame containing movie information. + selected_year (tuple): Tuple containing the start and end years for filtering. + + Returns: + DataFrame: Filtered DataFrame containing movies released within the specified year range. + """ return df[(df['annee'] >= selected_year[0]) & (df['annee'] <= selected_year[1])] def filter_movies_by_tag(df, selected_tags): + """ + Filter movies by their tags. + + Args: + df (DataFrame): DataFrame containing movie information. + selected_tags (list): List of selected tags to filter by. + + Returns: + DataFrame: Filtered DataFrame containing movies that match the selected tags. + """ if not selected_tags: return df df['tag'] = df['tag'].apply(lambda x: str(x).split('|') if pd.notna(x) else []) @@ -30,6 +70,13 @@ def filter_movies_by_tag(df, selected_tags): return filtered_df def display_movies(unique_tmdbIds, num_columns): + """ + Display movies in a grid layout with a specified number of columns. + + Args: + unique_tmdbIds (list): List of unique TMDB IDs of movies to display. + num_columns (int): Number of columns in the grid layout. + """ for i in range(0, len(unique_tmdbIds), num_columns): row_tmdbIds = unique_tmdbIds[i:i+num_columns] cols = st.columns(num_columns) @@ -38,28 +85,31 @@ def display_movies(unique_tmdbIds, num_columns): if poster_url: title = title.get("title", "Unknown Title") html_file_url = f"http://localhost:8501/{title.replace(' ', '_')}.html" + # Display the movie poster with a link to the movie's page cols[idx].markdown(f'<div style="margin-right: 20px; margin-top: 20px; margin-bottom: 5px;"><a href="{html_file_url}" target="_blank"><img src="{poster_url}" alt="{title}" style="width:150px;height:225px;"></a></div>', unsafe_allow_html=True) + # Display the movie title below the poster cols[idx].markdown(f'<div style="margin-right: 20px; margin-bottom: 5px; text-align: center; max-width: 150px;"><a href="{html_file_url}" target="_blank" style="color: white; text-decoration: none; font-size: 14px; word-wrap: break-word;"><b>{title}</b></a></div>', unsafe_allow_html=True) - - def main(): - st.title('Movie Base') + """ + Main function to display the movie filtering and display interface. + """ + st.title('Movie Base') # Set the title of the Streamlit app global df_links - num_columns = 4 + num_columns = 4 # Number of columns to display movies in - # Search by movie title + # Sidebar input for searching movies by title search_title = st.sidebar.text_input("Search by Movie Title", value="", placeholder="Type to search...", key="search_title") - # Load filters after title search + # Sidebar inputs for filtering movies selected_genres = st.sidebar.multiselect('Select Genre(s)', genres(merged_df)) unique_tags_list = tag(merged_df) tag_input = st.sidebar.selectbox("Search by Tag", [""] + sorted(unique_tags_list)) min_range, max_range = calculate_year_range(merged_df) selected_year = st.sidebar.slider('Select Year Range', min_value=min_range, max_value=max_range, value=(min_range, max_range)) - # Apply filters + # Apply filters to the merged DataFrame filtered_movies = merged_df.copy() filtered_movies = filter_movies_by_title(filtered_movies, search_title) filtered_movies = filter_movies_by_genre(filtered_movies, selected_genres) @@ -67,9 +117,12 @@ def main(): filtered_movies = filter_movies_by_tag(filtered_movies, [tag_input]) filtered_movies = filter_movies_by_year(filtered_movies, selected_year) + # Get unique TMDB IDs of filtered movies unique_tmdbIds = filtered_movies.drop_duplicates(subset='tmdbId')['tmdbId'].tolist() + # Display the filtered movies display_movies(unique_tmdbIds, num_columns) +# Entry point for the script if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/recommender.py b/recommender.py index 41aaa708c2c5c3e599a6f61a896b1fd6181c5eb2..a89e45950a16ced6af07276b2a26c3753c277eb5 100644 --- a/recommender.py +++ b/recommender.py @@ -1,13 +1,10 @@ -# Standard library imports -import heapq -import pickle -import random as rd -from collections import defaultdict - # Third-party imports import numpy as np import pandas as pd import matplotlib.pyplot as plt +import pickle +import heapq +import random as rd from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor from sklearn.feature_extraction.text import TfidfVectorizer @@ -16,8 +13,6 @@ from sklearn.metrics import mean_squared_error from sklearn.metrics import mean_absolute_error from surprise.prediction_algorithms.knns import KNNWithMeans -from sklearn.metrics.pairwise import cosine_similarity - from sklearn.neighbors import KNeighborsRegressor from sklearn.preprocessing import MultiLabelBinarizer @@ -27,7 +22,6 @@ from surprise import AlgoBase, KNNWithMeans, accuracy, PredictionImpossible from xgboost import XGBRegressor # Local imports -from constants import Constant as C from loaders import load_items, load_ratings import matplotlib.pyplot as plt @@ -208,9 +202,6 @@ class UserBased(AlgoBase): top_n_predictions_ub = user_based_predictions[:n] # Print the top N predictions for the target user - print(f"Top {n} predictions for user {target_user}:") - for movie_id, pred in top_n_predictions_ub: - print(f"MovieId {movie_id}: {pred}") return top_n_predictions_ub def inter_user_diversity(self, top_n_recommendations): @@ -269,36 +260,6 @@ class UserBased(AlgoBase): predictions = [self.predict(uid, iid, r_ui=rating) for (uid, iid, rating) in testset] return accuracy.mae(predictions, verbose=True) - -# # Construire l'ensemble d'entraînement complet et l'ensemble de test -# surprise_data = load_ratings(surprise_format=True) -# trainset = surprise_data.build_full_trainset() -# testset = trainset.build_anti_testset() - - -# # Valeurs de k à tester -# k_values = range(1, 81, 10) -# rmse_values = [] - -# #Évaluer le modèle pour chaque valeur de k -# for k in k_values: -# print(f"Évaluating for k={k}") -# algo = UserBased(k=k, min_k=k) -# algo.fit(trainset) -# rmse = algo.evaluate_rmse(testset) -# rmse_values.append(rmse) -# print(f"k={k}, RMSE={rmse}") - -# # Tracer le graphique de l'évolution du RMSE en fonction de k -# plt.figure(figsize=(10, 6)) -# plt.plot(k_values, rmse_values, marker='o') -# plt.title('Évolution du RMSE en fonction de k') -# plt.xlabel('Nombre de voisins (k)') -# plt.ylabel('RMSE') -# plt.grid(True) -# plt.show() - - ########################################################################################################################### ####################################################### KNN MODEL ######################################################## ########################################################################################################################### @@ -403,10 +364,6 @@ class RecommenderSystem_KNN : top_n_recommendations = top_n_recommendations[:n] # Optionally print the top N predictions - print(f"Top {n} recommendations for user {userid}:") - for movie_id, pred in top_n_recommendations: - print(f"MovieId {movie_id}: {pred}") - return top_n_recommendations @@ -479,12 +436,6 @@ class RecommenderSystem_KNN : plt.grid(True) plt.show() -# # Utilisation de la classe RecommenderSystem_KNN -# recommender = RecommenderSystem_KNN(ratings_path='data/small/evidence/ratings.csv') -# recommender.evaluate_knn_rmse_for_different_k() - - - ########################################################################################################################### ################################################# OTHER USER-BASED MODEL ################################################## ########################################################################################################################### @@ -544,9 +495,6 @@ class OtherUserBased: prediction = self.model.estimate(self.user_id, item_id) predictions.append((item_id, prediction)) top_10_predictions = sorted(predictions, key=lambda x: x[1], reverse=True)[:10] - print(f"Top 100 predictions for user {self.user_id} ({self.user_name}):") - for item_id, prediction in top_10_predictions: - print(f"Item {item_id} : {prediction}") return top_10_predictions # Return the predictions here @@ -649,11 +597,6 @@ def compare_models(): print(f"RMSE of the {user_info['user_name']} model: {rmse}") print(f"MAE of the {user_info['user_name']} model: {mae}") -# Call the function to compare the models -#compare_models() - - - def compare_similarity_measures(trainset,testset): """ Compare the similarity measures MAE and RMSE with Jaccard and MSD for KNN and UserBased models. @@ -717,11 +660,6 @@ def compare_similarity_measures(trainset,testset): return results -# # # Example usage: -# comparison_results = compare_similarity_measures(trainset ,testset) -# print(comparison_results) - - def evaluate_inter_user_diversity(user_based_model, ratings_path, other_user_based_models, trainset, testset, k, min_k): """ Evaluate the inter-user diversity of different recommender models with given k and min_k. @@ -784,7 +722,6 @@ def evaluate_inter_user_diversity(user_based_model, ratings_path, other_user_bas inter_user_diversity_scores[f'Other_{other_model.user_name}'] = other_model.inter_user_diversity(all_top_n_recommendations_other) - return inter_user_diversity_scores @@ -799,62 +736,6 @@ other_user_based_models = [ ] -# inter_user_diversity_scores = evaluate_inter_user_diversity(user_based_model, ratings, other_user_based_models, trainset, testset) - -# # Affichage des scores de diversité inter-utilisateurs -# print("Inter-user Diversity Scores:") -# for model_name, diversity_score in inter_user_diversity_scores.items(): -# print(f"{model_name}: {diversity_score}") - -# k_values = [10, 20, 30,40,50,60] -# results = [] - -# for k in k_values: -# user_based_model = UserBased(k=k, min_k=k) -# scores = evaluate_inter_user_diversity(user_based_model, ratings, other_user_based_models, trainset, testset, k, k) - -# # Calculate RMSE -# rmse = user_based_model.evaluate_rmse(testset) # Assuming you have a function to calculate RMSE -# scores['RMSE'] = rmse - -# results.append((k, k, scores)) - -# # Plotting -# fig, ax1 = plt.subplots(figsize=(10, 6)) - -# # Create lists to store data for plotting lines -# k_values_plt = [] -# user_based_values = [] -# rmse_values = [] - -# # Collect data for plotting -# for result in results: -# k_value, min_k_value, scores = result -# k_values_plt.append(k_value) -# user_based_values.append(scores['UserBased']) -# rmse_values.append(scores['RMSE']) - -# # Plot Inter-user Diversity -# ax1.set_xlabel('k') -# ax1.set_ylabel('Inter-user Diversity', color='tab:blue') -# ax1.set_title('Inter-user Diversity and RMSE vs k') -# ax1.plot(k_values_plt, user_based_values, marker='o', linestyle='-', label='Inter-user Diversity', color='tab:blue') - -# ax1.tick_params(axis='y', labelcolor='tab:blue') -# ax1.legend(loc='upper left') - -# # Create a second y-axis for RMSE -# ax2 = ax1.twinx() -# ax2.set_ylabel('RMSE', color='tab:red') -# ax2.plot(k_values_plt, rmse_values, marker='x', linestyle='--', label='RMSE', color='tab:red') - -# ax2.tick_params(axis='y', labelcolor='tab:red') -# ax2.legend(loc='upper right') - -# plt.grid(True) -# plt.show() - - ########################################################################################################################### ###################################################### CONTENT-BASED MODEL ################################################ ########################################################################################################################### @@ -955,7 +836,6 @@ class ContentBased(AlgoBase): for u in self.user_profile: self.user_profile[u] = [rating for (_, rating) in trainset.ur[u]] - else: regressor_models = { 'linear_regression': LinearRegression(fit_intercept=False), @@ -1045,34 +925,8 @@ def test_contentbased_class(feature_method, regressor_method, user_id=-1, n=10): # Get the top-N recommendations for each user top_n_recommendations = get_top_n(predictions, user_id= user_id, n=n) - # Print the top-N recommendations - print(f"Top {n} recommendations for User {user_id}:") - for iid, est in top_n_recommendations: - print(f"Item {iid}: {est:.2f}") return top_n_recommendations -# top_n_recommendations = test_contentbased_class(["genre"], "linear_regression", user_id=-1, n=10) - -# # Load the test ratings -# sp_ratings = load_ratings(surprise_format=True) -# # Calculate RMSE -# content_algo = ContentBased(["genre"],"linear_regression") - -# trainset = surprise_data.build_full_trainset() - -# testset = trainset.build_anti_testset() -# content_algo.fit(trainset) -# rmse = content_algo.rmse(testset) -# print("RMSE:", rmse) - -# # Calculate MAE -# test_ratings = "data/tiny/evidence/ratings.csv" -# predictions = [rating for _, rating in top_n_recommendations] -# true_ratings = [rating for _, rating in test_ratings[test_ratings['userId'] == 1]['rating']] -# mae = mean_absolute_error(true_ratings, predictions) -# print("MAE:", mae) - - ########################################################################################################################### ###################################################### LATENT FACTOR MODEL ############################################### ########################################################################################################################### @@ -1155,9 +1009,6 @@ class LatentFactorModel: sorted_ratings = sorted(user_ratings, key=lambda x: x[1], reverse=True) top_10_ratings = sorted_ratings[:10] - print(f"Top 10 predictions for user {user_id} ({self.user_name}):") - for item_id, prediction in top_10_ratings: - print(f"Item {item_id} : {prediction:.4f}") return top_10_ratings def calculate_rmse(self, ratings): @@ -1185,35 +1036,3 @@ class LatentFactorModel: y_pred.append(prediction) return mean_absolute_error(y_true, y_pred) - - - - -# # Load the data -# ratings = pd.read_csv('data/small/evidence/ratings.csv') -# # Charger les données des films -# movies = pd.read_csv('data/small/content/movies.csv') - - -# # Initialize and train the model -# user_name = "Adrien" -# lfm = LatentFactorModel(num_factors=10, learning_rate=0.01, regularization=0.1, num_epochs=20, user_name = user_name) -# lfm.fit(ratings,movies) - -# # Print RMSE -# final_rmse = lfm.calculate_rmse(ratings) -# print(f"Final RMSE after training: {final_rmse:.4f}") - -# final_mae = lfm.calculate_mae(ratings) -# print("MAE:", final_mae) - -# # Predict a rating for a specific user and movie -# user_id = -1 -# movie_id = 5218 -# predicted_rating = lfm.predict(user_id, movie_id) -# print(f"Predicted rating for user {user_id} and movie {movie_id}: {predicted_rating}") - - -# # Get the top 10 ratings for a specific user -# top_ratings = lfm.top_ratings_for_user(user_id, ratings) -# print(f"Top 10 ratings for user {user_id}: {top_ratings}") diff --git a/utils.py b/utils.py index 892944f083b970d85f7f96e9d1440127c6ba9444..627391dd3ddf8c298b1dea92a6b82eb021701710 100644 --- a/utils.py +++ b/utils.py @@ -1,14 +1,28 @@ import streamlit as st import base64 - def get_base64_of_bin_file(bin_file): + """ + Read a binary file and encode its contents to a base64 string. + + Args: + bin_file (str): Path to the binary file. + + Returns: + str: Base64 encoded string of the file contents. + """ with open(bin_file, 'rb') as f: data = f.read() return base64.b64encode(data).decode() def add_logo(): - image_path = 'backend/Logo_NAAC-removebg-preview.png' # Remplacez cela par le chemin vers votre image + """ + Add a logo image to the Streamlit sidebar. + + The image is added by setting the background of the sidebar navigation + using inline CSS and a base64 encoded string of the image file. + """ + image_path = 'backend/Logo_NAAC-removebg-preview.png' # Replace this with the path to your image image_base64 = get_base64_of_bin_file(image_path) st.markdown( f""" @@ -20,9 +34,7 @@ def add_logo(): background-position: 20px 20px; background-size: 300px; }} - </style> """, unsafe_allow_html=True, - ) - + ) \ No newline at end of file