diff --git a/projet_en_groupe/algorithme_netflix.py b/projet_en_groupe/algorithme_netflix.py index c4392c52cad60928c0da9bcd0ed149fb5d895b5e..051bb9a30c73ed94cd7160fad72a13385e36fb56 100644 --- a/projet_en_groupe/algorithme_netflix.py +++ b/projet_en_groupe/algorithme_netflix.py @@ -273,80 +273,73 @@ def specific_genre_actor(data_1): save_to_csv(actor_type_data) else: print(f"No movies or series found for the actor {actor_input} and type {type_input}.") + return -def most_rated(data_1): - filtered_data = filter_media_type(data_1) +# rating - # Check if the 'rating' column exists in the dataset - if 'rating' not in filtered_data.columns: - print("The dataset does not contain a 'rating' column.") - return +# these are variables that needs to be registered in general not in a local function +notes = data_2.drop('show_id', axis = 1) +mean_type = notes.mean(axis = 1) * 100 +data_2['appreciation (%)'] = mean_type - # Sort the data by the 'rating' column in descending order - sorted_data = filtered_data.sort_values(by='rating', ascending=False) +def most_rated(data_1, data_2) : - # Display the top-rated movies and/or series - if not sorted_data.empty: - print("Top-rated movies and/or series:") - print(sorted_data[['title', 'type', 'rating']].head(10)) # Display the top 10 - save_to_csv(sorted_data) - else: - print("No rated movies or series found.") + filtered_data = filter_media_type(data_1) + link_between = pd.merge(filtered_data,data_2, on='show_id') -def most_rated_year(data_1): - filtered_data = filter_media_type(data_1) + link_between_sorted = link_between.sort_values(by='appreciation (%)', ascending=False) - # Check if the 'rating' and 'release_year' columns exist in the dataset - if 'rating' not in filtered_data.columns or 'release_year' not in filtered_data.columns: - print("The dataset does not contain the necessary columns.") - return + print("Films et séries les mieux notés :") + print(link_between_sorted[['show_id', 'title', 'type', 'appreciation (%)']]) + save_to_csv(link_between_sorted) + return - year_input = input("Enter the year to show the most rated movies and/or series: ") + +def most_rated_year(data_1, data_2): + # Display all available unique release years + available_years = data_1['release_year'].unique() + print("Available years: ", available_years) + + # Ask the user to enter a release year + year = input("Enter a release year: ") try: - year_input = int(year_input) + # Convert the year to an integer + year = int(year) except ValueError: - print("Invalid input. Please enter a valid year.") + print("Please enter a valid year.") return - # Filter the data for the specified year - year_data = filtered_data[filtered_data['release_year'] == year_input] - - if not year_data.empty: - # Sort the data by the 'rating' column in descending order - sorted_data = year_data.sort_values(by='rating', ascending=False) + # Filter the data based on the release year + filtered_data = filter_media_type(data_1[data_1['release_year'] == year]) - # Display the top-rated movies and/or series in the specified year - print(f"\nTop-rated movies and/or series in {year_input}:") - print(sorted_data[['title', 'type', 'rating']].head(10)) # Display the top 10 - save_to_csv(sorted_data) - else: - print(f"No rated movies or series found for the year {year_input}.") - - -def most_rated_recent(data_1): - filtered_data = filter_media_type(data_1) + # Merge the DataFrames on the 'show_id' key + link_between = pd.merge(filtered_data, data_2, on='show_id') - # Check if the 'rating' and 'date_added' columns exist in the dataset - if 'rating' not in filtered_data.columns or 'date_added' not in filtered_data.columns: - print("The dataset does not contain the necessary columns.") - return + # Sort the DataFrame by the 'appreciation' column (in descending order) + link_between_sorted = link_between.sort_values(by='appreciation (%)', ascending=False) - # Sort the data by the 'rating' column in descending order and 'date_added' in descending order - sorted_data = filtered_data.sort_values(by=['rating', 'date_added'], descending=[False, True]) + print(f"Top-rated shows for the year {year}:") + print(link_between_sorted[['show_id', 'title', 'type', 'release_year', 'appreciation (%)']]) + save_to_csv(link_between_sorted) + return - # Display the most rated and recent movies and/or series - if not sorted_data.empty: - print("Most rated and recent movies and/or series:") - print(sorted_data[['title', 'type', 'rating', 'date_added']].head(10)) # Display the top 10 - save_to_csv(sorted_data) - else: - print("No rated and recent movies or series found.") +def most_rated_recent(data_1, data_2): + # Merge the DataFrames on the 'show_id' key + merged_data = pd.merge(data_1, data_2, on='show_id') + # Sort the DataFrame by the 'appreciation' column (in descending order) and 'release_year' (in descending order) + sorted_data = merged_data.sort_values(by=['release_year', 'appreciation (%)'], ascending=[False, False]) + # Display the most rated and recent shows + top_20_data = sorted_data.head(20) + print("Top 20 most rated and recent shows:") + print(top_20_data[['show_id', 'title', 'type', 'release_year', 'appreciation (%)']]) + save_to_csv(top_20_data) + return # Example usage @@ -360,6 +353,7 @@ def parental_code(data_1): code_list.append(code) print("Here are the parental codes: ") print(code_list) + save_to_csv(code_list) @@ -440,9 +434,7 @@ def basic_statistics(data_1): country_counts = data_1['country'].str.split(', ').explode().value_counts() print("\nCountries that produced movies/series, sorted from most to least productive:") print(country_counts) - save_to_csv(pd.DataFrame({'Country Count' : [country_counts], 'Movies Count': [movies_count], 'Series Count': [series_count]}), 'counts.csv') - - + return # attention il faut demander à chaque fois, s'il désire enregistrer la liste sur un .csv @@ -545,11 +537,11 @@ def action() : elif command == "11" : specific_genre_actor(data_1) elif command == "12" : - most_rated(data_1) + most_rated(data_1, data_2) elif command == "13" : - most_rated_year(data_1) + most_rated_year(data_1, data_2) elif command == "14" : - most_rated_recent(data_1) + most_rated_recent(data_1, data_2) elif command == "15" : parental_code(data_1) elif command == "16" :