# algorithme_netflix.py

# afficher le premier document

import pandas as pd
import tabulate
import os
import csv

#display(data_2)


"""file_path_1 = "/content/drive/MyDrive/Coding_project_2023/netflix_titles-2.csv"""
# Absolute path of the catalog CSV on the author's machine; adjust it to run elsewhere.
file_path_1 = "/Users/adrien/vscodeworkspace/coding-project/projet_en_groupe/data_cp_2023/netflix_titles-2.csv"
# Catalog DataFrame passed to the menu functions as `data_1`.
data_1 = pd.read_csv(file_path_1)

# The bare string below is an inert expression statement that keeps an alternative path around.
"""file_path_2 = "/content/drive/MyDrive/Coding_project_2023/ratings.csv"""
# Absolute path of the ratings CSV on the author's machine.
file_path_2 ="/Users/adrien/vscodeworkspace/coding-project/projet_en_groupe/data_cp_2023/ratings.csv"
# Ratings DataFrame; it has a 'show_id' column (dropped and merged on later in this file).
data_2 = pd.read_csv(file_path_2)


# Show the catalog
def catalog(data_1):
    """Print the first 100 rows of the catalog, then offer to save it.

    Args:
        data_1: catalog DataFrame.
    """
    preview = data_1.head(100)
    print(preview)

    # The whole frame (not just the preview) is handed to save_to_csv,
    # which asks the user whether to write it to disk.
    save_to_csv(data_1)


def movies(data_1):
    """Print the titles of all rows whose 'type' is 'Movie' and offer to save them.

    Args:
        data_1: catalog DataFrame with 'type' and 'title' columns.
    """
    is_movie = data_1['type'] == 'Movie'
    titles = data_1.loc[is_movie, 'title'].tolist()

    # Single-column frame so the titles can be printed and exported as-is.
    movie_df = pd.DataFrame({'Movie Titles': titles})
    print(movie_df)

    # save_to_csv asks the user whether the list should be written to a .csv.
    save_to_csv(movie_df)


def series(data_1):
    """Print the titles of all rows whose 'type' is 'TV Show' and offer to save them.

    Args:
        data_1: catalog DataFrame with 'type' and 'title' columns.
    """
    tv_shows = data_1[data_1['type'] == 'TV Show']  # keep only series
    series_titles = tv_shows['title'].tolist()

    # The column is labelled for series; it previously reused the movies label,
    # which mislabelled both the printed table and the exported CSV header.
    series_df = pd.DataFrame({'Series Titles': series_titles})

    print(series_df)

    # save_to_csv asks the user whether the list should be written to a .csv.
    save_to_csv(series_df)


def by_year(data_1):
    """Print the catalog (filtered by media type) sorted by 'release_year'.

    The user chooses the media type and the sort direction; the sorted frame
    is then offered for saving. Nothing is printed or saved when the sort
    direction is not recognised.

    Args:
        data_1: catalog DataFrame with a 'release_year' column.
    """
    filtered_data = filter_media_type(data_1)
    if filtered_data is None:
        return  # nothing to sort

    # Maps the accepted answers to the `ascending` flag of sort_values.
    directions = {"ascending": True, "descending": False}

    sort_type = input("Do you want to sort the years in ascending or descending order? (ascending/descending)")
    if sort_type not in directions:
        print("Invalid choice. The dataset could not be sorted!")
        return

    sorted_data = filtered_data.sort_values(by='release_year', ascending=directions[sort_type])

    print(sorted_data)
    save_to_csv(sorted_data)


def by_country(data_1):
    """List the available countries, then show the titles matching one of them.

    The user first picks a media type, then types a country name. Rows whose
    'country' cell contains that text (case-insensitive) are printed and
    offered for saving.

    Args:
        data_1: catalog DataFrame with a 'country' column.
    """
    filtered_data = filter_media_type(data_1)

    # A set gives de-duplication without rescanning a list for every name.
    country_names = set()
    for countries in filtered_data['country'].dropna().str.split(','):
        country_names.update(name.strip() for name in countries)
    country_names.discard('')  # cells such as ", France" produce an empty name

    print("List of all available countries:")
    print(sorted(country_names))

    country_input = input("Enter the name of the country to display movies and/or series: ").strip().capitalize()

    # regex=False: the user's text is matched literally, so characters such as
    # "(" or "+" cannot raise a regex error or act as wildcards.
    matches = filtered_data['country'].str.lower().str.contains(
        country_input.lower(), regex=False, na=False
    )
    country_data = filtered_data[matches]

    if not country_data.empty:
        print(country_data)
        save_to_csv(country_data)
    else:
        print(f"No movies or series found for the country {country_input}.")

    return

 # Be careful, you need to ask each time if they want to save the list to a .csv


def genre(data_1):
    """List every genre, then show the titles whose genres contain the user's text.

    The user first picks a media type, then types a genre. Rows whose
    'listed_in' cell contains that text (case-insensitive) are printed and
    offered for saving.

    Args:
        data_1: catalog DataFrame with a 'listed_in' column of ', '-separated genres.
    """
    filtered_data = filter_media_type(data_1)

    # The genre list is built from the full catalog, not from the filtered view.
    genre_names = set()
    for genres in data_1['listed_in'].dropna().str.split(', '):
        genre_names.update(genres)
    genre_names.discard('')

    print("List of all possible genres:")
    print(sorted(genre_names))

    type_input = input("Enter the type (romantic, action, drama, etc.) to display movies and/or series: ").strip().capitalize()

    # regex=False: match the typed text literally instead of compiling it as a
    # regular expression (which could raise on input such as "(").
    matches = filtered_data['listed_in'].str.lower().str.contains(
        type_input.lower(), regex=False, na=False
    )
    type_data = filtered_data[matches]

    if not type_data.empty:
        print(type_data)
        save_to_csv(type_data)
    else:
        print(f"No movies or series found for the type {type_input}.")


def duration(data_1):
    """Show the titles of one genre sorted by duration.

    The user picks a media type, a genre and a sort direction. Matching rows
    are sorted by the number found in their 'duration' cell, printed, and
    offered for saving.

    Args:
        data_1: catalog DataFrame with 'listed_in' and 'duration' columns.
    """
    filtered_data = filter_media_type(data_1)

    # The genre list is built from the full catalog, not from the filtered view.
    genre_names = set()
    for genres in data_1['listed_in'].dropna().str.split(', '):
        genre_names.update(genres)
    genre_names.discard('')

    print("List of all possible genres:")
    print(sorted(genre_names))

    type_input = input("Enter the type (romantic, action, drama, etc.) to display movies and/or series: ").strip().capitalize()

    # regex=False: match the typed text literally rather than as a regex.
    matches = filtered_data['listed_in'].str.lower().str.contains(
        type_input.lower(), regex=False, na=False
    )
    type_data = filtered_data[matches]

    # Bail out before asking for a sort order when there is nothing to sort.
    if type_data.empty:
        print(f"No movies or series found for the type {type_input}.")
        return

    print("Quel type de tri voulez-vous ? ")
    print("1. Croissant")
    print("2. Décroissant")
    sort_order = input("Entrez le numéro du type de tri : ").strip()

    if sort_order not in ('1', '2'):
        print("Invalid sort order. Defaulting to ascending order.")
    ascending = sort_order != '2'

    def leading_number(column):
        # Sorting the raw text is lexicographic ("100 min" < "90 min"), so the
        # first run of digits is used as the sort key instead.
        # Assumes cells look like "90 min" / "2 Seasons" — TODO confirm.
        # Cells without digits become NaN and are placed last by sort_values.
        digits = column.astype(str).str.extract(r'(\d+)', expand=False)
        return pd.to_numeric(digits, errors='coerce')

    # NOTE(review): when both media types are shown, the numbers compared may
    # be in different units (e.g. minutes vs. seasons) — confirm this is acceptable.
    type_data_sorted = type_data.sort_values(by='duration', key=leading_number, ascending=ascending)

    print(type_data_sorted)
    save_to_csv(type_data_sorted)

def director(data_1):
    """List every director, then show one director's titles sorted by release year.

    The user picks a media type and types a director name. Rows whose
    'director' cell contains that text (case-insensitive) are sorted by
    'release_year' ascending, printed, and offered for saving.

    Args:
        data_1: catalog DataFrame with 'director' and 'release_year' columns.
    """
    filtered_data = filter_media_type(data_1)

    # dict.fromkeys de-duplicates while keeping first-seen order, without the
    # repeated list scans of "if name not in list".
    director_names = dict.fromkeys(
        name
        for names in data_1['director'].dropna().str.split(', ')
        for name in names
        if name != ''
    )

    print("List of all possible directors: ")
    print(list(director_names))

    director_input = input("Enter the name of the director to display movies and/or series: ").strip()

    # regex=False: a name containing "." or "(" is matched literally instead of
    # being interpreted (or rejected) as a regular expression.
    matches = filtered_data['director'].str.lower().str.contains(
        director_input.lower(), regex=False, na=False
    )
    director_data = filtered_data[matches]

    if not director_data.empty:
        director_data_sorted = director_data.sort_values(by='release_year', ascending=True)

        print(director_data_sorted)
        save_to_csv(director_data_sorted)
    else:
        print(f"No person found with the name {director_input}.")


def actor(data_1):
    """List every actor, then show one actor's titles sorted by release year.

    The user picks a media type and types an actor name. Rows whose 'cast'
    cell contains that text (case-insensitive) are sorted by 'release_year'
    ascending, printed, and offered for saving.

    Args:
        data_1: catalog DataFrame with 'cast' and 'release_year' columns.
    """
    filtered_data = filter_media_type(data_1)

    # dict.fromkeys de-duplicates while keeping first-seen order, without the
    # repeated list scans of "if name not in list".
    actor_names = dict.fromkeys(
        name
        for names in data_1['cast'].dropna().str.split(', ')
        for name in names
        if name != ''
    )

    print("List of all possible actors: ")
    print(list(actor_names))

    actor_input = input("Enter the name of the actor to display movies and/or series: ").strip()

    # regex=False: the typed name is matched literally, never compiled as a regex.
    matches = filtered_data['cast'].str.lower().str.contains(
        actor_input.lower(), regex=False, na=False
    )
    actor_data = filtered_data[matches]

    if not actor_data.empty:
        actor_data_sorted = actor_data.sort_values(by='release_year', ascending=True)

        print(actor_data_sorted)
        save_to_csv(actor_data_sorted)
    else:
        print(f"No actor found with the name {actor_input}.")


def specific_genre_director(data_1):
    """Count and show the titles of one director within one genre.

    The user picks a media type, a director and a genre. Rows matching both
    (case-insensitive substring match on 'director' and 'listed_in') are
    counted, printed, and offered for saving.

    Args:
        data_1: catalog DataFrame with 'director' and 'listed_in' columns.
    """
    filtered_data = filter_media_type(data_1)

    # Missing values are dropped so that "nan" is not offered as a director.
    unique_directors = [str(name) for name in filtered_data['director'].dropna().unique()]

    print("List of all available directors:")
    print(', '.join(unique_directors))

    director_input = input("Enter the name of the director to display movies and/or series: ").strip()

    # Same treatment for genres: a NaN here would make str.join raise TypeError.
    unique_types = [str(kind) for kind in filtered_data['listed_in'].dropna().unique()]
    print("\nList of all available types:")
    print(', '.join(unique_types))

    type_input = input("Enter the type (romantic, action, drama, etc.): ").strip().capitalize()

    # regex=False: both user inputs are matched literally, not as regexes.
    by_director = filtered_data['director'].str.lower().str.contains(
        director_input.lower(), regex=False, na=False
    )
    by_type = filtered_data['listed_in'].str.lower().str.contains(
        type_input.lower(), regex=False, na=False
    )
    director_type_data = filtered_data[by_director & by_type]

    if not director_type_data.empty:
        count = len(director_type_data)
        print(f"The director {director_input} has directed {count} movie(s) or series of type {type_input}.")
        print(director_type_data)
        save_to_csv(director_type_data)
    else:
        print(f"No movies or series found for the director {director_input} and type {type_input}.")


def specific_genre_actor(data_1):
    """Count and show the titles featuring one actor within one genre.

    The user picks a media type, an actor and a genre. Rows matching both
    (case-insensitive substring match on 'cast' and 'listed_in') are counted,
    printed, and offered for saving.

    Args:
        data_1: catalog DataFrame with 'cast' and 'listed_in' columns.
    """
    filtered_data = filter_media_type(data_1)

    # Each entry is a whole 'cast' cell; missing values are dropped so that
    # "nan" is not listed.
    unique_actors = [str(cast) for cast in filtered_data['cast'].dropna().unique()]

    print("List of all available actors:")
    print(', '.join(unique_actors))

    actor_input = input("Enter the name of the actor to display movies and/or series: ").strip()

    # A NaN genre would make str.join raise TypeError, so drop and stringify.
    unique_types = [str(kind) for kind in filtered_data['listed_in'].dropna().unique()]
    print("\nList of all available types:")
    print(', '.join(unique_types))

    type_input = input("Enter the type (romantic, action, drama, etc.): ").strip().capitalize()

    # regex=False: both user inputs are matched literally, not as regexes.
    by_actor = filtered_data['cast'].str.lower().str.contains(
        actor_input.lower(), regex=False, na=False
    )
    by_type = filtered_data['listed_in'].str.lower().str.contains(
        type_input.lower(), regex=False, na=False
    )
    actor_type_data = filtered_data[by_actor & by_type]

    if not actor_type_data.empty:
        count = len(actor_type_data)
        print(f"The actor {actor_input} has acted in {count} movie(s) or series of type {type_input}.")
        print(actor_type_data)
        save_to_csv(actor_type_data)
    else:
        print(f"No movies or series found for the actor {actor_input} and type {type_input}.")
    return


# rating

# These values are computed once at module level (not inside a function) so
# that every rating function in this file sees the same 'appreciation (%)' column.
# `notes` is data_2 without its 'show_id' column.
notes = data_2.drop('show_id', axis = 1)
# Row-wise mean of the remaining columns, multiplied by 100.
# NOTE(review): every column counts toward the mean, including 0 entries — confirm
# whether 0 means "not rated" and should be excluded.
mean_type = notes.mean(axis = 1) * 100
# Stored on data_2 itself so that merges on 'show_id' carry the score along.
data_2['appreciation (%)'] = mean_type

def most_rated(data_1, data_2):
    """Print the titles ranked by 'appreciation (%)', highest first.

    The catalog is filtered by the media type the user chooses, joined with
    the ratings on 'show_id', sorted, printed (selected columns only) and the
    full sorted frame is offered for saving.

    Args:
        data_1: catalog DataFrame.
        data_2: ratings DataFrame holding 'show_id' and 'appreciation (%)'.
    """
    selection = filter_media_type(data_1)

    ranked = (
        selection
        .merge(data_2, on='show_id')
        .sort_values(by='appreciation (%)', ascending=False)
    )

    shown_columns = ['show_id', 'title', 'type', 'appreciation (%)']
    print("Films et séries les mieux notés :")
    print(ranked[shown_columns])
    save_to_csv(ranked)


def most_rated_year(data_1, data_2):
    """Print the titles of one release year ranked by 'appreciation (%)'.

    Shows the available years, asks for one, then filters by that year and by
    media type, joins with the ratings on 'show_id' and prints the result
    sorted from highest to lowest score. Returns early, after a message, when
    the typed year is not an integer.

    Args:
        data_1: catalog DataFrame with a 'release_year' column.
        data_2: ratings DataFrame holding 'show_id' and 'appreciation (%)'.
    """
    release_years = data_1['release_year']
    print("Available years: ", sorted(release_years.unique()))

    raw_year = input("Enter a release year: ")
    try:
        year = int(raw_year)
    except ValueError:
        print("Please enter a valid year.")
        return

    # Year filter first, then the media-type prompt on the reduced frame.
    same_year = data_1[release_years == year]
    filtered_data = filter_media_type(same_year)

    ranked = (
        filtered_data
        .merge(data_2, on='show_id')
        .sort_values(by='appreciation (%)', ascending=False)
    )

    shown_columns = ['show_id', 'title', 'type', 'release_year', 'appreciation (%)']
    print(f"Top-rated shows for the year {year}:")
    print(ranked[shown_columns])
    save_to_csv(ranked)


def most_rated_recent(data_1, data_2):
    """Print the 20 first titles when ordered by release year, then by score.

    The catalog and ratings are joined on 'show_id' and sorted by
    'release_year' (descending) with 'appreciation (%)' (descending) as the
    tie-breaker; the first 20 rows are printed and offered for saving.

    Args:
        data_1: catalog DataFrame.
        data_2: ratings DataFrame holding 'show_id' and 'appreciation (%)'.
    """
    ordering = ['release_year', 'appreciation (%)']
    top_20_data = (
        data_1
        .merge(data_2, on='show_id')
        .sort_values(by=ordering, ascending=[False, False])
        .head(20)
    )

    shown_columns = ['show_id', 'title', 'type', 'release_year', 'appreciation (%)']
    print("Top 20 most rated and recent shows:")
    print(top_20_data[shown_columns])
    save_to_csv(top_20_data)


# Example usage

def parental_code(data_1):
    """Show the titles carrying exactly the parental code chosen by the user.

    The accepted codes are printed, the user types one, and the rows whose
    'rating' equals that code are printed and offered for saving. An
    unrecognised code only prints an error message.

    Args:
        data_1: catalog DataFrame with a 'rating' column.
    """
    valid_codes = {'PG-13', 'TV-MA', 'PG', 'TV-14', 'TV-PG', 'TV-Y', 'TV-Y7', 'R', 'TV-G', 'G', 'NC-17', 'NR', 'TV-Y7-FV', 'UR'}

    print("Valid parental codes:")
    # Sorted so the list is printed in the same order on every run
    # (iterating a set directly gives an arbitrary order).
    print(', '.join(sorted(valid_codes)))

    # Every accepted code is upper-case, so the input is normalised to match.
    selected_code = input("Enter a parental code to display movies and/or series: ").strip().upper()

    if selected_code not in valid_codes:
        print("Invalid parental code entered.")
        return

    # Exact comparison: a substring match would make "PG" also return PG-13
    # and TV-PG, and "R" also return NR and UR.
    result_data = data_1[data_1['rating'] == selected_code]

    if not result_data.empty:
        print(result_data)
        save_to_csv(result_data)
    else:
        print(f"No movies or series found for the parental code {selected_code}.")

    return


def directors_nationality(data_1):
    """Rank directors by number of titles and list the countries of those titles.

    For every director named in the 'director' column, collects the distinct
    countries found in the 'country' cells of their rows and counts the rows.
    The 25 most prolific directors are printed and the full table is offered
    for saving.

    NOTE(review): the 'country' column is reported as the director's
    nationality; whether it is that or the production country cannot be told
    from this file.

    Args:
        data_1: catalog DataFrame with 'director' and 'country' columns.

    Returns:
        DataFrame with columns 'director', 'nationalities' and
        'number of movies or series', sorted by the count (descending);
        None when a required column is missing.
    """
    if 'director' not in data_1.columns or 'country' not in data_1.columns:
        print("The dataset does not contain 'director' or 'country' columns.")
        return

    stats = {}
    for director_cell, country_cell in zip(data_1['director'], data_1['country']):
        if pd.isna(director_cell):
            continue  # rows without a director contribute nothing

        # A cell may hold several comma-separated countries; split it so each
        # country is recorded once, and skip missing values instead of
        # recording the text "nan".
        if pd.notna(country_cell):
            countries = {name.strip() for name in str(country_cell).split(',') if name.strip()}
        else:
            countries = set()

        for director in str(director_cell).split(', '):
            entry = stats.setdefault(
                director,
                {'nationalities': set(), 'number of movies or series': 0},
            )
            entry['nationalities'].update(countries)
            entry['number of movies or series'] += 1

    # Sorted lists make the printed and exported output reproducible.
    for entry in stats.values():
        entry['nationalities'] = sorted(entry['nationalities'])

    sorted_directors = sorted(
        stats.items(),
        key=lambda item: item[1]['number of movies or series'],
        reverse=True,
    )

    columns = ['director', 'nationalities', 'number of movies or series']
    directors_df = pd.DataFrame(
        [
            [director, ', '.join(info['nationalities']), info['number of movies or series']]
            for director, info in sorted_directors
        ],
        columns=columns,
    )

    print("Directors and their nationalities, sorted by the number of movies and series produced:")
    for rank, (director, info) in enumerate(sorted_directors[:25], start=1):
        nationalities_str = ', '.join(info['nationalities'])
        print(f"{rank}. {director}: {nationalities_str} - {info['number of movies or series']} movies/series")

    # Offer to export the complete table (not only the 25 rows printed above).
    save_to_csv(directors_df)

    return directors_df


# Allow to filter if we want movie, tv show or both 
def filter_media_type(data, media_type=None):
    """Return the rows of `data` matching a media type.

    Args:
        data: DataFrame with a 'type' column.
        media_type: 'movie', 'tv show' or 'both' (case and surrounding
            whitespace are ignored). When None, the user is prompted for it.

    Returns:
        `data` itself for 'both' or for an unrecognised choice (after printing
        a message); otherwise the rows whose lower-cased 'type' equals the choice.
    """
    if media_type is None:
        media_type = input("What type of media do you want to display? (Movie/TV Show/Both): ")

    # Strip so that an answer such as "movie " is not rejected as invalid.
    media_type = media_type.strip().lower()

    if media_type == 'both':
        return data
    if media_type in ('movie', 'tv show'):
        return data[data['type'].str.lower() == media_type]

    print("Invalid choice. Displaying all types of media.")
    return data  # fall back to the unfiltered data


# Example usage
def basic_statistics(data_1):
    """Print movie/series counts and a per-country title count.

    Prints how many rows have 'type' equal to 'Movie' and to 'TV Show', which
    of the two is larger, and the number of rows per country (the 'country'
    cells are split on ', '), most frequent first. Prints a message and
    returns early when 'type' or 'country' is missing.

    Args:
        data_1: catalog DataFrame.
    """
    required_columns = {'type', 'country'}
    if not required_columns.issubset(data_1.columns):
        print("The dataset does not contain the necessary columns.")
        return

    media_types = data_1['type']
    movies_count = int((media_types == 'Movie').sum())
    series_count = int((media_types == 'TV Show').sum())

    print(f"Number of movies in the catalog: {movies_count}")
    print(f"Number of series in the catalog: {series_count}")

    # Exactly one of the three comparison messages is printed.
    if movies_count == series_count:
        verdict = "The catalog has an equal number of movies and series."
    elif movies_count > series_count:
        verdict = "There are more movies than series in the catalog."
    else:
        verdict = "There are more series than movies in the catalog."
    print(verdict)

    # One row per (title, country) pair, then count the occurrences.
    one_country_per_row = data_1['country'].str.split(', ').explode()
    country_counts = one_country_per_row.value_counts()
    print("\nCountries that produced movies/series, sorted from most to least productive:")
    print(country_counts)
    return


# Reminder: always ask the user whether they want to save the list to a .csv file.

def save_to_csv(data, default_filename='output.csv'):
    """Ask the user whether to save `data` as CSV and, if so, under which name.

    The user is asked for a file name; an empty answer selects
    `default_filename`. If the chosen file already exists the user may
    overwrite it or give another name, which is checked again.

    Args:
        data: object with a `to_csv(path, index=False)` method (a DataFrame).
        default_filename: name used when the user just presses Enter.
    """
    def as_csv_name(raw_name):
        # The empty check must come before the extension is appended;
        # otherwise an empty answer becomes ".csv" and the default is never used.
        raw_name = raw_name.strip()
        if not raw_name:
            return default_filename
        # Tolerate users who typed the extension despite the prompt.
        if raw_name.lower().endswith('.csv'):
            return raw_name
        return raw_name + ".csv"

    save_choice = input("Do you want to save the data to a CSV file? (YES/NO): ").strip().upper()
    if save_choice != 'YES':
        print("Data not saved.")
        return

    file_name = as_csv_name(
        input("Enter the file name (DO NOT include .csv extension, or press Enter for the default): ")
    )

    # Keep asking while the target exists and the user refuses to overwrite,
    # so a replacement name can never silently clobber another file.
    while os.path.exists(file_name):
        overwrite_choice = input(
            f"The file '{file_name}' already exists. Do you want to overwrite it? (YES/NO): "
        ).strip().upper()
        if overwrite_choice == 'YES':
            break
        file_name = as_csv_name(input("Enter a new file name (DO NOT include .csv extension): "))

    data.to_csv(file_name, index=False)
    print(f"Data saved to {file_name}")


# Start of the recommendation algorithm

# Module-level list of categories; it starts empty (no CSV file is loaded here).
categories = []


def read_movie_series_info(file_path):
    """Read the catalog CSV into a dict keyed by show id.

    The first row is skipped as a header. Every following row must have
    exactly 12 fields; the 1st is used as the key, the 3rd as the title and
    the 11th is split on ', ' into a list of categories.

    Args:
        file_path: path of a UTF-8 encoded CSV file.

    Returns:
        dict mapping show_id -> [title, list_of_categories].
    """
    entries = {}
    with open(file_path, 'r', encoding='utf-8') as handle:
        rows = csv.reader(handle)
        next(rows)  # header row
        for record in rows:
            # Full 12-way unpacking keeps the strict "exactly 12 fields" check.
            (show_id, _kind, title, _director, _cast, _country,
             _date_added, _release_year, _rating, _duration,
             listed_in, _description) = record
            entries[show_id] = [title, listed_in.split(', ')]
    return entries

def read_user_ratings(file_path):
    """Read the ratings CSV into a nested dict.

    The header row supplies the user ids (every column after the first, as
    integers). Each following row supplies a show id in its first column and
    one integer per user in the remaining columns.

    Args:
        file_path: path of a UTF-8 encoded CSV file.

    Returns:
        dict mapping show_id -> {user_id: rating}.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        reader = csv.reader(handle)
        user_ids = [int(label) for label in next(reader)[1:]]

        return {
            line[0]: dict(zip(user_ids, [int(value) for value in line[1:]]))
            for line in reader
        }

def create_category_matrix(catalog, categories, output_file_path):
    """Build the category co-occurrence matrix and export it to an Excel file.

    Cell [i][j] counts the catalog entries whose category list contains both
    categories[i] and categories[j] (the diagonal counts entries containing
    categories[i]).

    Args:
        catalog: dict mapping show_id -> [title, list_of_categories].
        categories: list of category names; fixes the row/column order.
        output_file_path: destination passed to DataFrame.to_excel.

    Returns:
        The matrix as a list of lists of ints, without the category labels.
    """
    size = len(categories)
    category_matrix = [[0] * size for _ in range(size)]

    for show_info in catalog.values():
        # Positions of the categories this entry belongs to.
        present = [
            position
            for position, name in enumerate(categories)
            if name in show_info[1]
        ]
        # Every ordered pair of present categories (including i == j) gets +1.
        for row in present:
            for col in present:
                category_matrix[row][col] += 1

    # The exported sheet carries the category names as first column and header.
    labelled_rows = [[name] + counts for name, counts in zip(categories, category_matrix)]
    df = pd.DataFrame(labelled_rows, columns=[''] + categories)
    df.to_excel(output_file_path, index=False)

    return category_matrix


def recommend_movies(user_id, catalog, user_ratings, category_matrix, threshold=0.9999):
    """Suggest up to five shows the user has rated 0, ranked by category similarity.

    A candidate is any catalog entry for which `user_ratings` holds a rating
    of exactly 0 for this user. Its score is, summed over each of its
    categories c, the smallest co-occurrence count between c and the
    categories it shares with the shows the user rated above 0. When the user
    has no rating above 0, every category counts as shared.

    Args:
        user_id: user identifier (anything `int()` accepts).
        catalog: dict mapping show_id -> [title, list_of_categories].
        user_ratings: dict mapping show_id -> {user_id: rating}.
        category_matrix: square co-occurrence matrix whose rows/columns follow
            the order of the category list derived from `catalog` below.
        threshold: minimum score (exclusive) for a show to be suggested.

    Returns:
        List of at most five (show_id, {'title': ..., 'similarity': ...})
        tuples, highest similarity first.
    """
    user_id = int(user_id)

    # NOTE(review): the matrix carries no labels, so its indices are recovered
    # by rebuilding the category list with the same expression the caller uses;
    # this relies on both calls iterating the set in the same order.
    # The list stays local: the module-level `categories` is no longer rebound,
    # since the loop below used to overwrite it with a catalog entry.
    all_categories = list(set(category for _, movie_info in catalog.items() for category in movie_info[1]))
    category_index = {category: i for i, category in enumerate(all_categories)}

    def rating_of(show_id):
        # None when the show or the user has no entry in user_ratings.
        return user_ratings.get(show_id, {}).get(user_id)

    # Categories of the shows this user rated above 0.
    user_categories = {
        category
        for show_id, show_info in catalog.items()
        if (rating_of(show_id) or 0) > 0
        for category in show_info[1]
    }
    if not user_categories:
        # No history yet: fall back to comparing against every category.
        user_categories = set(all_categories)

    suggestions = {}
    for show_id, show_info in catalog.items():
        # Only shows explicitly rated 0 by this user are candidates.
        if rating_of(show_id) != 0:
            continue

        show_categories = show_info[1]
        common_categories = [c for c in show_categories if c in user_categories]
        if not common_categories:
            continue

        similarity = sum(
            min(
                category_matrix[category_index[category]][category_index[shared]]
                for shared in common_categories
            )
            for category in show_categories
        )

        if similarity > threshold:
            suggestions[show_id] = {'title': show_info[0], 'similarity': similarity}

    ranked = sorted(suggestions.items(), key=lambda item: item[1]['similarity'], reverse=True)
    return ranked[:5]


# Debug output: prints the module-level `categories` list at the moment this
# top-level statement runs.
print("User categories:", categories)

def recommandation_algorithm():
    """Interactive entry point of the recommender.

    Asks for a user id between 1 and 100 (repeating until valid), loads the
    catalog and ratings CSV files, builds the category matrix (also written
    to "matrice_categories.xlsx"), prints the titles the user rated above 0
    and finally prints the suggestions returned by recommend_movies.
    """
    while True:
        user_id = input("Quel est ton user ? ")

        try:
            user_id = int(user_id)
        except ValueError as e:
            print(f"Veuillez entrer un identifiant d'utilisateur valide. Erreur: {e}")
            continue

        # NOTE(review): the 1..100 range is hard-coded; confirm it matches the
        # user columns of the ratings file.
        if 1 <= user_id <= 100:
            break
        print("L'identifiant de l'utilisateur doit être compris entre 1 et 100.")

    # The module-level paths are reused so the file locations are defined in
    # one place only (the local copies held the same values).
    catalog = read_movie_series_info(file_path_1)
    ratings = read_user_ratings(file_path_2)

    # recommend_movies rebuilds this list the same way to index the matrix.
    categories = list(set(category for _, movie_info in catalog.items() for category in movie_info[1]))
    output_file_path = "matrice_categories.xlsx"
    category_matrix = create_category_matrix(catalog, categories, output_file_path)

    print("Films déjà vus par l'utilisateur:")
    for show_id, user_rating in ratings.items():
        # Skip rated shows missing from the catalog instead of raising KeyError.
        if user_rating.get(user_id, 0) > 0 and show_id in catalog:
            print(f"- {catalog[show_id][0]}")

    recommended_movies = recommend_movies(user_id, catalog, ratings, category_matrix, threshold=0.5)

    print("\nTop 5 recommandations:")
    for show_id, info in recommended_movies:
        print(f"Title: {info['title']}, Similarity: {info['similarity']}")


# Menu creation
def action():
    """Print the menu, read one command and run the matching feature.

    Returns:
        False when the user typed "STOP"; None otherwise (including when the
        command is not recognised, in which case nothing is run).
    """
    menu_lines = (
        "Here are the different options available:",
        "1.  View the entire catalog",
        "2.  View all movies in the catalog",
        "3.  View all series",
        "4.  View all series, movies or both by year",
        "5.  View all series, movies or both by country",
        "6.  View all series, movies or both by type",
        "7.  View all series, movies or both by type sorted by duration",
        "8.  View series, movies or both directed by a specific director and sorted by year",
        "9.  View series, movies or both featuring a specific actor and sorted by year",
        "10. View how many series, movies or both and series directed by a director in a specific genre",
        "11. View how many series, movies or both an actor has played in",
        "12. Display the highest-rated series, movies or both",
        "13. Display the highest-rated series, movies or both for a specific year",
        "14. Display recent highest-rated series, movies or both",
        "15. Display movies and series based on parental control code",
        "16. Display the nationalities of directors and sort the list based on the number of movies and series directed",
        "17. Display basic statistics",
        "18. Get Personalized Recommendations",
        "STOP to stop",
    )
    for line in menu_lines:
        print(line)

    command = input("Enter the number of what you want to do: ")

    if command == "STOP":
        return False

    # Lambdas defer every name lookup until the chosen entry is actually run.
    handlers = {
        "1": lambda: catalog(data_1),
        "2": lambda: movies(data_1),
        "3": lambda: series(data_1),
        "4": lambda: by_year(data_1),
        "5": lambda: by_country(data_1),
        "6": lambda: genre(data_1),
        "7": lambda: duration(data_1),
        "8": lambda: director(data_1),
        "9": lambda: actor(data_1),
        "10": lambda: specific_genre_director(data_1),
        "11": lambda: specific_genre_actor(data_1),
        "12": lambda: most_rated(data_1, data_2),
        "13": lambda: most_rated_year(data_1, data_2),
        "14": lambda: most_rated_recent(data_1, data_2),
        "15": lambda: parental_code(data_1),
        "16": lambda: directors_nationality(data_1),
        "17": lambda: basic_statistics(data_1),
        "18": lambda: recommandation_algorithm(),
    }

    handler = handlers.get(command)
    if handler is not None:
        handler()  # the feature's own return value is discarded
    return None


# Reminder: always ask the user whether they want to save the list to a .csv file.


# Main loop: keep showing the menu until action() returns False.
# Every other return value is appended to `menu`, except a value equal to
# True, which resets the list.
menu = []

response = action()
while response is not False:
    if response == True:
        menu = []
    else:
        menu.append(response)
    response = action()