# Dataset locations used by the rest of the script.
# Google Colab equivalents, kept for reference:
#   file_path_1 = "/content/drive/MyDrive/Coding_project_2023/netflix_titles-2.csv"
#   file_path_2 = "/content/drive/MyDrive/Coding_project_2023/ratings.csv"
file_path_1 = "/Users/adrien/vscodeworkspace/coding-project/projet_en_groupe/data_cp_2023/netflix_titles-2.csv"
file_path_2 = "/Users/adrien/vscodeworkspace/coding-project/projet_en_groupe/data_cp_2023/ratings.csv"
# Show the catalog
def catalog(data_1):
    """Print the first 100 rows of the catalog and offer a CSV export.

    Args:
        data_1: Catalog table; must provide ``head`` (e.g. a pandas
            DataFrame).
    """
    preview = data_1.head(100)
    print(preview)
    # The export prompt receives the whole table, not only the preview.
    save_to_csv(data_1)
def movies(data_1):
    """Print every movie title in the catalog and offer a CSV export.

    Args:
        data_1: Catalog DataFrame with ``type`` and ``title`` columns.
    """
    films = data_1[data_1['type'] == 'Movie']  # keep only movies
    movie_titles = films['title'].tolist()
    print(movie_titles)
    # Export the title column as a DataFrame: the CSV helper writes its
    # argument with ``.to_csv``, which a plain list does not provide.
    save_to_csv(films[['title']])
# NOTE(review): the ``def`` line and the print/save steps were not present in
# this block; the name and those steps mirror the movie-listing block above
# and menu entry "3. View all series" — confirm.
def series(data_1):
    """Print every series title in the catalog and offer a CSV export.

    Args:
        data_1: Catalog DataFrame with ``type`` and ``title`` columns.
    """
    shows = data_1[data_1['type'] == 'TV Show']  # keep only series
    series_titles = shows['title'].tolist()
    print(series_titles)
    # Export the title column as a DataFrame so it can be written with
    # ``.to_csv``.
    save_to_csv(shows[['title']])
def by_year(data_1):
    """Show the chosen media types sorted by release year.

    Prompts for the media type and the sort direction, prints the sorted
    rows and offers a CSV export.

    Args:
        data_1: Catalog DataFrame with ``type`` and ``release_year`` columns.
    """
    filtered_data = filter_media_type(data_1)
    if filtered_data is None:
        return  # nothing to sort

    sort_type = input("Do you want to sort the years in ascending or descending order? (ascending/descending)")
    sort_type = sort_type.strip().lower()
    if sort_type not in ("ascending", "descending"):
        print("Invalid choice. The dataset could not be sorted!")
        return

    sorted_data = filtered_data.sort_values(
        by='release_year', ascending=(sort_type == "ascending")
    )
    print(sorted_data)
    save_to_csv(sorted_data)
def by_country(data_1):
    """Show the chosen media types produced in a user-supplied country.

    Prompts for the media type, prints the sorted list of countries found in
    the filtered rows, asks for a country, then prints the matching rows and
    offers a CSV export.

    Args:
        data_1: Catalog DataFrame with ``type`` and ``country`` columns.
    """
    filtered_data = filter_media_type(data_1)

    # A row's ``country`` value may list several countries separated by ", ".
    country_list = sorted({
        country
        for countries in filtered_data['country'].dropna().str.split(', ')
        for country in countries
        if country != ''
    })
    print("List of all available countries:")
    print(country_list)

    country_input = input("Enter the name of the country to display movies and/or series: ").strip().capitalize()
    # regex=False: the typed text is matched literally, so characters such as
    # "(" or "." in the input cannot break or widen the search.
    country_data = filtered_data[
        filtered_data['country'].str.lower().str.contains(country_input.lower(), na=False, regex=False)
    ]
    if not country_data.empty:
        print(country_data)
        save_to_csv(country_data)
    else:
        print(f"No movies or series found for the country {country_input}.")
def genre(data_1):
    """Show the chosen media types whose genres contain a user-supplied term.

    Prompts for the media type, prints the sorted list of genres found in
    ``data_1``, asks for a genre, then prints the matching rows and offers a
    CSV export.

    Args:
        data_1: Catalog DataFrame with ``type`` and ``listed_in`` columns.
    """
    filtered_data = filter_media_type(data_1)

    # The genre list is built from the whole catalog, not the filtered rows.
    genre_list = sorted({
        name
        for names in data_1['listed_in'].dropna().str.split(', ')
        for name in names
        if name != ''
    })
    print("List of all possible genres:")
    print(genre_list)

    type_input = input("Enter the type (romantic, action, drama, etc.) to display movies and/or series: ").strip().capitalize()
    # regex=False: match the typed text literally (genre names may contain
    # characters that are special in regular expressions).
    type_data = filtered_data[
        filtered_data['listed_in'].str.lower().str.contains(type_input.lower(), na=False, regex=False)
    ]
    if not type_data.empty:
        print(type_data)
        save_to_csv(type_data)
    else:
        print(f"No movies or series found for the type {type_input}.")
def duration(data_1):
    """Show the chosen media types of one genre, sorted by duration.

    Prompts for the media type, prints the sorted list of genres found in
    ``data_1``, asks for a genre, then prints the matching rows ordered by
    ascending duration and offers a CSV export.

    Args:
        data_1: Catalog DataFrame with ``type``, ``listed_in`` and
            ``duration`` columns.
    """
    filtered_data = filter_media_type(data_1)

    genre_list = sorted({
        name
        for names in data_1['listed_in'].dropna().str.split(', ')
        for name in names
        if name != ''
    })
    print("List of all possible genres:")
    print(genre_list)

    type_input = input("Enter the type (romantic, action, drama, etc.) to display movies and/or series: ").strip().capitalize()
    type_data = filtered_data[
        filtered_data['listed_in'].str.lower().str.contains(type_input.lower(), na=False, regex=False)
    ]
    if not type_data.empty:
        # Sort on the first number found in each duration value so that text
        # values order numerically instead of alphabetically; values without
        # a number sort last.
        # NOTE(review): assumes durations look like "90 min" / "2 Seasons" —
        # confirm against the dataset.
        type_data_sorted = type_data.sort_values(
            by='duration',
            ascending=True,
            key=lambda column: pd.to_numeric(
                column.astype(str).str.extract(r'(\d+)', expand=False),
                errors='coerce',
            ),
        )
        print(type_data_sorted)
        save_to_csv(type_data_sorted)
    else:
        print(f"No movies or series found for the type {type_input}.")
def director(data_1):
    """Show the chosen media types by one director, oldest release first.

    Prompts for the media type and a director name, prints the matching rows
    sorted by ascending ``release_year`` and offers a CSV export.

    Args:
        data_1: Catalog DataFrame with ``type``, ``director`` and
            ``release_year`` columns.
    """
    filtered_data = filter_media_type(data_1)

    director_input = input("Enter the name of the director to display movies and/or series: ").strip()
    # Case-insensitive literal substring match; rows without a director
    # never match (na=False).
    director_data = filtered_data[
        filtered_data['director'].str.lower().str.contains(director_input.lower(), na=False, regex=False)
    ]
    if not director_data.empty:
        director_data_sorted = director_data.sort_values(by='release_year', ascending=True)
        # Print and export the sorted frame (not the unsorted selection).
        print(director_data_sorted)
        save_to_csv(director_data_sorted)
    else:
        print(f"No person found with the name {director_input}.")
def actor(data_1):
    """Show the chosen media types featuring one actor, oldest release first.

    Prompts for the media type and an actor name, prints the matching rows
    sorted by ascending ``release_year`` and offers a CSV export.

    Args:
        data_1: Catalog DataFrame with ``type``, ``cast`` and
            ``release_year`` columns.
    """
    filtered_data = filter_media_type(data_1)

    actor_input = input("Enter the name of the actor to display movies and/or series: ").strip()
    # Case-insensitive literal substring match; rows without a cast never
    # match (na=False).
    actor_data = filtered_data[
        filtered_data['cast'].str.lower().str.contains(actor_input.lower(), na=False, regex=False)
    ]
    if not actor_data.empty:
        actor_data_sorted = actor_data.sort_values(by='release_year', ascending=True)
        print(actor_data_sorted)
        save_to_csv(actor_data_sorted)
    else:
        print(f"No actor found with the name {actor_input}.")
def specific_genre_director(data_1):
    """Count and show a director's titles of one genre.

    Prompts for the media type, lists the distinct ``director`` and
    ``listed_in`` values of the filtered rows, asks for a director and a
    genre, then prints how many rows match both, prints those rows and
    offers a CSV export.

    Args:
        data_1: Catalog DataFrame with ``type``, ``director`` and
            ``listed_in`` columns.
    """
    filtered_data = filter_media_type(data_1)

    # Missing values are dropped before joining: ``str.join`` only accepts
    # strings.
    unique_directors = filtered_data['director'].dropna().unique()
    print("List of all available directors:")
    print(', '.join(map(str, unique_directors)))
    director_input = input("Enter the name of the director to display movies and/or series: ").strip()

    unique_types = filtered_data['listed_in'].dropna().unique()
    print("\nList of all available types:")
    print(', '.join(map(str, unique_types)))
    type_input = input("Enter the type (romantic, action, drama, etc.): ").strip().capitalize()

    # Both conditions are case-insensitive literal substring matches.
    director_matches = filtered_data['director'].str.lower().str.contains(
        director_input.lower(), na=False, regex=False
    )
    type_matches = filtered_data['listed_in'].str.lower().str.contains(
        type_input.lower(), na=False, regex=False
    )
    director_type_data = filtered_data[director_matches & type_matches]

    if not director_type_data.empty:
        # Display the count
        count = len(director_type_data)
        print(f"The director {director_input} has directed {count} movie(s) or series of type {type_input}.")
        print(director_type_data)
        save_to_csv(director_type_data)
    else:
        print(f"No movies or series found for the director {director_input} and type {type_input}.")
def specific_genre_actor(data_1):
    """Count and show an actor's titles of one genre.

    Prompts for the media type, lists the distinct ``cast`` and
    ``listed_in`` values of the filtered rows, asks for an actor and a
    genre, then prints how many rows match both, prints those rows and
    offers a CSV export.

    Args:
        data_1: Catalog DataFrame with ``type``, ``cast`` and ``listed_in``
            columns.
    """
    filtered_data = filter_media_type(data_1)

    # Missing values are dropped before joining: ``str.join`` only accepts
    # strings.
    unique_actors = filtered_data['cast'].dropna().unique()
    print("List of all available actors:")
    print(', '.join(map(str, unique_actors)))
    actor_input = input("Enter the name of the actor to display movies and/or series: ").strip()

    unique_types = filtered_data['listed_in'].dropna().unique()
    print("\nList of all available types:")
    print(', '.join(map(str, unique_types)))
    type_input = input("Enter the type (romantic, action, drama, etc.): ").strip().capitalize()

    # Both conditions are case-insensitive literal substring matches.
    actor_matches = filtered_data['cast'].str.lower().str.contains(
        actor_input.lower(), na=False, regex=False
    )
    type_matches = filtered_data['listed_in'].str.lower().str.contains(
        type_input.lower(), na=False, regex=False
    )
    actor_type_data = filtered_data[actor_matches & type_matches]

    if not actor_type_data.empty:
        # Display the count
        count = len(actor_type_data)
        print(f"The actor {actor_input} has acted in {count} movie(s) or series of type {type_input}.")
        print(actor_type_data)
        save_to_csv(actor_type_data)
    else:
        print(f"No movies or series found for the actor {actor_input} and type {type_input}.")
# Module-level setup (deliberately not inside a function): adds the
# 'appreciation (%)' column to data_2, which the rating views sort on.
notes = data_2.drop(columns=['show_id'])  # every column except the show id
mean_type = 100 * notes.mean(axis=1)      # row-wise mean, scaled by 100
data_2['appreciation (%)'] = mean_type
# NOTE(review): the ``def`` line and the ``filtered_data`` assignment were not
# present in this block; the name and signature follow the sibling functions
# most_rated_year / most_rated_recent — confirm against the menu dispatch.
def most_rated(data_1, data_2):
    """Show the chosen media types ordered by descending appreciation.

    Args:
        data_1: Catalog DataFrame with ``show_id``, ``title`` and ``type``
            columns.
        data_2: Ratings DataFrame with ``show_id`` and ``appreciation (%)``
            columns.
    """
    filtered_data = filter_media_type(data_1)
    # Join catalog rows with their ratings on the shared show id.
    link_between = pd.merge(filtered_data, data_2, on='show_id')
    link_between_sorted = link_between.sort_values(by='appreciation (%)', ascending=False)
    print("Films et séries les mieux notés :")
    print(link_between_sorted[['show_id', 'title', 'type', 'appreciation (%)']])
    save_to_csv(link_between_sorted)
def most_rated_year(data_1, data_2):
    """Show the best-rated titles released in a user-supplied year.

    Prints the release years present in ``data_1``, asks for a year and a
    media type, then prints the matching titles ordered by descending
    appreciation and offers a CSV export.

    Args:
        data_1: Catalog DataFrame with ``show_id``, ``title``, ``type`` and
            ``release_year`` columns.
        data_2: Ratings DataFrame with ``show_id`` and ``appreciation (%)``
            columns.
    """
    # Display all available unique release years
    available_years = data_1['release_year'].unique()
    print("Available years: ", available_years)

    # input() returns text; convert it so it can equal a numeric year.
    year_text = input("Enter a release year: ").strip()
    try:
        year = int(year_text)
    except ValueError:
        print(f"'{year_text}' is not a valid year.")
        return

    # Compare numerically whatever the column dtype is (int, float or text).
    release_years = pd.to_numeric(data_1['release_year'], errors='coerce')
    filtered_data = filter_media_type(data_1[release_years == year])

    # Merge the DataFrames on the 'show_id' key
    link_between = pd.merge(filtered_data, data_2, on='show_id')
    if link_between.empty:
        print(f"No rated shows found for the year {year}.")
        return

    # Sort by the 'appreciation (%)' column, best first
    link_between_sorted = link_between.sort_values(by='appreciation (%)', ascending=False)
    print(f"Top-rated shows for the year {year}:")
    print(link_between_sorted[['show_id', 'title', 'type', 'release_year', 'appreciation (%)']])
    save_to_csv(link_between_sorted)
def most_rated_recent(data_1, data_2):
    """Show the 20 most recent titles, best-rated first within each year.

    Args:
        data_1: Catalog DataFrame with ``show_id``, ``title``, ``type`` and
            ``release_year`` columns.
        data_2: Ratings DataFrame with ``show_id`` and ``appreciation (%)``
            columns.
    """
    shown_columns = ['show_id', 'title', 'type', 'release_year', 'appreciation (%)']

    # Join on the show id, then order by newest year and, within a year, by
    # highest appreciation.
    ranked = data_1.merge(data_2, on='show_id').sort_values(
        by=['release_year', 'appreciation (%)'],
        ascending=[False, False],
    )
    top_20_data = ranked.head(20)

    print("Top 20 most rated and recent shows:")
    print(top_20_data[shown_columns])
    save_to_csv(top_20_data)
    return
# Example usage
def parental_code(data_1):
    """Print the distinct parental codes found in the ``rating`` column.

    Codes are listed in order of first appearance; missing values and empty
    strings are ignored.

    Args:
        data_1: Catalog DataFrame with a ``rating`` column.
    """
    first_seen = {}  # dict keys keep insertion order and stay unique
    for entry in data_1['rating'].dropna().str.split(', '):
        for code in entry:
            if code != '':
                first_seen.setdefault(code, None)
    code_list = list(first_seen)

    print("Here are the parental codes: ")
    print(code_list)
    # Prompting for a parental control code (e.g. PG-13, TV-MA) is currently
    # disabled.
def directors_nationality(data_1):
    """List directors with their countries, most prolific first.

    For every row that names a director, each director in the ", "-separated
    ``director`` value is credited with one title and with the row's
    ``country`` value. The result is printed and offered for CSV export.

    Args:
        data_1: Catalog DataFrame with ``director`` and ``country`` columns.
    """
    # Check that the required columns exist in the dataset
    if 'director' not in data_1.columns:
        print("The dataset does not contain a 'director' column.")
        return
    if 'country' not in data_1.columns:
        print("The dataset does not contain a 'country' column.")
        return

    # Rows without a director cannot be credited to anyone; skipping them
    # also avoids calling ``.split`` on a missing value.
    credited = data_1.dropna(subset=['director'])

    directors_nationality_dict = {}
    for director_field, nationality in zip(credited['director'], credited['country']):
        for director in director_field.split(', '):
            info = directors_nationality_dict.setdefault(
                director, {'nationalities': set(), 'count': 0}
            )
            # Missing countries are not strings and must not reach the join.
            if isinstance(nationality, str):
                info['nationalities'].add(nationality)
            info['count'] += 1

    # Sort the directors by the number of movies and series produced
    sorted_directors = sorted(
        directors_nationality_dict.items(),
        key=lambda item: item[1]['count'],
        reverse=True,
    )

    # Display the list of directors and their nationalities
    print("Directors and their nationalities, sorted by the number of movies and series produced:")
    for director, info in sorted_directors:
        print(f"{director}: {', '.join(sorted(info['nationalities']))} - {info['count']} movies/series")

    # The CSV helper writes with ``.to_csv``, so export a table rather than
    # the list of tuples.
    export = pd.DataFrame(
        [
            {
                'director': director,
                'nationalities': ', '.join(sorted(info['nationalities'])),
                'count': info['count'],
            }
            for director, info in sorted_directors
        ],
        columns=['director', 'nationalities', 'count'],
    )
    save_to_csv(export)
# Allow to filter if we want movie, tv show or both
def filter_media_type(data):
    """Ask which media type to keep and return the matching rows.

    The answer is compared case-insensitively and ignoring surrounding
    whitespace.

    Args:
        data: Catalog DataFrame with a ``type`` column.

    Returns:
        Only the rows whose ``type`` equals the answer for "movie" or
        "tv show"; ``data`` unchanged for "both" or for an invalid answer
        (after printing a notice).
    """
    media_type = input("What type of media do you want to display? (Movie/TV Show/Both): ").strip().lower()
    if media_type in ('movie', 'tv show'):
        return data[data['type'].str.lower() == media_type]
    if media_type != 'both':
        print("Invalid choice. Displaying all types of media.")
    return data
# Example usage
def basic_statistics(data_1):
    """Print movie/series counts and a per-country production ranking.

    Args:
        data_1: Catalog DataFrame; needs ``type`` and ``country`` columns,
            otherwise a notice is printed and nothing else happens.
    """
    # Both columns are required for the statistics below.
    missing = [name for name in ('type', 'country') if name not in data_1.columns]
    if missing:
        print("The dataset does not contain the necessary columns.")
        return

    # Count the number of movies and series
    media_types = data_1['type']
    movies_count = int((media_types == 'Movie').sum())
    series_count = int((media_types == 'TV Show').sum())
    print(f"Number of movies in the catalog: {movies_count}")
    print(f"Number of series in the catalog: {series_count}")

    # Compare the number of movies and series
    if movies_count == series_count:
        print("The catalog has an equal number of movies and series.")
    elif movies_count > series_count:
        print("There are more movies than series in the catalog.")
    else:
        print("There are more series than movies in the catalog.")

    # One entry per country per title, counted from most to least frequent.
    per_title_countries = data_1['country'].str.split(', ')
    country_counts = per_title_countries.explode().value_counts()
    print("\nCountries that produced movies/series, sorted from most to least productive:")
    print(country_counts)
    # Be careful: the user should be asked each time whether to save the
    # list to a .csv
def _open_with_default_app(path):
    """Open ``path`` with the operating system's default application."""
    import subprocess
    import sys

    try:
        if sys.platform.startswith('win'):
            os.startfile(path)
        elif sys.platform == 'darwin':
            subprocess.run(['open', path], check=False)
        else:
            subprocess.run(['xdg-open', path], check=False)
    except OSError:
        print(f"Could not open {path}.")


def save_to_csv(data, default_filename='output.csv'):
    """Interactively offer to write ``data`` to a CSV file.

    Asks whether to save, which file name to use (Enter selects
    ``default_filename``), whether to overwrite an existing file (otherwise
    a new name is requested) and finally whether to open the saved file.

    Args:
        data: A pandas DataFrame/Series, or any object accepted by the
            ``pandas.DataFrame`` constructor (for example a list).
        default_filename: File name used when the user just presses Enter.
    """
    # Ask if the user wants to save to a CSV file
    save_choice = input("Do you want to save the data to a CSV file? (YES/NO): ").strip().upper()
    if save_choice != 'YES':
        print("Data not saved.")
        return

    # Plain containers have no ``to_csv``; wrap them so they can be written.
    if not hasattr(data, 'to_csv'):
        import pandas as pd

        data = pd.DataFrame(data)

    # Pick the default BEFORE appending the extension: once ".csv" has been
    # added the name can no longer be empty.
    file_name = input("Enter the file name (DO NOT include .csv extension, or press Enter for the default): ").strip()
    file_name = file_name + ".csv" if file_name else default_filename

    # Check if the file already exists
    if os.path.exists(file_name):
        overwrite_choice = input(f"The file '{file_name}' already exists. Do you want to overwrite it? (YES/NO): ").strip().upper()
        if overwrite_choice != 'YES':
            # Prompt for a new file name; everything below uses this name.
            new_filename = input("Enter a new file name (DO NOT include .csv extension): ").strip()
            file_name = new_filename + ".csv"

    data.to_csv(file_name, index=False)
    print(f"Data saved to {file_name}")

    # Ask if the user wants to open the file
    open_choice = input("Do you want to open the saved file? (YES/NO): ").strip().upper()
    if open_choice == 'YES':
        # The file name is never handed to a shell as a command.
        _open_with_default_app(file_name)
# Start of the recommendation algorithm
# Load the CSV file
def read_movie_series_info(file_path):
    """Read the titles CSV into a ``{show_id: [title, genres]}`` mapping.

    The first row is treated as a header and skipped. Columns are read by
    position: show id first, title third, ``listed_in`` eleventh.

    Args:
        file_path: Path to the UTF-8 encoded titles CSV.

    Returns:
        dict mapping each show id to ``[title, list_of_genres]``, where the
        genres come from splitting ``listed_in`` on ", ".
    """
    # Local result dict: the name must not resolve to a module-level object.
    catalog = {}
    with open(file_path, 'r', encoding='utf-8', newline='') as info_file:
        info_reader = csv.reader(info_file)
        next(info_reader, None)  # Skip header row (tolerates an empty file)
        for row in info_reader:
            if not row:
                continue  # blank line
            show_id, title, listed_in = row[0], row[2], row[10]
            catalog[show_id] = [title, listed_in.split(', ')]
    return catalog
def read_user_ratings(file_path):
    """Read the ratings CSV into ``{show_id: {user_id: rating}}``.

    The header row holds a label for the show-id column followed by the user
    ids; every later row holds a show id followed by one integer rating per
    user.

    Args:
        file_path: Path to the UTF-8 encoded ratings CSV.

    Returns:
        dict mapping each show id to a dict of ``int`` user id ->
        ``int`` rating. Empty when the file has no header row.

    Raises:
        ValueError: If a user id or a rating is not an integer.
    """
    ratings = {}
    with open(file_path, 'r', encoding='utf-8', newline='') as ratings_file:
        ratings_reader = csv.reader(ratings_file)
        header = next(ratings_reader, None)  # header row: label + user ids
        if header is None:
            return ratings
        user_ids = list(map(int, header[1:]))
        for row in ratings_reader:
            if not row:
                continue  # blank line
            show_id = row[0]
            user_ratings = list(map(int, row[1:]))
            ratings[show_id] = dict(zip(user_ids, user_ratings))
    return ratings
def create_category_matrix(catalog, categories):
    """Build a labelled co-occurrence matrix of categories.

    Args:
        catalog: Mapping of show id -> ``[title, list_of_categories]``.
        categories: Sequence of category names; fixes the row/column order.

    Returns:
        A ``(len(categories) + 1)`` square list of lists. Row 0 and column 0
        hold the category names (cell ``[0][0]`` stays 0); cell
        ``[i + 1][j + 1]`` counts the shows listed in both ``categories[i]``
        and ``categories[j]``.
    """
    # One extra row and column hold the category names.
    size = len(categories) + 1
    category_matrix = [[0] * size for _ in range(size)]
    for position, name in enumerate(categories, start=1):
        category_matrix[0][position] = name  # header row
        category_matrix[position][0] = name  # header column

    # For each show, find the matrix positions of its categories once, then
    # increment every pair of them (including each category with itself).
    for movie_info in catalog.values():
        show_categories = movie_info[1]
        present = [
            position
            for position, name in enumerate(categories, start=1)
            if name in show_categories
        ]
        for row in present:
            for column in present:
                category_matrix[row][column] += 1

    return category_matrix
def recommend_movies(user_id, catalog, user_ratings, category_matrix, threshold=0.5):
    """Suggest up to five unseen shows that share categories with rated ones.

    A show is a candidate when the user's rating for it is exactly 0. Its
    score is computed from ``category_matrix`` over the categories it shares
    with the shows the user rated above 0.

    Args:
        user_id: User identifier (anything ``int()`` accepts).
        catalog: Mapping of show id -> ``[title, list_of_categories]``.
        user_ratings: Mapping of show id -> ``{user_id: rating}``.
        category_matrix: Labelled co-occurrence matrix whose first row holds
            the category names from column 1 onwards.
        threshold: A candidate is kept only if its score is strictly greater.

    Returns:
        A list of at most five ``(show_id, {'title': ..., 'similarity': ...})``
        tuples, highest similarity first.
    """
    # Kept for backward compatibility: the module-level ``categories`` is
    # refreshed with the list of category names.
    global categories

    user_id = int(user_id)  # accept the id as text or number

    # Read the row/column positions from the matrix header itself so they
    # always match the layout of the matrix that was passed in.
    category_index = {
        name: position
        for position, name in enumerate(category_matrix[0])
        if position > 0
    }
    categories = list(category_index)

    # Categories of every show this user rated above 0.
    user_categories = set()
    for show_id, per_user in user_ratings.items():
        if per_user.get(user_id, 0) > 0 and show_id in catalog:
            user_categories.update(catalog[show_id][1])

    suggestions = {}
    for show_id, movie_info in catalog.items():
        # Only shows the user explicitly rated 0 are candidates.
        if user_ratings.get(show_id, {}).get(user_id) != 0:
            continue

        show_categories = [name for name in movie_info[1] if name in category_index]
        # Categories shared between this show and the user's rated shows.
        common_categories = [name for name in show_categories if name in user_categories]
        if not common_categories:
            continue

        # For each category of the show, take its weakest co-occurrence with
        # the shared categories, then add those values up.
        similarity = sum(
            min(
                category_matrix[category_index[name]][category_index[shared]]
                for shared in common_categories
            )
            for name in show_categories
        )
        # Only recommend shows whose similarity exceeds the threshold.
        if similarity > threshold:
            suggestions[show_id] = {'title': movie_info[0], 'similarity': similarity}

    # Sort suggestions by decreasing similarity.
    sorted_suggestions = sorted(
        suggestions.items(), key=lambda item: item[1]['similarity'], reverse=True
    )
    return sorted_suggestions[:5]
def recommandation_algorithm() :
    """Ask for a user id, list the shows they rated and print suggestions.

    Reads the catalog from ``file_path_1`` and the ratings from
    ``file_path_2`` (module-level paths). Prints a notice and returns
    without doing anything else when the id is not an integer.
    """
    user_id = input("quel est ton user ? ")
    try:
        user_id = int(user_id)
    except ValueError:
        print("Veuillez entrer un identifiant d'utilisateur valide.")
        # Return to the caller instead of terminating the interpreter.
        return

    # Read data from CSV files
    show_catalog = read_movie_series_info(file_path_1)
    ratings = read_user_ratings(file_path_2)

    # Create category matrix (sorted so the layout is the same on every run)
    categories = sorted({
        category
        for movie_info in show_catalog.values()
        for category in movie_info[1]
    })
    category_matrix = create_category_matrix(show_catalog, categories)

    # Display movies already viewed by the user (rating above 0); rated
    # shows that are absent from the catalog are skipped.
    print("Films déjà vus par l'utilisateur:")
    for show_id, user_rating in ratings.items():
        if user_rating.get(user_id, 0) > 0 and show_id in show_catalog:
            print(f"- {show_catalog[show_id][0]}")

    # Recommend movies
    recommended_movies = recommend_movies(user_id, show_catalog, ratings, category_matrix, threshold=0.5)

    # Display top 5 recommendations
    print("\nTop 5 recommandations:")
    for _show_id, info in recommended_movies:
        print(f"Title: {info['title']}, Similarity: {info['similarity']}")
def action() :
    """Print the menu, run the option the user picks, and report whether to stop.

    Reads the module-level ``data_1`` (and ``data_2`` for the rating views).

    Returns:
        ``True`` when the user enters STOP, ``False`` otherwise.
    """
    menu_lines = (
        "Here are the different options available:",
        "1. View the entire catalog",
        "2. View all movies in the catalog",
        "3. View all series",
        "4. View all series, movies or both by year",
        "5. View all series, movies or both by country",
        "6. View all series, movies or both by type",
        "7. View all series, movies or both by type sorted by duration",
        "8. View series, movies or both directed by a specific director and sorted by year",
        "9. View series, movies or both featuring a specific actor and sorted by year",
        "10. View how many series, movies or both and series directed by a director in a specific genre",
        "11. View how many series, movies or both an actor has played in",
        "12. Display the highest-rated series, movies or both",
        "13. Display the highest-rated series, movies or both for a specific year",
        "14. Display recent highest-rated series, movies or both",
        "15. Display movies and series based on parental control code",
        "16. Display the nationalities of directors and sort the list based on the number of movies and series directed",
        "17. Display basic statistics",
        "... Enter STOP to stop",
    )
    for line in menu_lines:
        print(line)

    command = input("Enter the number of what you want to do: ").strip()
    if command.upper() == "STOP":
        return True

    # Lambdas defer the name lookup until an option is actually chosen.
    # NOTE(review): the calls for options 3, 12, 13 and 14 were not present in
    # this block; series / most_rated are inferred names — confirm.
    options = {
        "1": lambda: catalog(data_1),
        "2": lambda: movies(data_1),
        "3": lambda: series(data_1),
        "4": lambda: by_year(data_1),
        "5": lambda: by_country(data_1),
        "6": lambda: genre(data_1),
        "7": lambda: duration(data_1),
        "8": lambda: director(data_1),
        "9": lambda: actor(data_1),
        "10": lambda: specific_genre_director(data_1),
        "11": lambda: specific_genre_actor(data_1),
        "12": lambda: most_rated(data_1, data_2),
        "13": lambda: most_rated_year(data_1, data_2),
        "14": lambda: most_rated_recent(data_1, data_2),
        "15": lambda: parental_code(data_1),
        "16": lambda: directors_nationality(data_1),
        "17": lambda: basic_statistics(data_1),
    }
    handler = options.get(command)
    if handler is None:
        print("Invalid choice. Please enter a number from 1 to 17 or STOP.")
    else:
        handler()
    return False
# Be careful: the user should be asked each time whether to save the list to a .csv
# NOTE(review): the loop header and the statement that assigns `response`
# (presumably the result of action()) are not visible here — confirm.
if response == True:
menu = []  # presumably emptying the list ends the surrounding loop — TODO confirm
else:
menu.append(response)  # any other response is recorded in `menu`