Commit a442a7bf authored by Adrien Payen

update Analytics

parent 7dd59e1b
%% Cell type:code id: tags:
``` python
# Reload modules automatically before entering the execution of code
%load_ext autoreload
%autoreload 2
# Third-party imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
# Constants and functions
from constants import Constant as C
from loaders import load_ratings
from loaders import load_items
from tabulate import tabulate
# Call the load_items() function and create a variable df_movies
df_movies = load_items()
# Display the DataFrame
print("Display The Movies : ")
display(df_movies)
# Call the load_ratings() function and create a variable df_ratings
df_ratings = load_ratings()
# Display the DataFrame
print("Display The Ratings : ")
display(df_ratings)
```
%% Output
The autoreload extension is already loaded. To reload it, use:
%reload_ext autoreload
Display The Movies : 
Display The Ratings : 
%% Cell type:code id: tags:
``` python
# NUMBER OF MOVIES
n_movies = df_movies['title'].nunique()
print(f"Number of movies: {n_movies}")
```
%% Output
Number of movies: 912
%% Cell type:code id: tags:
``` python
# THE YEAR RANGE
df_movies['annee'] = df_movies['title'].str.extract(r'\((.{4})\)')
df_movies['annee'] = pd.to_numeric(df_movies['annee'], errors='coerce')
min_range = int(df_movies['annee'].min())
max_range = int(df_movies['annee'].max())
print("Minimum range:", min_range)
print("Maximum range:", max_range)
```
%% Output
Minimum range: 1921
Maximum range: 2016
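%% Cell type:markdown id: tags:
The pattern `\((.{4})\)` captures any four characters between parentheses and relies on `errors='coerce'` to discard non-numeric captures. A slightly stricter sketch, assuming release years are always written as four digits:
%% Cell type:code id: tags:
``` python
# Stricter year extraction (sketch): require exactly four digits inside the parentheses.
# errors='coerce' is kept so titles without a year still become NaN.
df_movies['annee'] = pd.to_numeric(
    df_movies['title'].str.extract(r'\((\d{4})\)', expand=False),
    errors='coerce'
)
```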
%% Cell type:code id: tags:
``` python
# LIST OF MOVIE GENRES
def tabulate_genres(df_movies):
    """Tabulate list of movie genres."""
    # Split genres and explode
    df_movies['genres'] = df_movies['genres'].str.split('|')
    df_movies = df_movies.explode('genres')
    unique_genres = sorted(df_movies['genres'].unique())
    # Tabulate
    print("\nList of all genres:")
    genres_table = [[genre, "|"] for genre in unique_genres]
    print(tabulate(genres_table, tablefmt="plain", numalign="left"))
# Call the tabulate_genres function
tabulate_genres(df_movies)
```
%% Output
List of all genres:
(no genres listed) |
Action |
Adventure |
Animation |
Children |
Comedy |
Crime |
Documentary |
Drama |
Fantasy |
Film-Noir |
Horror |
IMAX |
Musical |
Mystery |
Romance |
Sci-Fi |
Thriller |
War |
Western |
%% Cell type:code id: tags:
``` python
# THE TOTAL NUMBER OF RATINGS
n_ratings = df_ratings['rating'].count()
print(f"Number of ratings: {n_ratings}")
```
%% Output
Number of ratings: 5296
%% Cell type:code id: tags:
``` python
# THE NUMBER OF UNIQUE USERS
n_users = df_ratings['userId'].nunique()
print(f"Number of users: {n_users}")
```
%% Output
Number of users: 107
%% Cell type:code id: tags:
``` python
# THE NUMBER OF UNIQUE MOVIES (IN THE RATING MATRIX)
unique_movies = df_ratings["movieId"].unique()
num_unique_movies = len(unique_movies)
print(f"Number of unique movies : {num_unique_movies}")
```
%% Output
Number of unique movies : 834
%% Cell type:code id: tags:
``` python
# THE NUMBER OF RATINGS OF THE MOST RATED MOVIES
def most_rated_movies_ratings_count(df_ratings):
    movie_ratings_count = df_ratings.groupby('movieId')['rating'].count()
    most_rated_movies = movie_ratings_count[movie_ratings_count == movie_ratings_count.max()]
    print(f"Number of ratings of the most rated movie(s): {most_rated_movies.max()}")
most_rated_movies_ratings_count(df_ratings)
```
%% Output
Number of ratings of the most rated movie(s): 75
%% Cell type:code id: tags:
``` python
# THE NUMBER OF RATINGS OF THE LEAST RATED MOVIES
def least_rated_movies_ratings_count(df_ratings):
    movie_ratings_count = df_ratings.groupby('movieId')['rating'].count()
    least_rated_movies = movie_ratings_count[movie_ratings_count == movie_ratings_count.min()]
    print("Number of ratings of the least rated movie(s):", least_rated_movies.min())
least_rated_movies_ratings_count(df_ratings)
```
%% Output
Number of ratings of the least rated movie(s): 1
%% Cell type:code id: tags:
``` python
# ALL THE POSSIBLE RATING VALUES, FROM THE SMALLEST TO THE HIGHEST
def all_possible_ratings(df_ratings):
    rating_values = sorted(df_ratings['rating'].unique())
    print("All possible rating values, from smallest to highest:")
    for rating in rating_values:
        print(rating)
all_possible_ratings(df_ratings)
```
%% Output
All possible rating values, from smallest to highest:
0.5
1.0
1.5
2.0
2.5
3.0
3.5
4.0
4.5
5.0
%% Cell type:code id: tags:
``` python
# THE NUMBER OF MOVIES THAT WERE NOT RATED AT ALL
def unrated_movies_count(df_ratings, df_movies):
    rated_movies = df_ratings['movieId'].unique() if 'movieId' in df_ratings.columns else []
    unrated_movies_count = df_movies[~df_movies.index.isin(rated_movies)].shape[0]
    print("Number of movies that were not rated at all:", unrated_movies_count)
unrated_movies_count(df_ratings, df_movies)
```
%% Output
Number of movies that were not rated at all: 78
%% Cell type:markdown id: tags:
LONG-TAIL PROPERTY
%% Cell type:code id: tags:
``` python
# Rating Frequency Distribution
merged_df = pd.merge(df_ratings, df_movies, on='movieId')
rating_counts = merged_df['movieId'].value_counts()
value_counts = rating_counts.value_counts().sort_index()
plt.figure(figsize=(20, 6))
plt.plot(value_counts.values, value_counts.index, marker='o', color='skyblue', linestyle='-')  # number of movies on x, rating count on y
plt.title('Rating Frequency Distribution')
plt.xlabel('Number of Movies')
plt.ylabel('Number of Ratings')
plt.xticks(rotation=45)
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
```
%% Output
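%% Cell type:markdown id: tags:
The plot above illustrates the long-tail property: a small head of popular movies concentrates a large share of the ratings, while most movies receive only a few. A minimal sketch of one way to quantify this with the data already loaded (the 20% cut-off is an illustrative choice, not part of the original analysis):
%% Cell type:code id: tags:
``` python
# Share of all ratings captured by the most rated 20% of movies (illustrative threshold).
counts = df_ratings['movieId'].value_counts()           # ratings per movie, most rated first
top_k = max(1, int(0.2 * len(counts)))                  # size of the "head"
head_share = counts.iloc[:top_k].sum() / counts.sum()   # fraction of ratings falling in the head
print(f"The most rated 20% of movies receive {head_share:.1%} of all ratings")
```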
%% Cell type:code id: tags:
``` python
M = df_ratings['userId'].nunique()
N = df_ratings['movieId'].nunique()
user_mapper = dict(zip(np.unique(df_ratings["userId"]), list(range(M))))
movie_mapper = dict(zip(np.unique(df_ratings["movieId"]), list(range(N))))
user_inv_mapper = dict(zip(list(range(M)), np.unique(df_ratings["userId"])))
movie_inv_mapper = dict(zip(list(range(N)), np.unique(df_ratings["movieId"])))
user_index = [user_mapper[i] for i in df_ratings['userId']]
item_index = [movie_mapper[i] for i in df_ratings['movieId']]
X = csr_matrix((df_ratings["rating"], (user_index,item_index)), shape=(M,N))
```
%% Cell type:code id: tags:
``` python
def create_X(df):
    """
    Generates a sparse matrix from ratings dataframe.
    Args:
        df: pandas dataframe containing 3 columns (userId, movieId, rating)
    Returns:
        X: sparse matrix
        user_mapper: dict that maps user id's to user indices
        user_inv_mapper: dict that maps user indices to user id's
        movie_mapper: dict that maps movie id's to movie indices
        movie_inv_mapper: dict that maps movie indices to movie id's
    """
    M = df['userId'].nunique()
    N = df['movieId'].nunique()
    user_mapper = dict(zip(np.unique(df["userId"]), list(range(M))))
    movie_mapper = dict(zip(np.unique(df["movieId"]), list(range(N))))
    user_inv_mapper = dict(zip(list(range(M)), np.unique(df["userId"])))
    movie_inv_mapper = dict(zip(list(range(N)), np.unique(df["movieId"])))
    user_index = [user_mapper[i] for i in df['userId']]
    item_index = [movie_mapper[i] for i in df['movieId']]
    X = csr_matrix((df["rating"], (user_index,item_index)), shape=(M,N))
    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper
# Assuming df_ratings contains your ratings dataframe
X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_X(df_ratings)
# Extract the 100 first users and 100 first items
X_sub = X[:100, :100]
# Plot the non-zero values of the sparse matrix
plt.figure(figsize=(8, 6))
plt.spy(X_sub, markersize=1)
plt.title('Non-zero values of a sparse matrix')
plt.xlabel('Movie Index')
plt.ylabel('User Index')
plt.show()
```
%% Output
%% Cell type:code id: tags:
``` python
n_total = X.shape[0]*X.shape[1]
n_ratings = X.nnz
sparsity = n_ratings/n_total
print(f"Matrix sparsity: {round(sparsity*100,2)}%")
```
%% Output
Matrix sparsity: 5.93%
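%% Cell type:markdown id: tags:
A small complement to the cell above: the printed figure is the fill ratio (density) of the matrix, i.e. the fraction of user-movie cells that actually hold a rating; the sparsity in the strict sense is its complement.
%% Cell type:code id: tags:
``` python
# Density vs. sparsity (sketch): density is the fraction of filled cells,
# sparsity is the fraction of empty cells.
density = X.nnz / (X.shape[0] * X.shape[1])
print(f"Density:  {density:.2%}")
print(f"Sparsity: {1 - density:.2%}")
```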
...
%% Cell type:code id: tags:
``` python
# Reload modules automatically before entering the execution of code
%load_ext autoreload
%autoreload 2
# Third-party imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.linear_model import LinearRegression
# Constants and functions
from constants import Constant as C
# We use a pd.read_csv() so importing the loaders is not necessary
# from loaders import load_ratings
# from loaders import load_items
from tabulate import tabulate
# Read the movies CSV file and create a variable df_movies
df_movies = pd.read_csv("data/tiny/content/movies.csv")
# Display the DataFrame
print("Display The Movies : ")
display(df_movies)
# Read the ratings CSV file and create a variable df_ratings
df_ratings = pd.read_csv("data/tiny/evidence/ratings.csv")
# Display the DataFrame
print("Display The Ratings : ")
display(df_ratings)
```
%% Output
The autoreload extension is already loaded. To reload it, use:
%reload_ext autoreload
Display The Movies : 
Display The Ratings : 
%% Cell type:code id: tags:
``` python
# NUMBER OF MOVIES
n_movies = df_movies['title'].nunique()
print(f"Number of movies: {n_movies}")
```
%% Output
Number of movies: 912
%% Cell type:code id: tags:
``` python
# THE YEAR RANGE
df_movies['annee'] = df_movies['title'].str.extract(r'\((.{4})\)')
df_movies['annee'] = pd.to_numeric(df_movies['annee'], errors='coerce')
min_range = int(df_movies['annee'].min())
max_range = int(df_movies['annee'].max())
print("Minimum range:", min_range)
print("Maximum range:", max_range)
```
%% Output
Minimum range: 1921
Maximum range: 2016
%% Cell type:code id: tags:
``` python
# LIST OF MOVIE GENRES
def tabulate_genres(df_movies):
    """Tabulate list of movie genres."""
    # Split genres and explode
    df_movies['genres'] = df_movies['genres'].str.split('|')
    df_movies = df_movies.explode('genres')
    unique_genres = sorted(df_movies['genres'].unique())
    # Tabulate
    print("\nList of all genres:")
    genres_table = [[genre, "|"] for genre in unique_genres]
    print(tabulate(genres_table, tablefmt="plain", numalign="left"))
# Call the tabulate_genres function
tabulate_genres(df_movies)
```
%% Output
List of all genres:
(no genres listed) |
Action |
Adventure |
Animation |
Children |
Comedy |
Crime |
Documentary |
Drama |
Fantasy |
Film-Noir |
Horror |
IMAX |
Musical |
Mystery |
Romance |
Sci-Fi |
Thriller |
War |
Western |
%% Cell type:code id: tags:
``` python
# THE TOTAL NUMBER OF RATINGS
n_ratings = df_ratings['rating'].count()
print(f"Number of ratings: {n_ratings}")
```
%% Output
Number of ratings: 5296
%% Cell type:code id: tags:
``` python
# THE NUMBER OF UNIQUE USERS
n_users = df_ratings['userId'].nunique()
print(f"Number of users: {n_users}")
```
%% Output
Number of users: 107
%% Cell type:code id: tags:
``` python
# THE NUMBER OF UNIQUE MOVIES (IN THE RATING MATRIX)
unique_movies = df_ratings["movieId"].unique()
num_unique_movies = len(unique_movies)
print(f"Number of unique movies : {num_unique_movies}")
```
%% Output
Number of unique movies : 834
%% Cell type:code id: tags:
``` python
# THE NUMBER OF RATINGS OF THE MOST RATED MOVIES
def most_rated_movies_ratings_count(df_ratings):
    movie_ratings_count = df_ratings.groupby('movieId')['rating'].count()
    most_rated_movies = movie_ratings_count[movie_ratings_count == movie_ratings_count.max()]
    print(f"Number of ratings of the most rated movie(s): {most_rated_movies.max()}")
most_rated_movies_ratings_count(df_ratings)
```
%% Output
Number of ratings of the most rated movie(s): 75
%% Cell type:code id: tags:
``` python
# THE NUMBER OF RATINGS OF THE LEAST RATED MOVIES
def least_rated_movies_ratings_count(df_ratings):
    movie_ratings_count = df_ratings.groupby('movieId')['rating'].count()
    least_rated_movies = movie_ratings_count[movie_ratings_count == movie_ratings_count.min()]
    print("Number of ratings of the least rated movie(s):", least_rated_movies.min())
least_rated_movies_ratings_count(df_ratings)
```
%% Output
Number of ratings of the least rated movie(s): 1
%% Cell type:code id: tags:
``` python
# ALL THE POSSIBLE RATING VALUES, FROM THE SMALLEST TO THE HIGHEST
def all_possible_ratings(df_ratings):
    rating_values = sorted(df_ratings['rating'].unique())
    print("All possible rating values, from smallest to highest:")
    for rating in rating_values:
        print(rating)
all_possible_ratings(df_ratings)
```
%% Output
All possible rating values, from smallest to highest:
0.5
1.0
1.5
2.0
2.5
3.0
3.5
4.0
4.5
5.0
%% Cell type:code id: tags:
``` python
# THE NUMBER OF MOVIES THAT WERE NOT RATED AT ALL
def unrated_movies_count(df_ratings, df_movies):
    rated_movies = df_ratings['movieId'].unique() if 'movieId' in df_ratings.columns else []
    unrated_movies_count = df_movies[~df_movies.index.isin(rated_movies)].shape[0]
    print("Number of movies that were not rated at all:", unrated_movies_count)
unrated_movies_count(df_ratings, df_movies)
```
%% Output
Number of movies that were not rated at all: 846
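%% Cell type:markdown id: tags:
Note that the count differs from the 78 obtained in the first run: here `df_movies` comes from `pd.read_csv`, so its index is a plain RangeIndex rather than `movieId`, and `df_movies.index.isin(rated_movies)` no longer compares movie ids. A minimal sketch that compares on the `movieId` column instead (assuming, as the merge below does, that the CSV exposes a `movieId` column):
%% Cell type:code id: tags:
``` python
# Sketch: when movieId is a regular column (as after pd.read_csv),
# compare on that column rather than on the DataFrame index.
rated_movies = df_ratings['movieId'].unique()
n_unrated = (~df_movies['movieId'].isin(rated_movies)).sum()
print("Number of movies that were not rated at all:", n_unrated)
```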
%% Cell type:markdown id: tags:
### LONG-TAIL PROPERTY
%% Cell type:code id: tags:
``` python
# Rating Frequency Distribution
merged_df = pd.merge(df_ratings, df_movies, on='movieId')
rating_counts = merged_df['movieId'].value_counts()
value_counts = rating_counts.value_counts().sort_index()
plt.figure(figsize=(20, 6))
plt.plot(value_counts.values, value_counts.index, marker='o', color='skyblue', linestyle='-')  # number of movies on x, rating count on y
plt.title('Rating Frequency Distribution')
plt.xlabel('Number of Movies')
plt.ylabel('Number of Ratings')
plt.xticks(rotation=45)
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
```
%% Output
%% Cell type:code id: tags:
``` python
def create_X(df):
    """
    Generates a sparse matrix from ratings dataframe.
    Args:
        df: pandas dataframe containing 3 columns (userId, movieId, rating)
    Returns:
        X: sparse matrix
        user_mapper: dict that maps user id's to user indices
        user_inv_mapper: dict that maps user indices to user id's
        movie_mapper: dict that maps movie id's to movie indices
        movie_inv_mapper: dict that maps movie indices to movie id's
    """
    M = df['userId'].nunique()
    N = df['movieId'].nunique()
    user_mapper = dict(zip(np.unique(df["userId"]), list(range(M))))
    movie_mapper = dict(zip(np.unique(df["movieId"]), list(range(N))))
    user_inv_mapper = dict(zip(list(range(M)), np.unique(df["userId"])))
    movie_inv_mapper = dict(zip(list(range(N)), np.unique(df["movieId"])))
    user_index = [user_mapper[i] for i in df['userId']]
    item_index = [movie_mapper[i] for i in df['movieId']]
    X = csr_matrix((df["rating"], (user_index,item_index)), shape=(M,N))
    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper
# Assuming df_ratings contains your ratings dataframe
X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_X(df_ratings)
# Extract the 100 first users and 100 first items
X_sub = X[:100, :100]
# Plot the non-zero values of the sparse matrix
plt.figure(figsize=(8, 6))
plt.spy(X_sub, markersize=1)
plt.title('Non-zero values of a sparse matrix')
plt.xlabel('Movie Index')
plt.ylabel('User Index')
plt.show()
```
%% Output
%% Cell type:code id: tags:
``` python
n_total = X.shape[0]*X.shape[1]
n_ratings = X.nnz
sparsity = n_ratings/n_total
print(f"Matrix sparsity: {round(sparsity*100,2)}%")
```
%% Output
Matrix sparsity: 5.93%
...
# local imports
from models import *


class EvalConfig:
    """Configuration settings for evaluation."""

    # List of models to evaluate, each tuple containing model_name, model class, and model parameters (dict)
    models = [
        ("baseline_1", ModelBaseline1, {}),
        ("baseline_2", ModelBaseline2, {}),
        ("baseline_3", ModelBaseline3, {}),
        ("baseline_4", ModelBaseline4, {})
        # model_name, model class, model parameters (dict)
    ]

    # Metrics to compute for split evaluation
    split_metrics = ["mae", "rmse"]

    # Metrics to compute for Leave-One-Out (LOO) evaluation
    loo_metrics = ["hit_rate"]

    # Metrics to compute for full dataset evaluation
    full_metrics = ["novelty"]

    # Split parameters
    test_size = 0.25  # -- configure the test_size (from 0 to 1) --

    # Loo parameters
    top_n_value = 10  # -- configure the number of recommendations (> 1) --
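A hedged example of how an additional model could be registered in the same list, using `KNNWithMeans`, which `models.py` already imports from Surprise. The name `"knn_means"`, `k=40` and the pearson similarity are illustrative choices, not project settings:

``` python
# Illustrative only: appending an extra Surprise model to the evaluation config.
from surprise import KNNWithMeans
from configs import EvalConfig

EvalConfig.models.append(
    ("knn_means", KNNWithMeans, {"k": 40, "sim_options": {"name": "pearson"}})
)
```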
%% Cell type:markdown id:a665885b tags:
# Evaluator Module
The Evaluator module creates evaluation reports.
A report contains the evaluation metrics computed for each model specified in the evaluation config.
%% Cell type:code id:6aaf9140 tags:
``` python
# reloads modules automatically before entering the execution of code
%load_ext autoreload
%autoreload 2
# third-party imports
import numpy as np
import pandas as pd
# -- add new imports here --
# local imports
from configs import EvalConfig
from constants import Constant as C
from loaders import export_evaluation_report
from loaders import load_ratings
# -- add new imports here --
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise.model_selection import LeaveOneOut
from collections import Counter
```
%% Cell type:markdown id:d47c24a4 tags:
# 1. Model validation functions
Validation functions perform cross-validation on recommender system models.
%% Cell type:code id:d6d82188 tags:
``` python
def generate_split_predictions(algo, ratings_dataset, eval_config):
    """Generate predictions on a random test set specified in eval_config"""
    # -- implement the function generate_split_predictions --
    # Splitting the data into train and test sets
    trainset, testset = train_test_split(ratings_dataset, test_size=eval_config.test_size)
    # Training the algorithm on the train data set
    algo.fit(trainset)
    # Predict ratings for the testset
    predictions = algo.test(testset)
    return predictions


def generate_loo_top_n(algo, ratings_dataset, eval_config):
    """Generate top-n recommendations for each user on a random Leave-one-out split (LOO)"""
    # -- implement the function generate_loo_top_n --
    # Create a LeaveOneOut split
    loo = LeaveOneOut(n_splits=1)
    for trainset, testset in loo.split(ratings_dataset):
        algo.fit(trainset)  # Train the algorithm on the training set
        anti_testset = trainset.build_anti_testset()  # Build the anti test-set
        predictions = algo.test(anti_testset)  # Get predictions on the anti test-set
        top_n = {}
        for uid, iid, _, est, _ in predictions:
            if uid not in top_n:
                top_n[uid] = []
            top_n[uid].append((iid, est))
        for uid, user_ratings in top_n.items():
            user_ratings.sort(key=lambda x: x[1], reverse=True)
            top_n[uid] = user_ratings[:eval_config.top_n_value]  # Get top-N recommendations
        anti_testset_top_n = top_n
        return anti_testset_top_n, testset


def generate_full_top_n(algo, ratings_dataset, eval_config):
    """Generate top-n recommendations for each user with the full training set"""
    full_trainset = ratings_dataset.build_full_trainset()  # Build the full training set
    algo.fit(full_trainset)  # Train the algorithm on the full training set
    anti_testset = full_trainset.build_anti_testset()  # Build the anti test-set
    predictions = algo.test(anti_testset)  # Get predictions on the anti test-set
    top_n = {}
    for uid, iid, _, est, _ in predictions:
        if uid not in top_n:
            top_n[uid] = []
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:eval_config.top_n_value]  # Get top-N recommendations
    anti_testset_top_n = top_n
    return anti_testset_top_n


def precomputed_information(movie_data):
    """ Returns a dictionary that precomputes relevant information for evaluating in full mode
    Dictionary keys:
    - precomputed_dict["item_to_rank"] : contains a dictionary mapping movie ids to rankings
    - (-- for your project, add other relevant information here -- )
    """
    # Initialize an empty dictionary to store item_id to rank mapping
    item_to_rank = {}
    # Calculate popularity rank for each movie
    ratings_count = movie_data.groupby('movieId').size().sort_values(ascending=False)
    # Assign ranks to movies based on their popularity
    for rank, (movie_id, _) in enumerate(ratings_count.items(), start=1):
        item_to_rank[movie_id] = rank
    # Create the precomputed dictionary
    precomputed_dict = {}
    precomputed_dict["item_to_rank"] = item_to_rank
    return precomputed_dict


def create_evaluation_report(eval_config, sp_ratings, precomputed_dict, available_metrics):
    """ Create a DataFrame evaluating various models on metrics specified in an evaluation config.
    """
    evaluation_dict = {}
    for model_name, model, arguments in eval_config.models:
        print(f'Handling model {model_name}')
        algo = model(**arguments)
        evaluation_dict[model_name] = {}
        # Type 1 : split evaluations
        if len(eval_config.split_metrics) > 0:
            print('Training split predictions')
            predictions = generate_split_predictions(algo, sp_ratings, eval_config)
            for metric in eval_config.split_metrics:
                print(f'- computing metric {metric}')
                assert metric in available_metrics['split']
                evaluation_function, parameters = available_metrics["split"][metric]
                evaluation_dict[model_name][metric] = evaluation_function(predictions, **parameters)
        # Type 2 : loo evaluations
        if len(eval_config.loo_metrics) > 0:
            print('Training loo predictions')
            anti_testset_top_n, testset = generate_loo_top_n(algo, sp_ratings, eval_config)
            for metric in eval_config.loo_metrics:
                assert metric in available_metrics['loo']
                evaluation_function, parameters = available_metrics["loo"][metric]
                evaluation_dict[model_name][metric] = evaluation_function(anti_testset_top_n, testset, **parameters)
        # Type 3 : full evaluations
        if len(eval_config.full_metrics) > 0:
            print('Training full predictions')
            anti_testset_top_n = generate_full_top_n(algo, sp_ratings, eval_config)
            for metric in eval_config.full_metrics:
                assert metric in available_metrics['full']
                evaluation_function, parameters = available_metrics["full"][metric]
                evaluation_dict[model_name][metric] = evaluation_function(
                    anti_testset_top_n,
                    **precomputed_dict,
                    **parameters
                )
    return pd.DataFrame.from_dict(evaluation_dict).T
```
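%% Cell type:markdown id: tags:
A minimal usage sketch for the three generators above, run outside of `create_evaluation_report` and assuming the Surprise-format loader and `ModelBaseline3` already used elsewhere in the project:
%% Cell type:code id: tags:
``` python
# Sketch: exercising the generators directly with a single baseline model.
from models import ModelBaseline3

algo = ModelBaseline3()
sp_ratings = load_ratings(surprise_format=True)

split_predictions = generate_split_predictions(algo, sp_ratings, EvalConfig)
loo_top_n, loo_testset = generate_loo_top_n(algo, sp_ratings, EvalConfig)
full_top_n = generate_full_top_n(algo, sp_ratings, EvalConfig)

print(len(split_predictions), "split predictions,", len(full_top_n), "users with top-n lists")
```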
%% Cell type:markdown id:f7e83d1d tags:
# 2. Evaluation metrics
Implement evaluation metrics for either rating predictions (split metrics) or top-n recommendations (loo and full metrics).
%% Cell type:code id:f1849e55 tags:
``` python
def get_hit_rate(anti_testset_top_n, testset):
    """Compute the average hit over the users (loo metric)
    A hit (1) happens when the movie in the testset has been picked by the top-n recommender
    A fail (0) happens when the movie in the testset has not been picked by the top-n recommender
    """
    # -- implement the function get_hit_rate --
    hits = 0
    total_users = len(testset)
    for uid, true_iid, _ in testset:
        if uid in anti_testset_top_n and true_iid in {iid for iid, _ in anti_testset_top_n[uid]}:
            hits += 1
    hit_rate = hits / total_users
    return hit_rate


def get_novelty(anti_testset_top_n, item_to_rank):
    """Compute the average novelty of the top-n recommendation over the users (full metric)
    The novelty is defined as the average ranking of the movies recommended
    """
    # -- implement the function get_novelty --
    total_rank_sum = 0
    total_recommendations = 0
    for uid, recommendations in anti_testset_top_n.items():
        for iid, _ in recommendations:
            if iid in item_to_rank:
                total_rank_sum += item_to_rank[iid]
                total_recommendations += 1
    if total_recommendations == 0:
        return 0  # Avoid division by zero
    average_rank_sum = total_rank_sum / total_recommendations
    return average_rank_sum
```
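%% Cell type:markdown id: tags:
A tiny hand-built example (values invented purely for illustration) of what the two metrics measure: the hit rate checks whether each user's held-out movie appears in that user's top-n list, and the novelty averages the popularity ranks of everything recommended.
%% Cell type:code id: tags:
``` python
# Toy data, invented for illustration only.
toy_top_n = {"u1": [("m1", 4.8), ("m2", 4.5)],
             "u2": [("m3", 4.9), ("m4", 4.1)]}
toy_testset = [("u1", "m2", 4.0),   # m2 is in u1's top-n  -> hit
               ("u2", "m9", 3.0)]   # m9 is not in u2's top-n -> miss
toy_ranks = {"m1": 1, "m2": 2, "m3": 10, "m4": 50}

print(get_hit_rate(toy_top_n, toy_testset))   # 0.5
print(get_novelty(toy_top_n, toy_ranks))      # (1 + 2 + 10 + 50) / 4 = 15.75
```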
%% Cell type:markdown id:1a9855b3 tags:
# 3. Evaluation workflow
Load data, evaluate models and save the experimental outcomes
%% Cell type:code id:704f4d2a tags:
``` python
AVAILABLE_METRICS = {
    "split": {
        "mae": (accuracy.mae, {'verbose': False}),
        "rmse": (accuracy.rmse, {'verbose': False})
        # Add new split metrics here if needed
    },
    "loo": {
        "hit_rate": (get_hit_rate, {}),
        # Add new loo metrics here if needed
    },
    "full": {
        "novelty": (get_novelty, {}),
        # Add new full metrics here if needed
    }
}
sp_ratings = load_ratings(surprise_format=True)
precomputed_dict = precomputed_information(pd.read_csv("data/tiny/evidence/ratings.csv"))
evaluation_report = create_evaluation_report(EvalConfig, sp_ratings, precomputed_dict, AVAILABLE_METRICS)
export_evaluation_report(evaluation_report)
```
%% Output
Handling model baseline_1
Training split predictions
- computing metric mae
- computing metric rmse
Training loo predictions
Training full predictions
Handling model baseline_2
Training split predictions
- computing metric mae
- computing metric rmse
Training loo predictions
Training full predictions
Handling model baseline_3
Training split predictions
- computing metric mae
- computing metric rmse
Training loo predictions
Training full predictions
Handling model baseline_4
Training split predictions
- computing metric mae
- computing metric rmse
Training loo predictions
Training full predictions
The data has been exported to the evaluation report
mae rmse hit_rate novelty
baseline_1 1.567221 1.788369 0.074766 99.405607
baseline_2 1.502872 1.840696 0.056075 429.942991
baseline_3 0.873993 1.076982 0.065421 99.405607
baseline_4 0.730657 0.938814 0.186916 57.465421
# standard library imports
from collections import defaultdict

# third-party imports
import numpy as np
import random as rd
from surprise import AlgoBase
from surprise import KNNWithMeans
from surprise import SVD


def get_top_n(predictions, n):
    """Return the top-N recommendations for each user from a set of predictions.
    Source: inspired by https://github.com/NicolasHug/Surprise/blob/master/examples/top_n_recommendations.py
    and modified by cvandekerckh for random tie breaking

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendations to output for each user. Default
            is 10.
    Returns:
        A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """
    rd.seed(0)

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the n highest ones.
    for uid, user_ratings in top_n.items():
        rd.shuffle(user_ratings)
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n


# First algorithm
class ModelBaseline1(AlgoBase):
    def __init__(self):
        AlgoBase.__init__(self)

    def estimate(self, u, i):
        return 2


# Second algorithm
class ModelBaseline2(AlgoBase):
    def __init__(self):
        AlgoBase.__init__(self)

    def fit(self, trainset):
        AlgoBase.fit(self, trainset)
        rd.seed(0)

    def estimate(self, u, i):
        return rd.uniform(self.trainset.rating_scale[0], self.trainset.rating_scale[1])


# Third algorithm
class ModelBaseline3(AlgoBase):
    def __init__(self):
        AlgoBase.__init__(self)

    def fit(self, trainset):
        AlgoBase.fit(self, trainset)
        self.the_mean = np.mean([r for (_, _, r) in self.trainset.all_ratings()])
        return self

    def estimate(self, u, i):
        return self.the_mean


# Fourth Model
class ModelBaseline4(SVD):
    def __init__(self):
        SVD.__init__(self, n_factors=100)
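A short sketch of how these pieces fit together outside the evaluator, assuming the Surprise-format ratings loader used earlier in the project:

``` python
# Sketch: train one baseline on the full trainset and build top-10 lists with get_top_n.
from loaders import load_ratings

sp_ratings = load_ratings(surprise_format=True)
trainset = sp_ratings.build_full_trainset()

algo = ModelBaseline4()
algo.fit(trainset)

predictions = algo.test(trainset.build_anti_testset())
top_10 = get_top_n(predictions, n=10)
print(len(top_10), "users with recommendations")
```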