Skip to content
Extraits de code Groupes Projets
Valider e1451ea1 rédigé par Adrien Payen's avatar Adrien Payen
Parcourir les fichiers

content based commit

parent 9ec41711
Aucune branche associée trouvée
Aucune étiquette associée trouvée
Aucune requête de fusion associée trouvée
%% Cell type:code id: tags:
``` python
# Reload modules automatically before entering the execution of code
%load_ext autoreload
%autoreload 2
# Third-party imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
# Constants and functions
from constants import Constant as C
from loaders import load_ratings
from loaders import load_items
from tabulate import tabulate
# Load the items table with load_items() and keep it in df_movies
# (later cells read its 'title' and 'genres' columns).
df_movies = load_items()
# Show the movies DataFrame in the notebook output
print("Display The Movies : ")
display(df_movies)
# Load the ratings table with load_ratings() and keep it in df_ratings
# (later cells read its 'userId', 'movieId' and 'rating' columns).
df_ratings = load_ratings()
# Show the ratings DataFrame in the notebook output
print("Display The Ratings : ")
display(df_ratings)
```
%% Output
---------------------------------------------------------------------------
ImportError Traceback (most recent call last)
Cell In[1], line 12
9 from scipy.sparse import csr_matrix
11 # Constants and functions
---> 12 from constants import Constant as C
13 from loaders import load_ratings
14 from loaders import load_items
ImportError: cannot import name 'Constant' from 'constants' (/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/constants.py)
Display The Movies :
Display The Ratings :
%% Cell type:code id: tags:
``` python
# NUMBER OF MOVIES
# Counted as the number of distinct titles, so two entries sharing the
# same title are counted once.
n_movies = df_movies.loc[:, 'title'].nunique()
print("Number of movies: {}".format(n_movies))
```
%% Output
Number of movies: 912
%% Cell type:code id: tags:
``` python
# THE YEAR RANGE
# The release year is read from the first "(dddd)" group found in the title.
# Only digits are captured, so a four-character non-year parenthetical that
# appears earlier in the title can no longer mask the real year.
df_movies['annee'] = df_movies['title'].str.extract(r'\((\d{4})\)', expand=False)
# Titles without a year yield NaN, which min()/max() below skip.
df_movies['annee'] = pd.to_numeric(df_movies['annee'], errors='coerce')
min_range = int(df_movies['annee'].min())
max_range = int(df_movies['annee'].max())
print("Minimum range:", min_range)
print("Maximum range:", max_range)
```
%% Output
Minimum range: 1921
Maximum range: 2016
%% Cell type:code id: tags:
``` python
# LIST OF MOVIE GENRES
def tabulate_genres(df_movies):
    """Tabulate list of movie genres.

    Prints every distinct genre found in the '|'-separated ``genres``
    column, sorted alphabetically, one per row.

    Args:
        df_movies: DataFrame with a string ``genres`` column. It is only
            read: the split/explode is done on a derived Series so the
            caller's column keeps its original string form and the function
            can be called repeatedly.
    """
    # Split genres and explode (on a copy, never on the caller's column)
    genres = df_movies['genres'].str.split('|').explode()
    # Missing genres would make sorted() compare NaN with str and fail
    unique_genres = sorted(genres.dropna().unique())
    # Tabulate
    print("\nList of all genres:")
    genres_table = [[genre, "|"] for genre in unique_genres]
    print(tabulate(genres_table, tablefmt="plain", numalign="left"))
# Print the sorted list of distinct genres found in df_movies
tabulate_genres(df_movies)
```
%% Output
List of all genres:
(no genres listed) |
Action |
Adventure |
Animation |
Children |
Comedy |
Crime |
Documentary |
Drama |
Fantasy |
Film-Noir |
Horror |
IMAX |
Musical |
Mystery |
Romance |
Sci-Fi |
Thriller |
War |
Western |
%% Cell type:code id: tags:
``` python
# THE TOTAL NUMBER OF RATINGS
# count() only counts non-missing entries of the 'rating' column.
n_ratings = df_ratings.loc[:, 'rating'].count()
print("Number of ratings: {}".format(n_ratings))
```
%% Output
Number of ratings: 5296
%% Cell type:code id: tags:
``` python
# THE NUMBER OF UNIQUE USERS
# Distinct values of the 'userId' column of the ratings table.
n_users = df_ratings.loc[:, 'userId'].nunique()
print("Number of users: {}".format(n_users))
```
%% Output
Number of users: 107
%% Cell type:code id: tags:
``` python
# THE NUMBER OF UNIQUE MOVIES (IN THE RATING MATRIX)
# Only movies that appear at least once in df_ratings are counted here.
unique_movies = pd.unique(df_ratings["movieId"])
num_unique_movies = len(unique_movies)
print("Number of unique movies : {}".format(num_unique_movies))
```
%% Output
Number of unique movies : 834
%% Cell type:code id: tags:
``` python
# THE NUMBER OF RATINGS OF THE MOST RATED MOVIES
def most_rated_movies_ratings_count(df_ratings):
    """Print the number of ratings received by the most-rated movie(s).

    Args:
        df_ratings: DataFrame with ``movieId`` and ``rating`` columns.
    """
    # Non-missing ratings per movie
    per_movie = df_ratings.groupby('movieId')['rating'].count()
    highest = per_movie.max()
    # Every movie tied for first place shares the same count
    leaders = per_movie[per_movie == highest]
    print(f"Number of ratings of the most rated movie(s): {leaders.max()}")
# Report the rating count of the most-rated movie(s) in df_ratings
most_rated_movies_ratings_count(df_ratings)
```
%% Output
Number of ratings of the most rated movie(s): 75
%% Cell type:code id: tags:
``` python
# THE NUMBER OF RATINGS OF THE LEAST RATED MOVIES
def least_rated_movies_ratings_count(df_ratings):
    """Print the number of ratings received by the least-rated movie(s).

    Only movies present in ``df_ratings`` are considered.

    Args:
        df_ratings: DataFrame with ``movieId`` and ``rating`` columns.
    """
    # Non-missing ratings per movie
    per_movie = df_ratings.groupby('movieId')['rating'].count()
    lowest = per_movie.min()
    # Every movie tied for last place shares the same count
    laggards = per_movie[per_movie == lowest]
    print(f"Number of ratings of the least rated movie(s): {laggards.min()}")
# Report the rating count of the least-rated movie(s) in df_ratings
least_rated_movies_ratings_count(df_ratings)
```
%% Output
Number of ratings of the least rated movie(s): 1
%% Cell type:code id: tags:
``` python
# ALL THE POSSIBLE RATING VALUES, FROM THE SMALLEST VALUE TO THE HIGHEST VALUE
def all_possible_ratings(df_ratings):
    """Print each distinct rating value, one per line, in ascending order.

    Args:
        df_ratings: DataFrame with a ``rating`` column.
    """
    levels = df_ratings['rating'].unique().tolist()
    levels.sort()
    print("All possible rating values, from smallest to highest:")
    for level in levels:
        print(level)
# List the distinct rating values present in df_ratings
all_possible_ratings(df_ratings)
```
%% Output
All possible rating values, from smallest to highest:
0.5
1.0
1.5
2.0
2.5
3.0
3.5
4.0
4.5
5.0
%% Cell type:code id: tags:
``` python
# THE NUMBER OF MOVIES THAT WERE NOT RATED AT ALL
def unrated_movies_count(df_ratings, df_movies):
    """Print how many movies of ``df_movies`` never appear in ``df_ratings``.

    Movies are matched through the index of ``df_movies``, which is compared
    with the ``movieId`` values of ``df_ratings``.

    Args:
        df_ratings: ratings DataFrame; when it has no ``movieId`` column
            every movie is reported as unrated.
        df_movies: movies DataFrame whose index holds the movie ids.
    """
    if 'movieId' in df_ratings.columns:
        rated_ids = df_ratings['movieId'].unique()
    else:
        rated_ids = []
    never_rated = ~df_movies.index.isin(rated_ids)
    n_unrated = int(never_rated.sum())
    print(f"Number of movies that were not rated at all: {n_unrated}")
# Report how many movies of df_movies have no rating in df_ratings
unrated_movies_count(df_ratings, df_movies)
```
%% Output
Number of movies that were not rated at all: 78
%% Cell type:markdown id: tags:
LONG-TAIL PROPERTY
%% Cell type:code id: tags:
``` python
# Rating Frequency Distribution
# Keep only the ratings whose movieId matches a movie in df_movies
merged_df = df_ratings.merge(df_movies, on='movieId')
# rating_counts: number of ratings per movie
rating_counts = merged_df['movieId'].value_counts()
# value_counts: for each "number of ratings", how many movies received exactly that many
value_counts = rating_counts.value_counts().sort_index()

plt.figure(figsize=(20, 6))
# x = how many movies, y = how many ratings each of those movies received
plt.plot(
    value_counts.values,
    value_counts.index,
    marker='o',
    color='skyblue',
    linestyle='-',
)
plt.title('Rating Frequency Distribution')
plt.xlabel('Number of Movies')
plt.ylabel('Number of Ratings')
plt.xticks(rotation=45)
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
```
%% Output
%% Cell type:code id: tags:
``` python
# Build the user x movie rating matrix directly from df_ratings.
# M / N are the numbers of distinct users / movies.
M = df_ratings['userId'].nunique()
N = df_ratings['movieId'].nunique()

# raw id -> row/column index, following the sorted order of the ids
user_mapper = dict(zip(np.unique(df_ratings["userId"]), range(M)))
movie_mapper = dict(zip(np.unique(df_ratings["movieId"]), range(N)))

# row/column index -> raw id
user_inv_mapper = dict(zip(range(M), np.unique(df_ratings["userId"])))
movie_inv_mapper = dict(zip(range(N), np.unique(df_ratings["movieId"])))

# Row and column position of every rating
user_index = list(map(user_mapper.__getitem__, df_ratings['userId']))
item_index = list(map(movie_mapper.__getitem__, df_ratings['movieId']))

X = csr_matrix((df_ratings["rating"], (user_index, item_index)), shape=(M, N))
```
%% Cell type:code id: tags:
``` python
def create_X(df):
    """
    Generates a sparse user x movie matrix from a ratings dataframe.

    Row and column indices follow the sorted order of the distinct user
    and movie ids.

    Args:
        df: pandas dataframe containing 3 columns (userId, movieId, rating)

    Returns:
        A 5-tuple, in this order:
        X: scipy CSR matrix of shape (n_users, n_movies) holding the ratings
        user_mapper: dict that maps user id's to user indices
        movie_mapper: dict that maps movie id's to movie indices
        user_inv_mapper: dict that maps user indices to user id's
        movie_inv_mapper: dict that maps movie indices to movie id's
    """
    # np.unique returns the sorted distinct ids and, with return_inverse,
    # the position of every row's id inside that sorted array: exactly the
    # row/column index needed, computed in one vectorised pass.
    user_ids, user_index = np.unique(df["userId"].to_numpy(), return_inverse=True)
    movie_ids, item_index = np.unique(df["movieId"].to_numpy(), return_inverse=True)
    M, N = len(user_ids), len(movie_ids)

    user_mapper = dict(zip(user_ids, range(M)))
    movie_mapper = dict(zip(movie_ids, range(N)))
    user_inv_mapper = dict(zip(range(M), user_ids))
    movie_inv_mapper = dict(zip(range(N), movie_ids))

    X = csr_matrix((df["rating"].to_numpy(), (user_index, item_index)), shape=(M, N))
    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper
# Build the rating matrix and the id <-> index mappings from df_ratings
X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_X(df_ratings)

# Restrict the view to the first 100 rows (users) and first 100 columns (movies)
X_sub = X[0:100, 0:100]

# One marker per stored entry of the sub-matrix
plt.figure(figsize=(8, 6))
plt.spy(X_sub, markersize=1)
plt.title('Non-zero values of a sparse matrix')
plt.xlabel('Movie Index')
plt.ylabel('User Index')
plt.show()
```
%% Output
%% Cell type:code id: tags:
``` python
# Total number of cells in the user x movie matrix
n_users_total, n_movies_total = X.shape
n_total = n_users_total * n_movies_total
# Number of stored entries, i.e. of ratings actually present
n_ratings = X.nnz
# NOTE: this is the share of cells that ARE filled (stored / total),
# reported under the name "sparsity".
sparsity = n_ratings / n_total
print("Matrix sparsity: {}%".format(round(sparsity * 100, 2)))
```
%% Output
Matrix sparsity: 5.93%
......
%% Cell type:markdown id:82d5ca82 tags:
# Packages
%% Cell type:code id:277473a3 tags:
``` python
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import random as rd
from surprise import AlgoBase
from surprise.prediction_algorithms.predictions import PredictionImpossible
from loaders import load_ratings
from loaders import load_items
from constants import Constant as C
from sklearn.linear_model import LinearRegression
```
%% Output
---------------------------------------------------------------------------
ImportError Traceback (most recent call last)
Cell In[1], line 10
7 from surprise import AlgoBase
8 from surprise.prediction_algorithms.predictions import PredictionImpossible
---> 10 from loaders import load_ratings
11 from loaders import load_items
12 from constants import Constant as C
File ~/vscodeworkspace/recomsys/loaders.py:7
3 import os
6 # Local imports
----> 7 from constants import Constant as C
8 from surprise import Reader, Dataset
10 def load_ratings(surprise_format=False):
ImportError: cannot import name 'Constant' from 'constants' (/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/constants.py)
The autoreload extension is already loaded. To reload it, use:
%reload_ext autoreload
%% Cell type:markdown id:a42c16bf tags:
# Explore and select content features
%% Cell type:code id:e8378976 tags:
``` python
# Load the items and the ratings
df_items = load_items()
df_ratings = load_ratings()

# Example 1 : create title_length features
# (one column holding the number of characters of each C.LABEL_COL value)
title_lengths = df_items[C.LABEL_COL].apply(len)
df_features = title_lengths.to_frame('n_character_title')
display(df_features.head())
# (explore here other features)
```
%% Output
%% Cell type:markdown id:a2c9a2b6 tags:
# Build a content-based model
When ready, move the following class into the *models.py* script
%% Cell type:code id:16b0a602 tags:
``` python
class ContentBased(AlgoBase):
    """Content-based recommender built on Surprise's ``AlgoBase``.

    Item features are computed once at construction time, ``fit`` learns one
    profile per user of the trainset, and ``estimate`` scores a
    (user, item) pair from that profile.

    Args:
        features_method: name of the feature construction to use
            (``None`` or ``"title_length"``).
        regressor_method: name of the profile learner to use
            (``"random_score"``, ``"random_sample"`` or
            ``"linear_regression"``).

    Raises:
        NotImplementedError: if ``features_method`` is not supported.
    """

    def __init__(self, features_method, regressor_method):
        AlgoBase.__init__(self)
        self.regressor_method = regressor_method
        self.content_features = self.create_content_features(features_method)

    def create_content_features(self, features_method):
        """Content Analyzer

        Builds the item feature table from the items returned by
        ``load_items()``.

        Returns:
            A DataFrame with one row per item and one column per feature,
            or ``None`` when ``features_method`` is ``None``.

        Raises:
            NotImplementedError: if ``features_method`` is not supported.
        """
        df_items = load_items()
        if features_method is None:
            df_features = None
        elif features_method == "title_length":
            # a naive method that creates only 1 feature based on title length
            df_features = df_items[C.LABEL_COL].apply(len).to_frame('n_character_title')
        else:
            # (implement other feature creations here)
            raise NotImplementedError(f'Feature method {features_method} not yet implemented')
        return df_features

    def fit(self, trainset):
        """Profile Learner

        Learns ``self.user_profile``, a dict keyed by inner user id whose
        values depend on ``regressor_method``:

        * ``random_score``: one uniform draw in [0.5, 5];
        * ``random_sample``: the list of the user's training ratings;
        * ``linear_regression``: a ``LinearRegression`` (no intercept)
          fitted on the features of the items the user rated, or ``None``
          when none of those items has usable features.

        Returns:
            ``self``, as Surprise expects from ``fit``.

        Raises:
            ValueError: if ``linear_regression`` is requested without
                content features.
        """
        AlgoBase.fit(self, trainset)

        # Preallocate user profiles
        self.user_profile = {u: None for u in trainset.all_users()}

        if self.regressor_method == 'random_score':
            for u in self.user_profile:
                self.user_profile[u] = rd.uniform(0.5, 5)

        elif self.regressor_method == 'random_sample':
            for u in self.user_profile:
                self.user_profile[u] = [rating for _, rating in self.trainset.ur[u]]

        elif self.regressor_method == 'linear_regression':
            if self.content_features is None:
                raise ValueError(
                    "regressor_method 'linear_regression' needs content features; "
                    "features_method must not be None"
                )
            # Train on every feature column so that fit() and estimate(),
            # which predicts from the full feature row, always agree.
            feature_columns = list(self.content_features.columns)

            for u in self.user_profile:
                # trainset.ur[u] holds (inner item id, rating) pairs; the
                # feature table is joined on the raw item id.
                df_user = pd.DataFrame({
                    'item_id': [trainset.to_raw_iid(iid) for iid, _ in trainset.ur[u]],
                    'user_ratings': [rating for _, rating in trainset.ur[u]],
                })
                df_user = df_user.merge(
                    self.content_features,
                    left_on="item_id",
                    right_index=True,
                    how='left',
                )
                # The left join leaves NaN for items absent from the feature
                # table; the regressor cannot be fitted on those rows.
                df_user = df_user.dropna(subset=feature_columns)
                if df_user.empty:
                    continue  # profile stays None; estimate() reports it

                X = df_user[feature_columns].values
                y = df_user['user_ratings'].values

                linear_regressor = LinearRegression(fit_intercept=False)
                linear_regressor.fit(X, y)

                # Store the computed user profile
                self.user_profile[u] = linear_regressor

        else:
            # (implement here the regressor fitting)
            pass

        return self

    def estimate(self, u, i):
        """Scoring component used for item filtering

        Args:
            u: inner user id.
            i: inner item id.

        Returns:
            The estimated rating of item ``i`` for user ``u``.

        Raises:
            PredictionImpossible: if the user or the item is unknown to the
                trainset, if no profile or no features are available for the
                pair, or if ``regressor_method`` is not implemented.
        """
        # First, handle cases for unknown users and items
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unkown.')

        if self.regressor_method == 'random_score':
            rd.seed()
            score = rd.uniform(0.5, 5)

        elif self.regressor_method == 'random_sample':
            rd.seed()
            score = rd.choice(self.user_profile[u])

        elif self.regressor_method == 'linear_regression':
            linear_regressor = self.user_profile[u]
            if linear_regressor is None:
                raise PredictionImpossible('No profile could be learned for this user.')

            raw_item_id = self.trainset.to_raw_iid(i)
            if raw_item_id not in self.content_features.index:
                raise PredictionImpossible('No content features for this item.')

            # 2-D (1 x n_features) array, as expected by predict()
            item_features = self.content_features.loc[[raw_item_id], :].values
            score = linear_regressor.predict(item_features)[0]

        else:
            # (implement here the regressor prediction)
            # Returning None would break the caller's numeric handling of the
            # estimate, so signal it the way Surprise expects.
            raise PredictionImpossible(
                f'Regressor method {self.regressor_method} not yet implemented'
            )

        return score
```
%% Cell type:markdown id:ffd75b7e tags:
The following script tests the ContentBased class
%% Cell type:code id:69d12f7d tags:
``` python
def test_contentbased_class(feature_method, regressor_method):
    """Smoke-test the ContentBased class.

    Fits the model on the full trainset, then predicts and prints the first
    (user, item) pair of the anti-testset.
    """
    full_trainset = load_ratings(surprise_format=True).build_full_trainset()

    algo = ContentBased(feature_method, regressor_method)
    algo.fit(full_trainset)

    # Each anti-testset entry starts with the raw user id and raw item id
    first_pair = full_trainset.build_anti_testset()[0]
    raw_uid, raw_iid = first_pair[0], first_pair[1]
    print(algo.predict(raw_uid, raw_iid))


# (call here the test functions with different regressor methods)
test_contentbased_class(feature_method="title_length", regressor_method="random_score")
test_contentbased_class(feature_method="title_length", regressor_method="random_sample")
```
%% Output
user: 15 item: 942 r_ui = None est = 3.59 {'was_impossible': False}
user: 15 item: 942 r_ui = None est = 3.00 {'was_impossible': False}
user: 15 item: 942 r_ui = None est = 3.79 {'was_impossible': False}
user: 15 item: 942 r_ui = None est = 4.00 {'was_impossible': False}
......
0% Chargement en cours ou .
You are about to add 0 people to the discussion. Proceed with caution.
Terminez d'abord l'édition de ce message.
Veuillez vous inscrire ou vous connecter pour commenter