Skip to content
Extraits de code Groupes Projets
Valider e1451ea1 rédigé par Adrien Payen's avatar Adrien Payen
Parcourir les fichiers

content based commit

parent 9ec41711
Aucune branche associée trouvée
Aucune étiquette associée trouvée
Aucune requête de fusion associée trouvée
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Reload modules automatically before entering the execution of code # Reload modules automatically before entering the execution of code
%load_ext autoreload %load_ext autoreload
%autoreload 2 %autoreload 2
# Third-party imports # Third-party imports
import numpy as np import numpy as np
import pandas as pd import pandas as pd
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix from scipy.sparse import csr_matrix
# Constants and functions # Constants and functions
from constants import Constant as C from constants import Constant as C
from loaders import load_ratings from loaders import load_ratings
from loaders import load_items from loaders import load_items
from tabulate import tabulate from tabulate import tabulate
# Call the load_items() function and create a variable df_items # Call the load_items() function and create a variable df_items
df_movies = load_items() df_movies = load_items()
# Display the DataFrame # Display the DataFrame
print("Display The Movies : ") print("Display The Movies : ")
display(df_movies) display(df_movies)
# Call the load_ratings() function and create a variable df_ratings # Call the load_ratings() function and create a variable df_ratings
df_ratings = load_ratings() df_ratings = load_ratings()
# Display the DataFrame # Display the DataFrame
print("Display The Ratings : ") print("Display The Ratings : ")
display(df_ratings) display(df_ratings)
``` ```
%% Output %% Output
--------------------------------------------------------------------------- Display The Movies :
ImportError Traceback (most recent call last)
Cell In[1], line 12
9 from scipy.sparse import csr_matrix Display The Ratings :
11 # Constants and functions
---> 12 from constants import Constant as C
13 from loaders import load_ratings
14 from loaders import load_items
ImportError: cannot import name 'Constant' from 'constants' (/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/constants.py)
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# NUMBER OF MOVIES # NUMBER OF MOVIES
n_movies = df_movies['title'].nunique() n_movies = df_movies['title'].nunique()
print(f"Number of movies: {n_movies}") print(f"Number of movies: {n_movies}")
``` ```
%% Output %% Output
Number of movies: 912 Number of movies: 912
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# THE YEAR RANGE # THE YEAR RANGE
# Extract the release year from the title. Only a group of exactly four
# digits in parentheses is accepted: the previous pattern (.{4}) also matched
# any four-character parenthesised word, which was then coerced to NaN and hid
# the real year appearing later in the same title.
# expand=False makes str.extract return a Series instead of a one-column
# DataFrame, which is what a single-column assignment expects.
df_movies['annee'] = pd.to_numeric(
    df_movies['title'].str.extract(r'\((\d{4})\)', expand=False),
    errors='coerce',
)
# min()/max() skip NaN, so titles without a recognisable year are ignored.
min_range = int(df_movies['annee'].min())
max_range = int(df_movies['annee'].max())
print("Minimum range:", min_range)
print("Maximum range:", max_range)
``` ```
%% Output %% Output
Minimum range: 1921 Minimum range: 1921
Maximum range: 2016 Maximum range: 2016
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# LIST OF MOVIE GENRES # LIST OF MOVIE GENRES
def tabulate_genres(df_movies):
    """Print the alphabetically sorted list of distinct movie genres.

    Args:
        df_movies: DataFrame with a 'genres' column holding '|'-separated
            genre strings.

    Returns:
        list: the sorted unique genres that were printed.
    """
    # Work on a derived Series instead of assigning the split lists back into
    # df_movies: the caller's frame is left untouched, so this function (and
    # any later code reading 'genres' as a string) can be run repeatedly.
    genres = df_movies['genres'].str.split('|').explode().dropna()
    unique_genres = sorted(genres.unique())

    # Tabulate: one genre per row followed by a "|" separator column.
    print("\nList of all genres:")
    genres_table = [[genre, "|"] for genre in unique_genres]
    print(tabulate(genres_table, tablefmt="plain", numalign="left"))
    return unique_genres
# Call the tabulate_genres function # Call the tabulate_genres function
tabulate_genres(df_movies) tabulate_genres(df_movies)
``` ```
%% Output %% Output
List of all genres: List of all genres:
(no genres listed) | (no genres listed) |
Action | Action |
Adventure | Adventure |
Animation | Animation |
Children | Children |
Comedy | Comedy |
Crime | Crime |
Documentary | Documentary |
Drama | Drama |
Fantasy | Fantasy |
Film-Noir | Film-Noir |
Horror | Horror |
IMAX | IMAX |
Musical | Musical |
Mystery | Mystery |
Romance | Romance |
Sci-Fi | Sci-Fi |
Thriller | Thriller |
War | War |
Western | Western |
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# THE TOTAL NUMBER OF RATINGS # THE TOTAL NUMBER OF RATINGS
n_ratings = df_ratings['rating'].count() n_ratings = df_ratings['rating'].count()
print(f"Number of ratings: {n_ratings}") print(f"Number of ratings: {n_ratings}")
``` ```
%% Output %% Output
Number of ratings: 5296 Number of ratings: 5296
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# THE NUMBER OF UNIQUE USERS # THE NUMBER OF UNIQUE USERS
n_users = df_ratings['userId'].nunique() n_users = df_ratings['userId'].nunique()
print(f"Number of users: {n_users}") print(f"Number of users: {n_users}")
``` ```
%% Output %% Output
Number of users: 107 Number of users: 107
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# THE NUMBER OF UNIQUE MOVIES (IN THE RATING MATRIX) # THE NUMBER OF UNIQUE MOVIES (IN THE RATING MATRIX)
unique_movies = df_ratings["movieId"].unique() unique_movies = df_ratings["movieId"].unique()
num_unique_movies = len(unique_movies) num_unique_movies = len(unique_movies)
print(f"Number of unique movies : {num_unique_movies}") print(f"Number of unique movies : {num_unique_movies}")
``` ```
%% Output %% Output
Number of unique movies : 834 Number of unique movies : 834
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# THE NUMBER OF RATINGS OF THE MOST RATED MOVIES # THE NUMBER OF RATINGS OF THE MOST RATED MOVIES
def most_rated_movies_ratings_count(df_ratings):
    """Print and return the number of ratings received by the most rated movie(s).

    Args:
        df_ratings: DataFrame with at least the columns 'movieId' and 'rating'.

    Returns:
        int: the highest per-movie rating count, or 0 when there are no ratings.
    """
    ratings_per_movie = df_ratings.groupby('movieId')['rating'].count()
    # The maximum of the counts is the answer; filtering the Series down to
    # the rows equal to the maximum first adds nothing.
    top_count = 0 if ratings_per_movie.empty else int(ratings_per_movie.max())
    print(f"Number of ratings of the most rated movie(s): {top_count}")
    return top_count
most_rated_movies_ratings_count(df_ratings) most_rated_movies_ratings_count(df_ratings)
``` ```
%% Output %% Output
Number of ratings of the most rated movie(s): 75 Number of ratings of the most rated movie(s): 75
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# THE NUMBER OF RATINGS OF THE LEAST RATED MOVIES # THE NUMBER OF RATINGS OF THE LEAST RATED MOVIES
def least_rated_movies_ratings_count(df_ratings):
    """Print and return the number of ratings received by the least rated movie(s).

    Only movies that appear in df_ratings are considered, so the result is
    at least 1 whenever there is any rating.

    Args:
        df_ratings: DataFrame with at least the columns 'movieId' and 'rating'.

    Returns:
        int: the lowest per-movie rating count, or 0 when there are no ratings.
    """
    ratings_per_movie = df_ratings.groupby('movieId')['rating'].count()
    # The minimum of the counts is the answer; no need to filter the Series
    # down to the rows equal to the minimum first.
    bottom_count = 0 if ratings_per_movie.empty else int(ratings_per_movie.min())
    print("Number of ratings of the least rated movie(s):", bottom_count)
    return bottom_count
least_rated_movies_ratings_count(df_ratings) least_rated_movies_ratings_count(df_ratings)
``` ```
%% Output %% Output
Number of ratings of the least rated movie(s): 1 Number of ratings of the least rated movie(s): 1
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# ALL THE POSSIBLE RATING VALUES, FROM THE SMALLEST VALUE TO THE HIGHEST VALUE # ALL THE POSSIBLE RATING VALUES, FROM THE SMALLEST VALUE TO THE HIGHEST VALUE
def all_possible_ratings(df_ratings):
    """Print every distinct value of the 'rating' column in ascending order.

    A header line is printed first, then one rating value per line.

    Args:
        df_ratings: DataFrame with a 'rating' column.
    """
    distinct_values = list(df_ratings['rating'].unique())
    distinct_values.sort()
    # Assemble the whole report and emit it with a single print call.
    report_lines = ["All possible rating values, from smallest to highest:"]
    report_lines.extend(str(value) for value in distinct_values)
    print("\n".join(report_lines))
all_possible_ratings(df_ratings) all_possible_ratings(df_ratings)
``` ```
%% Output %% Output
All possible rating values, from smallest to highest: All possible rating values, from smallest to highest:
0.5 0.5
1.0 1.0
1.5 1.5
2.0 2.0
2.5 2.5
3.0 3.0
3.5 3.5
4.0 4.0
4.5 4.5
5.0 5.0
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# THE NUMBER OF MOVIES THAT WERE NOT RATED AT ALL # THE NUMBER OF MOVIES THAT WERE NOT RATED AT ALL
def unrated_movies_count(df_ratings, df_movies):
    """Print and return how many movies of the catalogue have no rating at all.

    Args:
        df_ratings: ratings DataFrame; its 'movieId' column lists the rated
            movies. If the column is absent, no movie is considered rated.
        df_movies: movies DataFrame. Movie ids are read from its 'movieId'
            column when present, otherwise from its index.

    Returns:
        int: number of rows of df_movies whose id never appears in df_ratings.
    """
    if 'movieId' in df_ratings.columns:
        rated_ids = df_ratings['movieId'].unique()
    else:
        rated_ids = []

    # Do not silently assume the ids live in the index: with a default
    # RangeIndex the comparison would be against row positions, not movie ids.
    if 'movieId' in df_movies.columns:
        catalogue_ids = df_movies['movieId']
    else:
        catalogue_ids = df_movies.index

    n_unrated = int((~catalogue_ids.isin(rated_ids)).sum())
    print("Number of movies that were not rated at all:", n_unrated)
    return n_unrated
unrated_movies_count(df_ratings, df_movies) unrated_movies_count(df_ratings, df_movies)
``` ```
%% Output %% Output
Number of movies that were not rated at all: 78 Number of movies that were not rated at all: 78
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
LONG-TAIL PROPERTY LONG-TAIL PROPERTY
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Rating Frequency Distribution # Rating Frequency Distribution
merged_df = pd.merge(df_ratings,df_movies, on='movieId') merged_df = pd.merge(df_ratings,df_movies, on='movieId')
rating_counts = merged_df['movieId'].value_counts() rating_counts = merged_df['movieId'].value_counts()
value_counts = rating_counts.value_counts().sort_index() value_counts = rating_counts.value_counts().sort_index()
plt.figure(figsize=(20, 6)) plt.figure(figsize=(20, 6))
plt.plot(value_counts.values, value_counts.index, marker='o', color='skyblue', linestyle='-') # Swap x and y arguments plt.plot(value_counts.values, value_counts.index, marker='o', color='skyblue', linestyle='-') # Swap x and y arguments
plt.title('Rating Frequency Distribution') plt.title('Rating Frequency Distribution')
plt.xlabel('Number of Movies') # Update x-label plt.xlabel('Number of Movies') # Update x-label
plt.ylabel('Number of Ratings') # Update y-label plt.ylabel('Number of Ratings') # Update y-label
plt.xticks(rotation=45) plt.xticks(rotation=45)
plt.grid(axis='x', linestyle='--', alpha=0.7) # Change grid to x-axis plt.grid(axis='x', linestyle='--', alpha=0.7) # Change grid to x-axis
plt.tight_layout() plt.tight_layout()
plt.show() plt.show()
``` ```
%% Output %% Output
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Build the user x movie rating matrix: every distinct userId / movieId is
# given a dense zero-based position, in sorted id order.
sorted_user_ids = np.unique(df_ratings["userId"])
sorted_movie_ids = np.unique(df_ratings["movieId"])

M = df_ratings['userId'].nunique()
N = df_ratings['movieId'].nunique()

# id -> position and position -> id lookups
user_mapper = {user_id: row for user_id, row in zip(sorted_user_ids, range(M))}
movie_mapper = {movie_id: col for movie_id, col in zip(sorted_movie_ids, range(N))}
user_inv_mapper = {row: user_id for row, user_id in zip(range(M), sorted_user_ids)}
movie_inv_mapper = {col: movie_id for col, movie_id in zip(range(N), sorted_movie_ids)}

# Row / column position of every individual rating
user_index = list(map(user_mapper.__getitem__, df_ratings['userId']))
item_index = list(map(movie_mapper.__getitem__, df_ratings['movieId']))

X = csr_matrix((df_ratings["rating"], (user_index, item_index)), shape=(M, N))
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
def create_X(df):
    """
    Generates a sparse user-item rating matrix from a ratings dataframe.

    Users and movies are assigned dense zero-based indices following the
    sorted order of their ids.

    Args:
        df: pandas dataframe containing 3 columns (userId, movieId, rating)

    Returns (in this order):
        X: scipy CSR matrix of shape (n_users, n_movies) holding the ratings
        user_mapper: dict that maps user id's to user indices
        movie_mapper: dict that maps movie id's to movie indices
        user_inv_mapper: dict that maps user indices to user id's
        movie_inv_mapper: dict that maps movie indices to movie id's
    """
    # A single np.unique call per axis yields both the sorted distinct ids and,
    # through return_inverse, the position of each row's id in that sorted
    # array -- i.e. exactly the row/column index needed for the matrix.
    user_ids, user_index = np.unique(df["userId"].to_numpy(), return_inverse=True)
    movie_ids, item_index = np.unique(df["movieId"].to_numpy(), return_inverse=True)
    M, N = len(user_ids), len(movie_ids)

    user_mapper = dict(zip(user_ids, range(M)))
    movie_mapper = dict(zip(movie_ids, range(N)))
    user_inv_mapper = dict(zip(range(M), user_ids))
    movie_inv_mapper = dict(zip(range(N), movie_ids))

    X = csr_matrix((df["rating"].to_numpy(), (user_index, item_index)), shape=(M, N))
    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper
# Assuming df_ratings contains your ratings dataframe # Assuming df_ratings contains your ratings dataframe
X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_X(df_ratings) X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_X(df_ratings)
# Extract the 100 first users and 100 first items # Extract the 100 first users and 100 first items
X_sub = X[:100, :100] X_sub = X[:100, :100]
# Plot the non-zero values of the sparse matrix # Plot the non-zero values of the sparse matrix
plt.figure(figsize=(8, 6)) plt.figure(figsize=(8, 6))
plt.spy(X_sub, markersize=1) plt.spy(X_sub, markersize=1)
plt.title('Non-zero values of a sparse matrix') plt.title('Non-zero values of a sparse matrix')
plt.xlabel('Movie Index') plt.xlabel('Movie Index')
plt.ylabel('User Index') plt.ylabel('User Index')
plt.show() plt.show()
``` ```
%% Output %% Output
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Sparsity of the rating matrix.
n_total = X.shape[0] * X.shape[1]
n_ratings = X.nnz  # number of stored (user, movie) ratings
# nnz / total is the share of FILLED cells, i.e. the density of the matrix.
# Sparsity is the share of EMPTY cells, hence the complement.
density = n_ratings / n_total
sparsity = 1 - density
print(f"Matrix sparsity: {round(sparsity*100,2)}%")
``` ```
%% Output %% Output
Matrix sparsity: 5.93% Matrix sparsity: 5.93%
......
%% Cell type:markdown id:82d5ca82 tags: %% Cell type:markdown id:82d5ca82 tags:
# Packages # Packages
%% Cell type:code id:277473a3 tags: %% Cell type:code id:277473a3 tags:
``` python ``` python
%load_ext autoreload %load_ext autoreload
%autoreload 2 %autoreload 2
import numpy as np import numpy as np
import pandas as pd import pandas as pd
import random as rd import random as rd
from surprise import AlgoBase from surprise import AlgoBase
from surprise.prediction_algorithms.predictions import PredictionImpossible from surprise.prediction_algorithms.predictions import PredictionImpossible
from loaders import load_ratings from loaders import load_ratings
from loaders import load_items from loaders import load_items
from constants import Constant as C from constants import Constant as C
from sklearn.linear_model import LinearRegression from sklearn.linear_model import LinearRegression
``` ```
%% Output %% Output
--------------------------------------------------------------------------- The autoreload extension is already loaded. To reload it, use:
ImportError Traceback (most recent call last) %reload_ext autoreload
Cell In[1], line 10
7 from surprise import AlgoBase
8 from surprise.prediction_algorithms.predictions import PredictionImpossible
---> 10 from loaders import load_ratings
11 from loaders import load_items
12 from constants import Constant as C
File ~/vscodeworkspace/recomsys/loaders.py:7
3 import os
6 # Local imports
----> 7 from constants import Constant as C
8 from surprise import Reader, Dataset
10 def load_ratings(surprise_format=False):
ImportError: cannot import name 'Constant' from 'constants' (/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/constants.py)
%% Cell type:markdown id:a42c16bf tags: %% Cell type:markdown id:a42c16bf tags:
# Explore and select content features # Explore and select content features
%% Cell type:code id:e8378976 tags: %% Cell type:code id:e8378976 tags:
``` python ``` python
df_items = load_items() df_items = load_items()
df_ratings = load_ratings() df_ratings = load_ratings()
# Example 1 : create title_length features # Example 1 : create title_length features
df_features = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('n_character_title') df_features = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('n_character_title')
display(df_features.head()) display(df_features.head())
# (explore here other features) # (explore here other features)
``` ```
%% Output %% Output
%% Cell type:markdown id:a2c9a2b6 tags: %% Cell type:markdown id:a2c9a2b6 tags:
# Build a content-based model # Build a content-based model
When ready, move the following class into the *models.py* script When ready, move the following class into the *models.py* script
%% Cell type:code id:16b0a602 tags: %% Cell type:code id:16b0a602 tags:
``` python ``` python
class ContentBased(AlgoBase):
    """Content-based rating predictor implemented as a Surprise algorithm.

    Item features are built once at construction time (content analyzer);
    ``fit`` learns one profile per user (profile learner) and ``estimate``
    scores a (user, item) pair from that profile.

    Supported ``regressor_method`` values:
        'random_score'      -- a uniform random score in [0.5, 5]
        'random_sample'     -- a random draw among the user's own past ratings
        'linear_regression' -- per-user linear regression (no intercept) of
                               the ratings on the item features
    Any other value leaves the user profiles empty in ``fit`` and makes
    ``estimate`` raise NotImplementedError.
    """

    def __init__(self, features_method, regressor_method):
        AlgoBase.__init__(self)
        self.regressor_method = regressor_method
        self.content_features = self.create_content_features(features_method)

    def create_content_features(self, features_method):
        """Content Analyzer: build the item-feature DataFrame.

        Args:
            features_method: None (no features) or 'title_length'.

        Returns:
            None when features_method is None, otherwise a DataFrame derived
            from the frame returned by load_items() with one column per
            feature.

        Raises:
            NotImplementedError: for any other features_method.
        """
        df_items = load_items()
        if features_method is None:
            return None
        if features_method == "title_length":
            # A naive method that creates only 1 feature based on title length
            return df_items[C.LABEL_COL].apply(len).to_frame('n_character_title')
        # (implement other feature creations here)
        raise NotImplementedError(f'Feature method {features_method} not yet implemented')

    def fit(self, trainset):
        """Profile Learner: compute one profile per user of the trainset.

        Args:
            trainset: a Surprise trainset.

        Returns:
            self, as the Surprise documentation requires from ``fit``.

        Raises:
            ValueError: if 'linear_regression' is requested without features.
        """
        AlgoBase.fit(self, trainset)

        # Preallocate user profiles
        self.user_profile = {u: None for u in trainset.all_users()}

        if self.regressor_method == 'random_score':
            for u in self.user_profile:
                self.user_profile[u] = rd.uniform(0.5, 5)

        elif self.regressor_method == 'random_sample':
            for u in self.user_profile:
                self.user_profile[u] = [rating for _, rating in trainset.ur[u]]

        elif self.regressor_method == 'linear_regression':
            if self.content_features is None:
                raise ValueError(
                    "regressor_method 'linear_regression' requires content features"
                )
            for u in self.user_profile:
                self.user_profile[u] = self._fit_user_regressor(trainset, u)

        # (implement here other regressor fittings)
        return self

    def _fit_user_regressor(self, trainset, u):
        """Fit a no-intercept linear regression of user u's ratings on item features.

        Returns None when none of the user's rated items has usable features.
        """
        feature_columns = list(self.content_features.columns)
        df_user = pd.DataFrame(
            [(trainset.to_raw_iid(iid), rating) for iid, rating in trainset.ur[u]],
            columns=["item_id", "user_ratings"],
        )
        # content_features is indexed by raw item id, hence right_index=True.
        df_user = df_user.merge(
            self.content_features, left_on="item_id", right_index=True, how='left'
        )
        # The left join leaves NaN for items without features; the regressor
        # cannot be fitted on NaN, so those rows are dropped.
        df_user = df_user.dropna(subset=feature_columns)
        if df_user.empty:
            return None

        # Use every feature column rather than a single hard-coded one, so the
        # training matrix matches what estimate() feeds to predict().
        X = df_user[feature_columns].values
        y = df_user['user_ratings'].values
        linear_regressor = LinearRegression(fit_intercept=False)
        linear_regressor.fit(X, y)
        return linear_regressor

    def estimate(self, u, i):
        """Scoring component used for item filtering.

        Args:
            u: inner user id.
            i: inner item id.

        Returns:
            The estimated rating.

        Raises:
            PredictionImpossible: unknown user/item, or no profile/features
                available for the pair.
            NotImplementedError: unsupported regressor_method.
        """
        # First, handle cases for unknown users and items
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unkown.')

        # The random generator is deliberately not re-seeded here: calling
        # rd.seed() on every estimate would override any seed set by the
        # caller for reproducibility.
        if self.regressor_method == 'random_score':
            return rd.uniform(0.5, 5)

        if self.regressor_method == 'random_sample':
            return rd.choice(self.user_profile[u])

        if self.regressor_method == 'linear_regression':
            linear_regressor = self.user_profile[u]
            if linear_regressor is None:
                raise PredictionImpossible('No regressor could be fitted for this user.')
            raw_item_id = self.trainset.to_raw_iid(i)
            if raw_item_id not in self.content_features.index:
                raise PredictionImpossible('No content features for this item.')
            item_features = self.content_features.loc[[raw_item_id]].values
            if pd.isna(item_features).any():
                raise PredictionImpossible('Content features of this item are incomplete.')
            return linear_regressor.predict(item_features)[0]

        # Returning None here would only surface later as an obscure TypeError
        # when the prediction gets clipped to the rating scale.
        raise NotImplementedError(
            f'Regressor method {self.regressor_method} not yet implemented'
        )
``` ```
%% Cell type:markdown id:ffd75b7e tags: %% Cell type:markdown id:ffd75b7e tags:
The following script tests the ContentBased class The following script tests the ContentBased class
%% Cell type:code id:69d12f7d tags: %% Cell type:code id:69d12f7d tags:
``` python ``` python
def test_contentbased_class(feature_method, regressor_method):
    """Smoke-test the ContentBased class on a single prediction.

    Fits the algorithm on the full trainset built from
    load_ratings(surprise_format=True), then predicts the first
    (user, item) entry of the anti-testset and prints the prediction.

    Args:
        feature_method: forwarded to ContentBased as its features method.
        regressor_method: forwarded to ContentBased as its regressor method.
    """
    full_trainset = load_ratings(surprise_format=True).build_full_trainset()

    algo = ContentBased(feature_method, regressor_method)
    algo.fit(full_trainset)

    # Only the first two fields of the entry (user, item) are needed.
    first_entry = full_trainset.build_anti_testset()[0]
    raw_user, raw_item = first_entry[0], first_entry[1]
    print(algo.predict(raw_user, raw_item))
# (call here the test functions with different regressor methods) # (call here the test functions with different regressor methods)
test_contentbased_class(feature_method = "title_length" , regressor_method = "random_score") test_contentbased_class(feature_method = "title_length" , regressor_method = "random_score")
test_contentbased_class(feature_method = "title_length" , regressor_method = "random_sample") test_contentbased_class(feature_method = "title_length" , regressor_method = "random_sample")
``` ```
%% Output %% Output
user: 15 item: 942 r_ui = None est = 3.59 {'was_impossible': False} user: 15 item: 942 r_ui = None est = 3.79 {'was_impossible': False}
user: 15 item: 942 r_ui = None est = 3.00 {'was_impossible': False} user: 15 item: 942 r_ui = None est = 4.00 {'was_impossible': False}
......
0% Chargement en cours ou .
You are about to add 0 people to the discussion. Proceed with caution.
Terminez d'abord l'édition de ce message.
Veuillez vous inscrire ou vous pour commenter