Skip to content
Extraits de code Groupes Projets
Valider 4c44fb82 rédigé par Adrien Payen's avatar Adrien Payen
Parcourir les fichiers

update

parent a7471cdc
Branches master
Aucune étiquette associée trouvée
Aucune requête de fusion associée trouvée
%% Cell type:markdown id:82d5ca82 tags: %% Cell type:markdown id:82d5ca82 tags:
# Packages
%% Cell type:code id:277473a3 tags: %% Cell type:code id:277473a3 tags:
``` python ``` python
%load_ext autoreload %load_ext autoreload
%autoreload 2 %autoreload 2
import numpy as np import numpy as np
import pandas as pd import pandas as pd
import random as rd import random as rd
from surprise import AlgoBase from surprise import AlgoBase
from surprise.prediction_algorithms.predictions import PredictionImpossible from surprise.prediction_algorithms.predictions import PredictionImpossible
from loaders import load_ratings from loaders import load_ratings
from loaders import load_items from loaders import load_items
from constants import Constant as C from constants import Constant as C
from sklearn.linear_model import LinearRegression from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import SVR from sklearn.svm import SVR
from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.feature_extraction.text import TfidfVectorizer
``` ```
%% Output %% Output
The autoreload extension is already loaded. To reload it, use: The autoreload extension is already loaded. To reload it, use:
%reload_ext autoreload %reload_ext autoreload
%% Cell type:markdown id:a42c16bf tags: %% Cell type:markdown id:a42c16bf tags:
# Explore and select content features
%% Cell type:code id:e8378976 tags: %% Cell type:code id:e8378976 tags:
``` python ``` python
# All the dataframes
df_items = load_items()
df_ratings = load_ratings()
df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME)
df_genome_score = pd.read_csv("data/hackathon/content/genome-scores.csv")
df_genome_tag = pd.read_csv("data/hackathon/content/genome-tags.csv")

# Example 1: number of characters in each item title
df_features = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('n_character_title')
display(df_features.head())

# Example 2: the raw tag text. df_tag is already loaded above, so the CSV is
# not read a second time.
df_features = df_tag[C.TAG]
display(df_features.head())

# (explore here other features)
``` ```
%% Output %% Output
%% Cell type:markdown id:a2c9a2b6 tags: %% Cell type:markdown id:a2c9a2b6 tags:
# Build a content-based model
When ready, move the following class to the *models.py* script.
%% Cell type:code id:16b0a602 tags: %% Cell type:code id:16b0a602 tags:
``` python ``` python
class ContentBased(AlgoBase):
    """Content-based recommender that learns one profile per user.

    Item content features are computed once at construction time by
    ``create_content_features``. ``fit`` then builds, for every user of the
    trainset, a profile whose nature depends on ``regressor_method``:

    * ``'random_score'``: a score drawn uniformly between 0.5 and 5;
    * ``'random_sample'``: the list of ratings given by the user;
    * ``'linear_regression'``, ``'svr_regression'``, ``'gradient_boosting'``,
      ``'random_forest'``: a scikit-learn regressor fitted on the features of
      the items the user rated.

    ``user_profile`` and ``user_profile_explain`` are keyed by the inner user
    ids of the trainset passed to ``fit``.
    """

    # Maps each supported regressor name to a factory that builds a fresh,
    # unfitted estimator, so that every user gets an independent model.
    _REGRESSOR_FACTORIES = {
        'linear_regression': lambda: LinearRegression(fit_intercept=False),
        'svr_regression': lambda: SVR(kernel='rbf', C=10, epsilon=0.2),
        'gradient_boosting': lambda: GradientBoostingRegressor(
            n_estimators=100, learning_rate=0.1, max_depth=3
        ),
        'random_forest': lambda: RandomForestRegressor(n_estimators=100),
    }

    def __init__(self, features_method, regressor_method):
        """Store the regressor choice and precompute the item features.

        Args:
            features_method: name of the feature set to build, or None for
                no features (see ``create_content_features``).
            regressor_method: name of the per-user profile to learn.
        """
        AlgoBase.__init__(self)
        self.regressor_method = regressor_method
        self.content_features = self.create_content_features(features_method)
        self.user_profile = {}
        self.user_profile_explain = {}

    def create_content_features(self, features_method):
        """Content Analyzer: build the item feature table.

        Every returned frame is meant to be indexed by raw item id, because
        ``fit`` and ``estimate`` look features up by that id.
        NOTE(review): this assumes ``load_items()`` returns a frame indexed by
        raw item id and that the tags file has a 'movieId' column -- confirm.

        Args:
            features_method: one of None, "title_length", "movie_year",
                "genres", "combination", "relevance", "rating", "tags",
                "tags_length".

        Returns:
            A DataFrame with one row per item, or None when
            ``features_method`` is None.

        Raises:
            NotImplementedError: if ``features_method`` is not supported.
        """
        if features_method is None:
            return None
        if features_method == "title_length":
            return self._title_length_features(load_items())
        if features_method == "movie_year":
            return self._movie_year_features(load_items())
        if features_method == "genres":
            return self._genre_features(load_items())
        if features_method == "combination":
            df_items = load_items()
            return pd.concat(
                [
                    self._genre_features(df_items),
                    self._title_length_features(df_items),
                    self._movie_year_features(df_items),
                ],
                axis=1,
            )
        if features_method == "relevance":
            df_genome_score = pd.read_csv("data/hackathon/content/genome-scores.csv")
            # Aggregate (not transform) so that the result has one row per
            # movie, indexed by movieId.
            return df_genome_score.groupby('movieId')["relevance"].mean().to_frame('avg_relevance')
        if features_method == "rating":
            df_ratings = load_ratings()
            return df_ratings.groupby('movieId')['rating'].mean().to_frame('avg_rating')
        if features_method in ("tags", "tags_length"):
            df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME)
            if features_method == "tags":
                # Number of comma-separated parts in each tag entry.
                per_row = df_tag['tag'].apply(
                    lambda x: len(x.split(',')) if isinstance(x, str) else 0
                )
                column = 'tags'
            else:
                # Number of characters in each tag entry, commas excluded.
                per_row = df_tag['tag'].apply(
                    lambda x: sum(len(tag) for tag in x.split(',')) if isinstance(x, str) else 0
                )
                column = 'n_character_tags'
            # The tags file has one row per tag entry; sum per movie so the
            # feature is indexed by movieId instead of by CSV row number.
            return per_row.groupby(df_tag['movieId']).sum().to_frame(column)
        # (implement other feature creations here)
        raise NotImplementedError(f'Feature method {features_method} not yet implemented')

    @staticmethod
    def _title_length_features(df_items):
        """Return the number of characters of each item title."""
        return df_items[C.LABEL_COL].str.len().to_frame('n_character_title')

    @staticmethod
    def _movie_year_features(df_items):
        """Return the four-digit year found in parentheses in each title.

        The year is converted to a number (NaN when the title has none) so
        that it can be fed to a regressor.
        """
        years = df_items['title'].str.extract(r'\((\d{4})\)', expand=False)
        return pd.to_numeric(years, errors='coerce').to_frame('movie_year')

    @staticmethod
    def _genre_features(df_items):
        """Return one 0/1 indicator column per genre of the '|'-separated list."""
        return df_items['genres'].str.get_dummies(sep='|').add_prefix('genre_')

    def fit(self, trainset):
        """Profile Learner: build one profile and one explanation per user.

        Args:
            trainset: the Surprise trainset to learn from.

        Returns:
            self, so that calls can be chained.

        Raises:
            ValueError: if a feature-based regressor is requested while no
                content features are available.
        """
        AlgoBase.fit(self, trainset)

        # Preallocate user profiles
        self.user_profile = {u: None for u in trainset.all_users()}
        self.user_profile_explain = {u: {} for u in trainset.all_users()}

        build_regressor = self._REGRESSOR_FACTORIES.get(self.regressor_method)
        has_features = self.content_features is not None
        if build_regressor is not None and not has_features:
            raise ValueError(
                f'Regressor method {self.regressor_method} requires content features'
            )

        for u in self.user_profile:
            if has_features:
                features, ratings = self._user_training_data(trainset, u)
                self.user_profile_explain[u] = self._feature_importance(features, ratings)
                # A user whose rated items all lack features keeps a None
                # profile; estimate() reports that as an impossible prediction.
                if build_regressor is not None and len(ratings) > 0:
                    regressor = build_regressor()
                    regressor.fit(features, ratings)
                    self.user_profile[u] = regressor

            if self.regressor_method == 'random_score':
                self.user_profile[u] = rd.uniform(0.5, 5)
            elif self.regressor_method == 'random_sample':
                self.user_profile[u] = [rating for _, rating in trainset.ur[u]]
        return self

    def _user_training_data(self, trainset, u):
        """Return (features, ratings) for the items rated by inner user ``u``.

        Items without a complete feature row are dropped, so both arrays have
        the same number of rows and contain no NaN.
        """
        raw_item_ids = [trainset.to_raw_iid(iid) for iid, _ in trainset.ur[u]]
        ratings = np.array([rating for _, rating in trainset.ur[u]], dtype=float)
        # reindex yields a NaN row for every item missing from the features.
        features = self.content_features.reindex(raw_item_ids).to_numpy(dtype=float)
        complete = ~np.isnan(features).any(axis=1)
        return features[complete], ratings[complete]

    def _feature_importance(self, features, ratings):
        """Return the rating-weighted average of each feature as a dict."""
        total = ratings.sum()
        if len(ratings) == 0 or total == 0:
            return {}
        importance = features.T.dot(ratings) / total
        return dict(zip(self.content_features.columns, importance.tolist()))

    def estimate(self, u, i):
        """Scoring component used for item filtering.

        Args:
            u: inner user id.
            i: inner item id.

        Returns:
            The estimated rating of item ``i`` for user ``u``.

        Raises:
            PredictionImpossible: if the user or item is unknown, if no
                profile or no features are available for the pair, or if the
                regressor method is not implemented.
        """
        # First, handle cases for unknown users and items
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unkown.')

        # The random generator is deliberately not re-seeded here, so that a
        # seed set by the caller keeps the predictions reproducible.
        if self.regressor_method == 'random_score':
            return rd.uniform(0.5, 5)
        if self.regressor_method == 'random_sample':
            return rd.choice(self.user_profile[u])

        if self.regressor_method in self._REGRESSOR_FACTORIES:
            regressor = self.user_profile[u]
            if regressor is None:
                raise PredictionImpossible('No profile could be learned for this user.')
            raw_item_id = self.trainset.to_raw_iid(i)
            # reindex yields NaN when the item has no feature row, so a single
            # NaN check covers both missing items and missing values.
            item_features = self.content_features.reindex([raw_item_id]).to_numpy(dtype=float)
            if np.isnan(item_features).any():
                raise PredictionImpossible('No content features for this item.')
            return regressor.predict(item_features)[0]

        # (implement here the regressor prediction)
        raise PredictionImpossible(
            f'Regressor method {self.regressor_method} not yet implemented'
        )

    def explain(self, u):
        """Return the feature importances learned for inner user ``u``.

        Returns an empty dict when the user is unknown or the model has not
        been fitted yet.
        """
        return self.user_profile_explain.get(u, {})
# Smoke check: the model is not fitted here, so no user profile exists yet and
# explain() returns an empty dict.
cb = ContentBased("movie_year", "random_sample")
print(cb.explain('11'))
print('test')
``` ```
%% Output %% Output
{} {}
%% Cell type:code id:baab88b7 tags: %% Cell type:code id:baab88b7 tags:
``` python ``` python
from pprint import pprint

# TF-IDF vectorizer for the genres. Genres are separated by '|', so a token is
# any run of characters other than '|'. The default word pattern would split
# hyphenated genres such as "Sci-Fi" into two unrelated tokens.
tfidf_vectorizer = TfidfVectorizer(token_pattern=r"[^|]+")

# Fit and transform to compute the TF-IDF matrix of the genres
tfidf_matrix = tfidf_vectorizer.fit_transform(df_items['genres'])

# Genre names (one per feature column)
genre_names = tfidf_vectorizer.get_feature_names_out()

# Build a DataFrame from the TF-IDF matrix, keeping the index of df_items so
# that each row can still be matched to its item
df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=genre_names, index=df_items.index)

print("Matrice TF-IDF des genres :")
display(df_tfidf)
``` ```
%% Output %% Output
Matrice TF-IDF des genres : Matrice TF-IDF des genres :
%% Cell type:markdown id:ffd75b7e tags: %% Cell type:markdown id:ffd75b7e tags:
The following script tests the ContentBased class.
%% Cell type:code id:69d12f7d tags: %% Cell type:code id:69d12f7d tags:
``` python ``` python
def test_contentbased_class(feature_method, regressor_method):
    """Run a one-prediction smoke test of the ContentBased class.

    Loads the ratings in surprise format, builds the full trainset, fits a
    ContentBased model configured with the given methods, then predicts for
    the first (user, item) entry of the trainset's anti-testset and prints
    the resulting prediction.

    Args:
        feature_method: Feature method passed through to ContentBased.
        regressor_method: Regressor method passed through to ContentBased.
    """
    ratings = load_ratings(surprise_format=True)
    train_set = ratings.build_full_trainset()

    content_algo = ContentBased(feature_method, regressor_method)
    content_algo.fit(train_set)

    # Only the first anti-testset entry is used; its first two fields are
    # passed to predict() as the user and the item.
    first_entry = train_set.build_anti_testset()[0]
    user_id = first_entry[0]
    item_id = first_entry[1]

    print(content_algo.predict(user_id, item_id))
# print("title_length :") # print("title_length :")
# test_contentbased_class(feature_method = "title_length" , regressor_method = "random_score") # test_contentbased_class(feature_method = "title_length" , regressor_method = "random_score")
# test_contentbased_class(feature_method = "title_length" , regressor_method = "random_sample") # test_contentbased_class(feature_method = "title_length" , regressor_method = "random_sample")
# test_contentbased_class(feature_method = "title_length" , regressor_method = "linear_regression") # test_contentbased_class(feature_method = "title_length" , regressor_method = "linear_regression")
# test_contentbased_class(feature_method= "title_length", regressor_method= "svr_regression") # test_contentbased_class(feature_method= "title_length", regressor_method= "svr_regression")
# test_contentbased_class(feature_method= "title_length", regressor_method= "gradient_boosting") # test_contentbased_class(feature_method= "title_length", regressor_method= "gradient_boosting")
# test_contentbased_class(feature_method= "title_length", regressor_method= "random_forest") # test_contentbased_class(feature_method= "title_length", regressor_method= "random_forest")
# print("\n") # print("\n")
# print("movie_year : ") # print("movie_year : ")
# test_contentbased_class(feature_method= "movie_year", regressor_method= "random_score") # test_contentbased_class(feature_method= "movie_year", regressor_method= "random_score")
# test_contentbased_class(feature_method= "movie_year", regressor_method= "random_sample") # test_contentbased_class(feature_method= "movie_year", regressor_method= "random_sample")
# test_contentbased_class(feature_method= "movie_year", regressor_method= "linear_regression") # test_contentbased_class(feature_method= "movie_year", regressor_method= "linear_regression")
# test_contentbased_class(feature_method= "movie_year", regressor_method= "svr_regression") # test_contentbased_class(feature_method= "movie_year", regressor_method= "svr_regression")
# test_contentbased_class(feature_method= "movie_year", regressor_method= "gradient_boosting") # test_contentbased_class(feature_method= "movie_year", regressor_method= "gradient_boosting")
# test_contentbased_class(feature_method= "movie_year", regressor_method= "random_forest") # test_contentbased_class(feature_method= "movie_year", regressor_method= "random_forest")
# print("\n") # print("\n")
# print("relevance : ") # print("relevance : ")
# test_contentbased_class(feature_method= "relevance", regressor_method= "random_score") # test_contentbased_class(feature_method= "relevance", regressor_method= "random_score")
# test_contentbased_class(feature_method= "relevance", regressor_method= "random_sample") # test_contentbased_class(feature_method= "relevance", regressor_method= "random_sample")
# test_contentbased_class(feature_method= "relevance", regressor_method= "linear_regression") # test_contentbased_class(feature_method= "relevance", regressor_method= "linear_regression")
# test_contentbased_class(feature_method= "relevance", regressor_method= "svr_regression") # test_contentbased_class(feature_method= "relevance", regressor_method= "svr_regression")
# test_contentbased_class(feature_method= "relevance", regressor_method= "gradient_boosting") # test_contentbased_class(feature_method= "relevance", regressor_method= "gradient_boosting")
# test_contentbased_class(feature_method= "relevance", regressor_method= "random_forest") # test_contentbased_class(feature_method= "relevance", regressor_method= "random_forest")
# print("\n") # print("\n")
# print("genres : ") # print("genres : ")
# test_contentbased_class(feature_method= "genres", regressor_method= "random_score") # test_contentbased_class(feature_method= "genres", regressor_method= "random_score")
# test_contentbased_class(feature_method= "genres", regressor_method= "random_sample") # test_contentbased_class(feature_method= "genres", regressor_method= "random_sample")
# test_contentbased_class(feature_method= "genres", regressor_method= "linear_regression") # test_contentbased_class(feature_method= "genres", regressor_method= "linear_regression")
# test_contentbased_class(feature_method= "genres", regressor_method= "svr_regression") # test_contentbased_class(feature_method= "genres", regressor_method= "svr_regression")
# test_contentbased_class(feature_method= "genres", regressor_method= "gradient_boosting") # test_contentbased_class(feature_method= "genres", regressor_method= "gradient_boosting")
# test_contentbased_class(feature_method= "genres", regressor_method= "random_forest") # test_contentbased_class(feature_method= "genres", regressor_method= "random_forest")
# print("\n") # print("\n")
# print("rating : ") # print("rating : ")
# test_contentbased_class(feature_method= "rating", regressor_method="random_score") # test_contentbased_class(feature_method= "rating", regressor_method="random_score")
# test_contentbased_class(feature_method= "rating", regressor_method="random_sample") # test_contentbased_class(feature_method= "rating", regressor_method="random_sample")
# # test_contentbased_class(feature_method= "rating", regressor_method="linear_regression") # # test_contentbased_class(feature_method= "rating", regressor_method="linear_regression")
# #test_contentbased_class(feature_method="rating", regressor_method="svr_regression") # #test_contentbased_class(feature_method="rating", regressor_method="svr_regression")
# #test_contentbased_class(feature_method="rating", regressor_method="gradient_boosting") # #test_contentbased_class(feature_method="rating", regressor_method="gradient_boosting")
# #test_contentbased_class(feature_method="rating", regressor_method="random_forest") # #test_contentbased_class(feature_method="rating", regressor_method="random_forest")
# print("\n") # print("\n")
# print("tags : ") # print("tags : ")
# test_contentbased_class(feature_method="tags", regressor_method="random_score") # test_contentbased_class(feature_method="tags", regressor_method="random_score")
# test_contentbased_class(feature_method="tags", regressor_method="random_sample") # test_contentbased_class(feature_method="tags", regressor_method="random_sample")
# #test_contentbased_class(feature_method="tags", regressor_method="linear_regression") # #test_contentbased_class(feature_method="tags", regressor_method="linear_regression")
# # test_contentbased_class(feature_method="tags", regressor_method="svr_regression") # # test_contentbased_class(feature_method="tags", regressor_method="svr_regression")
# # test_contentbased_class(feature_method="tags", regressor_method="gradient_boosting") # # test_contentbased_class(feature_method="tags", regressor_method="gradient_boosting")
# # test_contentbased_class(feature_method="tags", regressor_method="random_forest") # # test_contentbased_class(feature_method="tags", regressor_method="random_forest")
# print("\n") # print("\n")
# print("tags_length : ") # print("tags_length : ")
# test_contentbased_class(feature_method="tags_length", regressor_method="random_score") # test_contentbased_class(feature_method="tags_length", regressor_method="random_score")
# test_contentbased_class(feature_method="tags_length", regressor_method="random_sample") # test_contentbased_class(feature_method="tags_length", regressor_method="random_sample")
# test_contentbased_class(feature_method="tags_length", regressor_method="linear_regression") # test_contentbased_class(feature_method="tags_length", regressor_method="linear_regression")
# test_contentbased_class(feature_method="tags_length", regressor_method="svr_regression") # test_contentbased_class(feature_method="tags_length", regressor_method="svr_regression")
# test_contentbased_class(feature_method="tags_length", regressor_method="gradient_boosting") # test_contentbased_class(feature_method="tags_length", regressor_method="gradient_boosting")
# test_contentbased_class(feature_method="tags_length", regressor_method="random_forest") # test_contentbased_class(feature_method="tags_length", regressor_method="random_forest")
# print("\n") # print("\n")
# print("combination : ") # print("combination : ")
# test_contentbased_class(feature_method="combination", regressor_method="random_score") # test_contentbased_class(feature_method="combination", regressor_method="random_score")
# test_contentbased_class(feature_method="combination", regressor_method="random_sample") # test_contentbased_class(feature_method="combination", regressor_method="random_sample")
# test_contentbased_class(feature_method="combination", regressor_method="linear_regression") # test_contentbased_class(feature_method="combination", regressor_method="linear_regression")
# test_contentbased_class(feature_method="combination", regressor_method="svr_regression") # test_contentbased_class(feature_method="combination", regressor_method="svr_regression")
# test_contentbased_class(feature_method="combination", regressor_method="gradient_boosting") # test_contentbased_class(feature_method="combination", regressor_method="gradient_boosting")
# test_contentbased_class(feature_method="combination", regressor_method="random_forest") # test_contentbased_class(feature_method="combination", regressor_method="random_forest")
``` ```
......
0% Chargement en cours ou .
You are about to add 0 people to the discussion. Proceed with caution.
Terminez d'abord l'édition de ce message.
Veuillez vous inscrire ou vous pour commenter