Skip to content
Extraits de code Groupes Projets
Valider ad331907 rédigé par Adrien Payen's avatar Adrien Payen
Parcourir les fichiers

update files

parent aa44f259
Aucune branche associée trouvée
Aucune étiquette associée trouvée
Aucune requête de fusion associée trouvée
%% Cell type:markdown id:82d5ca82 tags:
# Packages
%% Cell type:code id:277473a3 tags:
``` python
%load_ext autoreload
%autoreload 2
# third parties imports
import pandas as pd
import numpy as np
import random as rd
from surprise import AlgoBase, SVD
from surprise import PredictionImpossible
# import local
from sklearn.feature_extraction.text import TfidfVectorizer
from loaders import load_items, load_ratings
from constants import Constant as C
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
```
%% Output
The autoreload extension is already loaded. To reload it, use:
%reload_ext autoreload
%% Cell type:markdown id:a42c16bf tags:
# Explore and select content features
%% Cell type:code id:e8378976 tags:
``` python
# Load every dataframe used in this exploration cell exactly once.
df_items = load_items()
df_ratings = load_ratings()
df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME)
#df_genome_score = pd.read_csv("data/hackathon/content/genome-scores.csv")
# df_genome_tag = pd.read_csv("data/hackathon/content/genome-tags.csv")

# Example 1 : create title_length features
# (number of characters in each item's label column)
df_features = df_items[C.LABEL_COL].apply(len).to_frame('n_character_title')
display(df_features.head())

# Example 2 : look at the tag column; df_tag was already read above, so the
# CSV is not loaded a second time.
df_features = df_tag[C.TAG]
display(df_features.head())

# (explore here other features)
```
%% Output
%% Cell type:markdown id:a2c9a2b6 tags:
# Build a content-based model
When ready, move the following class into the *models.py* script.
%% Cell type:code id:16b0a602 tags:
``` python
# ContentBased
class ContentBased(AlgoBase):
    """Content-based rating predictor that learns one profile per user.

    Item features are built once at construction time from the requested
    feature methods. ``fit`` then learns, for each user of the trainset,
    either a random baseline or a regressor mapping item features to that
    user's ratings. It also computes a per-user feature-importance
    dictionary that ``explain`` exposes.

    Args:
        features_method: iterable of feature names to build. Supported values
            are ``"title_length"``, ``"movie_year"``, ``"genre"`` and
            ``"avg_rating"``.
        regressor_method: ``"random_score"``, ``"random_sample"`` or one of
            the regressor keys listed in ``fit``.
    """

    def __init__(self, features_method, regressor_method):
        AlgoBase.__init__(self)
        self.regressor_method = regressor_method
        self.features_methods = features_method
        self.content_features = self.create_content_features(features_method)
        # Inner user id -> learned profile (float, list of ratings or regressor).
        self.user_profile = {}
        # Raw user id -> {feature name: importance}.
        self.user_profile_explain = {}

    def create_content_features(self, features_methods):
        """Content Analyzer: build the item-feature dataframe.

        Args:
            features_methods: iterable of feature names (see class docstring).

        Returns:
            A dataframe indexed like ``load_items()`` with one or more columns
            per requested feature; missing values are replaced by 0.

        Raises:
            NotImplementedError: if a requested feature name is not supported.
        """
        df_items = load_items()
        # Ratings are only needed for 'avg_rating'; loaded lazily below.
        df_ratings = None
        df_features = pd.DataFrame(index=df_items.index)

        for method in features_methods:
            if method == "title_length":
                df_title_length = df_items[C.LABEL_COL].apply(len).to_frame('title_length')
                df_features = pd.concat([df_features, df_title_length], axis=1)
            elif method == "movie_year":
                # The year is the 4-digit number between parentheses in the
                # title; titles without one get 0.
                df_movie_year = df_items['title'].str.extract(r'\((\d{4})\)', expand=False).to_frame('movie_year')
                df_features = pd.concat([df_features, df_movie_year.astype(float).fillna(0)], axis=1)
            elif method == "genre":
                # Genres are '|'-separated; each genre becomes a TF-IDF column.
                tfidf_vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split('|'), token_pattern=None)
                tfidf_matrix = tfidf_vectorizer.fit_transform(df_items['genres'])
                df_tfidf_genres = pd.DataFrame(
                    tfidf_matrix.toarray(),
                    index=df_items.index,
                    columns=tfidf_vectorizer.get_feature_names_out(),
                )
                df_features = pd.concat([df_features, df_tfidf_genres], axis=1)
            elif method == "avg_rating":
                if df_ratings is None:
                    df_ratings = load_ratings()
                df_avg_rating = df_ratings.groupby('movieId')['rating'].mean().to_frame('avg_rating')
                # NOTE(review): relies on the items index being named
                # 'movieId' -- confirm against load_items().
                df_features = df_features.join(df_avg_rating, on='movieId')
            else:
                raise NotImplementedError(f'Feature method {method} not yet implemented')

        # Handle missing values in df_features
        return df_features.fillna(0)

    def fit(self, trainset):
        """Profile Learner: learn one profile per user of ``trainset``.

        Args:
            trainset: a Surprise trainset.

        Returns:
            ``self``, so that calls can be chained.

        Raises:
            NotImplementedError: if ``regressor_method`` is not supported.
        """
        AlgoBase.fit(self, trainset)

        # Preallocate user profiles
        self.user_profile = {u: None for u in trainset.all_users()}
        self.user_profile_explain = {}

        epsilon = 1e-10  # Small value to prevent division by zero

        # Explanation: rating-weighted sum of column-normalised features,
        # divided by the sum of the user's ratings.
        for u in trainset.all_users():
            raw_user_id = trainset.to_raw_uid(u)
            user_ratings = np.array([rating for (_, rating) in trainset.ur[u]])
            raw_item_ids = [trainset.to_raw_iid(iid) for (iid, _) in trainset.ur[u]]
            feature_values = self.content_features.loc[raw_item_ids].values
            norms = np.linalg.norm(feature_values, axis=0) + epsilon
            weighted_features = feature_values / norms
            feature_importance = weighted_features.T @ user_ratings
            feature_importance /= np.sum(user_ratings)
            self.user_profile_explain[raw_user_id] = dict(
                zip(self.content_features.columns, feature_importance)
            )

        if self.regressor_method == 'random_score':
            for u in self.user_profile:
                self.user_profile[u] = rd.uniform(0.5, 5)
        elif self.regressor_method == 'random_sample':
            for u in self.user_profile:
                self.user_profile[u] = [rating for (_, rating) in trainset.ur[u]]
        else:
            # Factories rather than instances: every user must get a fresh
            # estimator. Refitting one shared instance would leave every
            # profile pointing at the model fitted on the last user.
            regressor_factories = {
                'linear_regression': lambda: LinearRegression(fit_intercept=False),
                'svr_regression': lambda: SVR(kernel='rbf', C=10, epsilon=0.2),
                'gradient_boosting': lambda: GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3),
                'random_forest': lambda: RandomForestRegressor(n_estimators=100),
                'lasso_regression': lambda: Lasso(alpha=0.1),
                'ridge_regression': lambda: Ridge(alpha=1.0),
                'elastic_net': lambda: ElasticNet(alpha=1.0, l1_ratio=0.5),
                'knn_regression': lambda: KNeighborsRegressor(n_neighbors=1),
                'decision_tree': lambda: DecisionTreeRegressor(max_depth=5),
                'adaboost': lambda: AdaBoostRegressor(n_estimators=50),
                'xgboost': lambda: XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3),
                'lightgbm': lambda: LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=3),
            }

            if self.regressor_method not in regressor_factories:
                raise NotImplementedError(f'Regressor method {self.regressor_method} not yet implemented')
            make_regressor = regressor_factories[self.regressor_method]

            for u in self.user_profile:
                user_ratings = [rating for (_, rating) in trainset.ur[u]]
                raw_item_ids = [trainset.to_raw_iid(iid) for (iid, _) in trainset.ur[u]]

                df_user = pd.DataFrame({'item_id': raw_item_ids, 'user_ratings': user_ratings})
                df_user = df_user.merge(self.content_features, left_on="item_id", right_index=True, how='left')

                X = df_user.drop(columns=['item_id', 'user_ratings'])
                y = df_user['user_ratings']

                regressor = make_regressor()
                regressor.fit(X, y)
                self.user_profile[u] = regressor

        return self

    def estimate(self, u, i):
        """Scoring component used for item filtering.

        Args:
            u: inner user id.
            i: inner item id.

        Returns:
            The estimated rating of user ``u`` for item ``i``.

        Raises:
            PredictionImpossible: if the user or the item is not in the
                trainset.
        """
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unknown.')

        if self.regressor_method == 'random_score':
            return rd.uniform(0.5, 5)
        if self.regressor_method == 'random_sample':
            return rd.choice(self.user_profile[u])

        raw_item_id = self.trainset.to_raw_iid(i)
        item_features = self.content_features.loc[raw_item_id, :].values.reshape(1, -1)
        regressor = self.user_profile[u]
        # Rebuild a one-row dataframe so the column names match those seen
        # by the regressor at fit time.
        item_features_df = pd.DataFrame(item_features, columns=self.content_features.columns)
        return regressor.predict(item_features_df)[0]

    def explain(self, u):
        """Return the feature-importance dict of raw user id ``u``.

        Returns ``None`` when no explanation was computed for ``u``.
        """
        return self.user_profile_explain.get(u)
# Example usage: build the model on the full trainset.
cb = ContentBased(["title_length", "movie_year","genre","avg_rating"], "ridge_regression")
surprise_data = load_ratings(surprise_format=True)
trainset = surprise_data.build_full_trainset()
testset = trainset.build_anti_testset()
cb.fit(trainset)
#print("RMSE: ", cb.rmse(testset))

# Example explanations for a handful of raw user ids.
for example_user_id in (11, 13, 17, 23, 27, 73):
    print(cb.explain(example_user_id))
```
%% Output
{'title_length': 0.1497645139703848, 'movie_year': 0.16218667420100635, '(no genres listed)': 0.0, 'action': 0.09449072815753193, 'adventure': 0.08778978776313201, 'animation': 0.0, 'children': 0.038431411145366176, 'comedy': 0.07268129109348041, 'crime': 0.09469516433772891, 'documentary': 0.0611428358670058, 'drama': 0.10494783392380302, 'fantasy': 0.025806451608591505, 'film-noir': 0.025806451609512046, 'horror': 0.018342712153336858, 'imax': 0.06947533670577526, 'musical': 0.0, 'mystery': 0.06234903350217154, 'romance': 0.036771716124540825, 'sci-fi': 0.059571001735546115, 'thriller': 0.0993122803165238, 'war': 0.04002978709072218, 'western': 0.04547648227079719, 'avg_rating': 0.16263357553020436}
{'title_length': 0.12975573389578626, 'movie_year': 0.13738555574364605, '(no genres listed)': 0.0, 'action': 0.0640388318396414, 'adventure': 0.0827515664964472, 'animation': 0.05686854568650957, 'children': 0.06799492283569505, 'comedy': 0.07354182680364503, 'crime': 0.05543740962624167, 'documentary': 0.0, 'drama': 0.09170589087803577, 'fantasy': 0.061481521263689595, 'film-noir': 0.0, 'horror': 0.015113350123518238, 'imax': 0.04592205020685974, 'musical': 0.03201459126079391, 'mystery': 0.03412706135338736, 'romance': 0.05989121250223656, 'sci-fi': 0.04370793816378273, 'thriller': 0.045800659191095036, 'war': 0.04907194751877139, 'western': 0.027287416762806844, 'avg_rating': 0.13740560847192132}
{'title_length': 0.04702378569892371, 'movie_year': 0.052440003628289225, '(no genres listed)': 0.0, 'action': 0.020439581335728367, 'adventure': 0.015593308332521032, 'animation': 0.004256286923052558, 'children': 0.003520723090188317, 'comedy': 0.018972762464944913, 'crime': 0.028340544273099223, 'documentary': 0.005823989517206729, 'drama': 0.037415345194166824, 'fantasy': 0.013643903080149476, 'film-noir': 0.015390183296279798, 'horror': 0.01926898253629829, 'imax': 0.0014716703456143566, 'musical': 0.0061519348279224124, 'mystery': 0.02847033164163413, 'romance': 0.019827342468818163, 'sci-fi': 0.022573488552024915, 'thriller': 0.03522231545147593, 'war': 0.010339617301415098, 'western': 0.005663885036293055, 'avg_rating': 0.05327750989412312}
{'title_length': 0.033402138126294736, 'movie_year': 0.03710065977291947, '(no genres listed)': 0.0, 'action': 0.014528522669579273, 'adventure': 0.013963913494241694, 'animation': 0.005764814103226412, 'children': 0.006513197483932152, 'comedy': 0.017763201411495646, 'crime': 0.016002513666599556, 'documentary': 0.004292962983778595, 'drama': 0.027458210593047847, 'fantasy': 0.009302633945770895, 'film-noir': 0.006823368830454359, 'horror': 0.007391689869010394, 'imax': 0.004855154663168369, 'musical': 0.0058909467772061425, 'mystery': 0.012191560732760487, 'romance': 0.01723631022081761, 'sci-fi': 0.010817269433255231, 'thriller': 0.01658593988724716, 'war': 0.010193212979882352, 'western': 0.0052038255339472966, 'avg_rating': 0.03742403427834079}
{'title_length': 0.20154225634108316, 'movie_year': 0.20848962267389695, '(no genres listed)': 0.0, 'action': 0.04545454544645529, 'adventure': 0.04545454544730129, 'animation': 0.0, 'children': 0.0, 'comedy': 0.07177284969293253, 'crime': 0.1145252645738102, 'documentary': 0.0, 'drama': 0.16778172557550536, 'fantasy': 0.0, 'film-noir': 0.0, 'horror': 0.06315936177961773, 'imax': 0.0, 'musical': 0.0, 'mystery': 0.08510520557533159, 'romance': 0.09754755529442835, 'sci-fi': 0.045454545449454146, 'thriller': 0.12542163704872258, 'war': 0.08035304331050673, 'western': 0.0, 'avg_rating': 0.21152969571139305}
{'title_length': 0.021927486954368552, 'movie_year': 0.02488786702116846, '(no genres listed)': 0.0007363092498113207, 'action': 0.013836432470735639, 'adventure': 0.011610617815573265, 'animation': 0.007520799115717832, 'children': 0.006287966766754299, 'comedy': 0.012951125615087338, 'crime': 0.011084119744598393, 'documentary': 0.0018287715645832062, 'drama': 0.015221252640276463, 'fantasy': 0.008631010164284143, 'film-noir': 0.0024629052522566544, 'horror': 0.008816299251739122, 'imax': 0.005347204099216887, 'musical': 0.0038827346462235236, 'mystery': 0.0068652812039576095, 'romance': 0.008086664541950757, 'sci-fi': 0.010304269379559203, 'thriller': 0.013200133984104478, 'war': 0.005127335699821772, 'western': 0.0036215200349232765, 'avg_rating': 0.025470698706944836}
%% Cell type:markdown id:ffd75b7e tags:
The following script tests the ContentBased class.
%% Cell type:code id:69d12f7d tags:
``` python
def test_contentbased_class(feature_method, regressor_method):
    """Smoke-test the ContentBased class.

    Fits the model on the full trainset, then predicts the rating of the
    first (user, item) pair the user has not rated and prints the prediction.

    Args:
        feature_method: list of feature names passed to ``ContentBased``.
        regressor_method: regressor name passed to ``ContentBased``.

    Raises:
        IndexError: if every user has rated every item, so that no unrated
            pair exists.
    """
    sp_ratings = load_ratings(surprise_format=True)
    train_set = sp_ratings.build_full_trainset()
    content_algo = ContentBased(feature_method, regressor_method)
    content_algo.fit(train_set)

    def first_unrated_pair():
        # Same iteration order as Trainset.build_anti_testset(), but stops at
        # the first hit instead of materialising every unrated pair.
        for inner_uid in train_set.all_users():
            rated = {inner_iid for (inner_iid, _) in train_set.ur[inner_uid]}
            for inner_iid in train_set.all_items():
                if inner_iid not in rated:
                    return train_set.to_raw_uid(inner_uid), train_set.to_raw_iid(inner_iid)
        raise IndexError('No unrated (user, item) pair in the trainset')

    raw_uid, raw_iid = first_unrated_pair()
    prediction = content_algo.predict(raw_uid, raw_iid)
    print(prediction)


test_contentbased_class(["title_length", "movie_year","genre","avg_rating"], "ridge_regression")
```
%% Output
user: 1 item: 10 r_ui = None est = 0.72 {'was_impossible': False}
...@@ -683,9 +683,8 @@ def compare_similarity_measures(trainset,testset): ...@@ -683,9 +683,8 @@ def compare_similarity_measures(trainset,testset):
results['KNN_Pearson_RMSE'] = rmse_pearson results['KNN_Pearson_RMSE'] = rmse_pearson
results['KNN_Pearson_MAE'] = mae_pearson results['KNN_Pearson_MAE'] = mae_pearson
# Train and evaluate KNN model with Jaccard similarity # Train and evaluate KNN model with Jaccard similarity
sim_options_jaccard = {'name': 'jaccard','user_based': True} sim_options_jaccard = {'name': '','user_based': True}
user_based_jaccard = KNNWithMeans(sim_options=sim_options_jaccard) user_based_jaccard = KNNWithMeans(sim_options=sim_options_jaccard)
user_based_jaccard.fit(trainset) user_based_jaccard.fit(trainset)
predictions_jaccard = user_based_jaccard.test(testset) predictions_jaccard = user_based_jaccard.test(testset)
...@@ -771,33 +770,33 @@ def evaluate_inter_user_diversity(user_based_model, ratings_path, other_user_bas ...@@ -771,33 +770,33 @@ def evaluate_inter_user_diversity(user_based_model, ratings_path, other_user_bas
inter_user_diversity_scores['UserBased'] = user_based_model.inter_user_diversity(all_top_n_recommendations_ub) inter_user_diversity_scores['UserBased'] = user_based_model.inter_user_diversity(all_top_n_recommendations_ub)
# # #KNN model # #KNN model
# knn_model = RecommenderSystem_KNN(ratings_path) knn_model = RecommenderSystem_KNN(ratings_path)
# knn_model.train_knn_model() knn_model.train_knn_model()
# all_top_n_recommendations_knn = {} all_top_n_recommendations_knn = {}
# for user_id in range(knn_model.trainset.n_users): for user_id in range(knn_model.trainset.n_users):
# try: try:
# trainset_user_id = knn_model.trainset.to_raw_uid(user_id) trainset_user_id = knn_model.trainset.to_raw_uid(user_id)
# top_n_recommendations_knn = knn_model.get_top_n_recommendations(trainset_user_id, n=10) top_n_recommendations_knn = knn_model.get_top_n_recommendations(trainset_user_id, n=10)
# all_top_n_recommendations_knn[trainset_user_id] = top_n_recommendations_knn all_top_n_recommendations_knn[trainset_user_id] = top_n_recommendations_knn
# except ValueError: except ValueError:
# print(f"User {trainset_user_id} is not part of the training set for KNN model. Skipping...") print(f"User {trainset_user_id} is not part of the training set for KNN model. Skipping...")
# inter_user_diversity_scores['KNN'] = knn_model.inter_user_diversity(all_top_n_recommendations_knn) inter_user_diversity_scores['KNN'] = knn_model.inter_user_diversity(all_top_n_recommendations_knn)
# # Other user-based models # Other user-based models
# for other_model in other_user_based_models: for other_model in other_user_based_models:
# other_model.load_model() other_model.load_model()
# all_top_n_recommendations_other = {} all_top_n_recommendations_other = {}
# # Get predictions for all users in the test set # Get predictions for all users in the test set
# all_user_ids = set(user for user, _, _ in testset) all_user_ids = set(user for user, _, _ in testset)
# for user_id in all_user_ids: for user_id in all_user_ids:
# other_model.user_id = user_id # Update the user ID for the model other_model.user_id = user_id # Update the user ID for the model
# top_n_predictions = other_model.get_top_n_predictions_for_user(ratings_path, n=10) top_n_predictions = other_model.get_top_n_predictions_for_user(ratings_path, n=10)
# all_top_n_recommendations_other[user_id] = top_n_predictions all_top_n_recommendations_other[user_id] = top_n_predictions
# inter_user_diversity_scores[f'Other_{other_model.user_name}'] = other_model.inter_user_diversity(all_top_n_recommendations_other) inter_user_diversity_scores[f'Other_{other_model.user_name}'] = other_model.inter_user_diversity(all_top_n_recommendations_other)
return inter_user_diversity_scores return inter_user_diversity_scores
...@@ -983,8 +982,7 @@ class ContentBased(AlgoBase): ...@@ -983,8 +982,7 @@ class ContentBased(AlgoBase):
'knn_regression': KNeighborsRegressor(n_neighbors=1), 'knn_regression': KNeighborsRegressor(n_neighbors=1),
'decision_tree': DecisionTreeRegressor(max_depth=5), 'decision_tree': DecisionTreeRegressor(max_depth=5),
'adaboost': AdaBoostRegressor(n_estimators=50), 'adaboost': AdaBoostRegressor(n_estimators=50),
'xgboost': XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3), 'xgboost': XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)
'lightgbm': LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)
} }
if self.regressor_method not in regressor_models: if self.regressor_method not in regressor_models:
......
0% Chargement en cours ou .
You are about to add 0 people to the discussion. Proceed with caution.
Terminez d'abord l'édition de ce message.
Veuillez vous inscrire ou vous pour commenter