Compare revisions

Changes are shown as if the source revision was merged into the target revision.

Target project: recommender_system/recomsys
Commits on the source (3)
......@@ -43,6 +43,65 @@ class EvalConfig:
("baseline_2", ModelBaseline2, {}),
("baseline_3", ModelBaseline3, {}),
("baseline_4", ModelBaseline4, {}),
<<<<<<< HEAD
("title_length_ContentBased_sample", ContentBased, {"features_method" : ["title_length"], "regressor_method" : "random_sample"}),
("title_length_ContentBased_score", ContentBased, {"features_method" : ["title_length"], "regressor_method" : "random_score"}),
("title_length_ContentBased_Lr", ContentBased, {"features_method" : ["title_length"], "regressor_method" : "linear_regression"}),
("title_length_ContentBased_Svr", ContentBased, {"features_method" : ["title_length"], "regressor_method" : "svr_regression"}),
("title_length_ContentBased_Gb", ContentBased, {"features_method" : ["title_length"], "regressor_method" : "gradient_boosting"}),
("title_length_ContentBased_Rf", ContentBased, {"features_method" : ["title_length"], "regressor_method" : "random_forest"}),
("movie_year_ContentBased_sample", ContentBased, {"features_method" : "movie_year", "regressor_method" : "random_sample"}),
("movie_year_ContentBased_score", ContentBased, {"features_method" : "movie_year", "regressor_method" : "random_score"}),
#("movie_year_ContentBased_Lr", ContentBased, {"features_method" : "movie_year", "regressor_method" : "linear_regression"}),
#("movie_year_ContentBased_Svr", ContentBased, {"features_method" : "movie_year", "regressor_method" : "svr_regression"}),
#("movie_year_ContentBased_Gb", ContentBased, {"features_method" : "movie_year", "regressor_method" : "gradient_boosting"}),
#("movie_year_ContentBased_Rf", ContentBased, {"features_method" : "movie_year", "regressor_method" : "random_forest"}),
("genres_ContentBased_sample", ContentBased, {"features_method" : "genres", "regressor_method" : "random_sample"}),
("genres_ContentBased_score", ContentBased, {"features_method" : "genres", "regressor_method" : "random_score"}),
#("genres_ContentBased_Lr", ContentBased, {"features_method" : "genres", "regressor_method" : "linear_regression"}),
#("genres_ContentBased_Svr", ContentBased, {"features_method" : "genres", "regressor_method" : "svr_regression"}),
#("genres_ContentBased_Gb", ContentBased, {"features_method" : "genres", "regressor_method" : "gradient_boosting"}),
#("genres_ContentBased_Rf", ContentBased, {"features_method" : "genres", "regressor_method" : "random_forest"}),
("rating_ContentBased_sample", ContentBased, {"features_method" : "rating", "regressor_method" : "random_sample"}),
("rating_ContentBased_score", ContentBased, {"features_method" : "rating", "regressor_method" : "random_score"}),
#("rating_ContentBased_Lr", ContentBased, {"features_method" : "rating", "regressor_method" : "linear_regression"}),
#("rating_ContentBased_Svr", ContentBased, {"features_method" : "rating", "regressor_method" : "svr_regression"}),
#("rating_ContentBased_Gb", ContentBased, {"features_method" : "rating", "regressor_method" : "gradient_boosting"}),
#("rating_ContentBased_Rf", ContentBased, {"features_method" : "rating", "regressor_method" : "random_forest"}),
("tags_ContentBased_sample", ContentBased, {"features_method" : "tags", "regressor_method" : "random_sample"}),
("tags_ContentBased_score", ContentBased, {"features_method" : "tags", "regressor_method" : "random_score"}),
#("tags_ContentBased_Lr", ContentBased, {"features_method" : "tags", "regressor_method" : "linear_regression"}),
#("tags_ContentBased_Svr", ContentBased, {"features_method" : "tags", "regressor_method" : "svr_regression"}),
#("tags_ContentBased_Gb", ContentBased, {"features_method" : "tags", "regressor_method" : "gradient_boosting"}),
#("tags_ContentBased_Rf", ContentBased, {"features_method" : "tags", "regressor_method" : "random_forest"}),
("tags_length_ContentBased_sample", ContentBased, {"features_method" : "tags_length", "regressor_method" : "random_sample"}),
("tags_length_ContentBased_score", ContentBased, {"features_method" : "tags_length", "regressor_method" : "random_score"}),
#("tags_length_ContentBased_Lr", ContentBased, {"features_method" : "tags_length", "regressor_method" : "linear_regression"}),
#("tags_length_ContentBased_Svr", ContentBased, {"features_method" : "tags_length", "regressor_method" : "svr_regression"}),
#("tags_length_ContentBased_Gb", ContentBased, {"features_method" : "tags_length", "regressor_method" : "gradient_boosting"}),
#("tags_length_ContentBased_Rf", ContentBased, {"features_method" : "tags_length", "regressor_method" : "random_forest"}),
("timestamp_ContentBased_sample", ContentBased, {"features_method" : "timestamp", "regressor_method" : "random_sample"}),
("timestamp_ContentBased_score", ContentBased, {"features_method" : "timestamp", "regressor_method" : "random_score"}),
#("timestamp_ContentBased_Lr", ContentBased, {"features_method" : "timestamp", "regressor_method" : "linear_regression"}),
#("timestamp_ContentBased_Svr", ContentBased, {"features_method" : "timestamp", "regressor_method" : "svr_regression"}),
#("timestamp_ContentBased_Gb", ContentBased, {"features_method" : "timestamp", "regressor_method" : "gradient_boosting"}),
#("timestamp_ContentBased_Rf", ContentBased, {"features_method" : "timestamp", "regressor_method" : "random_forest"}),
# model_name, model class, model parameters (dict)
=======
>>>>>>> 5385c4bc3a5802e1caec979d0d3a6bc7af3e970f
]
# Add the combinations of ContentBased models to the list of models
......
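Each entry added to the EvalConfig model list above is a (model_name, model_class, model_params) tuple, which is why model_params must be a dict of keyword arguments rather than a set. A minimal sketch of how such a list can be consumed with the Surprise API, assuming a hypothetical run_eval helper (not the project's actual evaluator):

from surprise import accuracy
from surprise.model_selection import train_test_split

def run_eval(models, data):
    """Fit every configured model and report its RMSE.

    `models` is a list of (model_name, model_class, model_params) tuples,
    as in the EvalConfig list above. Illustrative sketch only.
    """
    trainset, testset = train_test_split(data, test_size=0.2)
    results = {}
    for model_name, model_class, model_params in models:
        algo = model_class(**model_params)   # the params dict becomes keyword arguments
        algo.fit(trainset)
        predictions = algo.test(testset)
        results[model_name] = accuracy.rmse(predictions, verbose=False)
    return results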
......@@ -10,10 +10,19 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 8,
"id": "277473a3",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The autoreload extension is already loaded. To reload it, use:\n",
" %reload_ext autoreload\n"
]
}
],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
......@@ -36,8 +45,7 @@
"from sklearn.neighbors import KNeighborsRegressor\n",
"from sklearn.tree import DecisionTreeRegressor\n",
"from sklearn.ensemble import AdaBoostRegressor\n",
"from xgboost import XGBRegressor\n",
"from lightgbm import LGBMRegressor"
"from xgboost import XGBRegressor"
]
},
{
......@@ -50,7 +58,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 9,
"id": "e8378976",
"metadata": {},
"outputs": [
......@@ -166,7 +174,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 10,
"id": "16b0a602",
"metadata": {},
"outputs": [
......@@ -278,8 +286,7 @@
" 'knn_regression': KNeighborsRegressor(n_neighbors=1),\n",
" 'decision_tree': DecisionTreeRegressor(max_depth=5),\n",
" 'adaboost': AdaBoostRegressor(n_estimators=50),\n",
" 'xgboost': XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3),\n",
" 'lightgbm': LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)\n",
" 'xgboost': XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)\n",
" }\n",
"\n",
" if self.regressor_method not in regressor_models:\n",
......@@ -288,7 +295,10 @@
" for u in self.user_profile:\n",
" user_ratings = [rating for (_, rating) in trainset.ur[u]]\n",
" item_ids = [iid for (iid, _) in trainset.ur[u]]\n",
" # raw_item_ids = [trainset.to_raw_iid(iid) for iid in item_ids]\n",
" raw_item_ids = [trainset.to_raw_iid(iid) for iid in item_ids]\n",
" filtered_item_ids = [item_id for item_id in raw_item_ids if item_id in df_features.index]\n",
" feature_values = self.content_features.loc[filtered_item_ids].values\n",
"\n",
" df_user = pd.DataFrame({'item_id': raw_item_ids, 'user_ratings': user_ratings})\n",
" df_user = df_user.merge(self.content_features, left_on=\"item_id\", right_index=True, how='left')\n",
......@@ -361,7 +371,11 @@
},
{
"cell_type": "code",
<<<<<<< HEAD
"execution_count": 11,
=======
"execution_count": 4,
>>>>>>> 5385c4bc3a5802e1caec979d0d3a6bc7af3e970f
"id": "69d12f7d",
"metadata": {},
"outputs": [
......@@ -406,7 +420,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
"version": "3.12.0"
}
},
"nbformat": 4,
......
This diff is collapsed.
......@@ -7,13 +7,19 @@ import numpy as np
import random as rd
from surprise import AlgoBase, SVD
from surprise import PredictionImpossible
from sklearn.metrics import mean_squared_error
from pprint import pprint as pp
# import local
from sklearn.feature_extraction.text import TfidfVectorizer
from loaders import load_items, load_ratings
from constants import Constant as C
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
<<<<<<< HEAD
from sklearn.ensemble import BaggingRegressor
=======
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
......@@ -22,10 +28,30 @@ from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
>>>>>>> 5385c4bc3a5802e1caec979d0d3a6bc7af3e970f
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
# All the dataframes
df_items = load_items()
df_ratings = load_ratings()
df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME)
# Example 1: create title_length features
df_features = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('n_character_title')
df_features = df_tag[C.TAG]
df_genome_score = pd.read_csv("data/hackathon/content/genome-scores.csv")
df_genome_tag = pd.read_csv("data/hackathon/content/genome-tags.csv")
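TfidfVectorizer is imported above and a "tags" features method appears in the evaluation config. A hedged sketch of turning the tag file into per-movie TF-IDF features follows; the movieId and tag column names are assumptions here, since the project accesses them through the Constant class (e.g. C.TAG).

from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

def tfidf_tag_features(df_tag):
    """Build a TF-IDF feature frame per movie from user-assigned tags.

    Illustrative sketch: assumes df_tag has 'movieId' and 'tag' columns.
    """
    # Concatenate all tags of a movie into one document per movie.
    tags_per_movie = df_tag.groupby("movieId")["tag"].apply(lambda tags: " ".join(tags.astype(str)))
    vectorizer = TfidfVectorizer(max_features=100)
    tfidf = vectorizer.fit_transform(tags_per_movie)
    return pd.DataFrame(tfidf.toarray(),
                        index=tags_per_movie.index,
                        columns=vectorizer.get_feature_names_out())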
......@@ -83,7 +109,6 @@ class ModelBaseline2(AlgoBase):
def estimate(self, u, i):
return rd.uniform(self.trainset.rating_scale[0], self.trainset.rating_scale[1])
# Third algorithm
class ModelBaseline3(AlgoBase):
def __init__(self):
......@@ -111,7 +136,10 @@ class ContentBased(AlgoBase):
AlgoBase.__init__(self)
self.regressor_method = regressor_method
self.features_methods = features_method
<<<<<<< HEAD
=======
self.is_hackathon = is_hackathon
>>>>>>> 5385c4bc3a5802e1caec979d0d3a6bc7af3e970f
self.content_features = self.create_content_features(features_method)
self.user_profile = {}
self.user_profile_explain = {}
......@@ -152,15 +180,53 @@ class ContentBased(AlgoBase):
df_features.fillna(0, inplace=True)
return df_features
<<<<<<< HEAD
=======
>>>>>>> 5385c4bc3a5802e1caec979d0d3a6bc7af3e970f
def fit(self, trainset):
"""Profile Learner"""
AlgoBase.fit(self, trainset)
# Preallocate user profiles
self.user_profile = {u: None for u in trainset.all_users()}
self.user_profile_explain = {}
epsilon = 1e-10 # Small value to prevent division by zero
for u in trainset.all_users():
raw_user_id = trainset.to_raw_uid(u)
self.user_profile_explain[raw_user_id] = {}
user_ratings = np.array([rating for (_, rating) in trainset.ur[u]])
item_ids = [iid for (iid, _) in trainset.ur[u]]
raw_item_ids = [trainset.to_raw_iid(iid) for iid in item_ids]
# filtered_item_ids = [item_id for item_id in raw_item_ids if item_id in df_features.index and item_id in df_items.index]
# feature_values = self.content_features.loc[filtered_item_ids].values
feature_values = self.content_features.loc[raw_item_ids].values
norms = np.linalg.norm(feature_values, axis=0) + epsilon
weighted_features = feature_values / norms
feature_importance = user_ratings @ weighted_features
print(feature_values)
print("---------------\n")
pp(weighted_features.T)
print(type(weighted_features))
print(f"\n--------------- Nb lignes = {len(weighted_features)} --> {len(weighted_features)} x {len(user_ratings)}\n--------------- Nb colonnes = 17\n")
print(user_ratings.reshape(1,-1))
print(user_ratings[1])
print(type(user_ratings))
print("#########################################\n")
print(feature_importance)
print("•••••••••••••••••")
feature_importance /= np.sum(user_ratings)
self.user_profile_explain[raw_user_id] = dict(zip(self.content_features.columns, feature_importance))
<<<<<<< HEAD
=======
self.user_profile_explain = {}
# Loop over all internal user IDs in the trainset
......@@ -183,6 +249,7 @@ class ContentBased(AlgoBase):
self.user_profile_explain[raw_user_id] = dict(zip(self.content_features.columns, feature_importance))
>>>>>>> 5385c4bc3a5802e1caec979d0d3a6bc7af3e970f
if self.regressor_method == 'random_score':
for u in self.user_profile:
self.user_profile[u] = rd.uniform(0.5, 5)
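The explanation weights computed in fit() above amount to: scale each feature column to unit norm, take the rating-weighted sum over the user's items, then divide by the user's total rating mass. A tiny self-contained numeric sketch of the same computation (all values made up):

import numpy as np

# Two items rated by one user, three content features (made-up values).
feature_values = np.array([[10.0, 1.0, 0.0],
                           [20.0, 0.0, 1.0]])
user_ratings = np.array([4.0, 2.0])

epsilon = 1e-10                                            # avoid division by zero
norms = np.linalg.norm(feature_values, axis=0) + epsilon   # per-column norm
weighted_features = feature_values / norms                 # unit-norm columns
feature_importance = user_ratings @ weighted_features      # rating-weighted sum per feature
feature_importance /= np.sum(user_ratings)                 # normalise by total rating mass

print(feature_importance)   # approx. [0.596, 0.667, 0.333]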
......@@ -203,8 +270,12 @@ class ContentBased(AlgoBase):
'knn_regression': KNeighborsRegressor(n_neighbors=1),
'decision_tree': DecisionTreeRegressor(max_depth=5),
'adaboost': AdaBoostRegressor(n_estimators=50),
<<<<<<< HEAD
'xgboost': XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)
=======
'xgboost': XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3),
'lightgbm': LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)
>>>>>>> 5385c4bc3a5802e1caec979d0d3a6bc7af3e970f
}
if self.regressor_method not in regressor_models:
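The per-user training logic around the registry above performs two steps: assemble a frame of the user's rated items joined with their content features, then look the requested regressor up by name and fit it. A minimal standalone sketch of those steps, with hypothetical helper names (build_user_frame, fit_user_regressor) and a reduced registry:

import pandas as pd
from sklearn.base import clone
from sklearn.linear_model import LinearRegression, Ridge

def build_user_frame(trainset, u, content_features):
    """One user's training frame: rated items joined with their content features."""
    user_ratings = [rating for (_, rating) in trainset.ur[u]]
    raw_item_ids = [trainset.to_raw_iid(iid) for (iid, _) in trainset.ur[u]]
    df_user = pd.DataFrame({"item_id": raw_item_ids, "user_ratings": user_ratings})
    # Drop items with no feature row so the regressor never sees all-NaN inputs.
    df_user = df_user[df_user["item_id"].isin(content_features.index)]
    return df_user.merge(content_features, left_on="item_id", right_index=True, how="left")

def fit_user_regressor(df_user, feature_columns, regressor_method="linear_regression"):
    """Select a regressor by name and fit it on one user's ratings."""
    registry = {
        "linear_regression": LinearRegression(),
        "ridge_regression": Ridge(alpha=1.0),
    }
    if regressor_method not in registry:
        raise ValueError(f"Unknown regressor_method: {regressor_method}")
    X = df_user[feature_columns].values
    y = df_user["user_ratings"].values
    model = clone(registry[regressor_method])   # fresh estimator for each user
    model.fit(X, y)
    return model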
......@@ -250,7 +321,60 @@ class ContentBased(AlgoBase):
else:
return None
def rmse(self, testset):
"""Compute RMSE on the testset"""
predictions = []
true_ratings = []
for (uid, iid, true_r) in testset:
try:
pred_r = self.estimate(self.trainset.to_inner_uid(uid), self.trainset.to_inner_iid(iid))
predictions.append(pred_r)
true_ratings.append(true_r)
except PredictionImpossible:
continue
mse = mean_squared_error(true_ratings, predictions)
rmse_value = np.sqrt(mse)
return rmse_value
<<<<<<< HEAD
# Example usage:
cb = ContentBased(["title_length", "movie_year", "genre", "avg_rating"], "ridge_regression")
surprise_data = load_ratings(surprise_format=True)
trainset = surprise_data.build_full_trainset()
testset = trainset.build_anti_testset()
# print(cb.fit(trainset))
# print("RMSE: ", cb.rmse(testset))
# # Example explanations for users:
# #print(cb.explain(11))
# #print(cb.explain(13))
# print(cb.explain(17))
#print(cb.explain(23))
#print(cb.explain(27))
#print(cb.explain(73))
# # Get the top recommendations for each user
# top_n_recommendations = get_top_n(predictions, n=10)
# # Display the recommendations for a few specific users
# for user_id, user_recommendations in top_n_recommendations.items():
# print(f"User {user_id}:")
# for item_id, rating in user_recommendations:
# print(f" - Item {item_id}, estimated rating: {rating}")
=======
def test_contentbased_class(feature_method, regressor_method):
"""Test the ContentBased class.
Tries to make a prediction on the first (user, item) tuple of the anti_test_set
......@@ -261,4 +385,5 @@ def test_contentbased_class(feature_method, regressor_method):
content_algo.fit(train_set)
anti_test_set_first = train_set.build_anti_testset()[0]
prediction = content_algo.predict(anti_test_set_first[0], anti_test_set_first[1])
print(prediction)
\ No newline at end of file
print(prediction)
>>>>>>> 5385c4bc3a5802e1caec979d0d3a6bc7af3e970f
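Mirroring the commented-out example usage in the HEAD branch, a hedged end-to-end sketch of exercising the class; the feature and regressor names are placeholders and must match what create_content_features and fit actually support:

# Illustrative usage only; adapt feature/regressor names to the implemented ones.
surprise_data = load_ratings(surprise_format=True)
trainset = surprise_data.build_full_trainset()
testset = trainset.build_anti_testset()

cb = ContentBased(["title_length"], "linear_regression")
cb.fit(trainset)
print("RMSE:", cb.rmse(testset))   # rmse() helper defined in the diff above
print(cb.explain(17))              # per-feature importance for raw user id 17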