recommender_system / recomsys

Compare revisions
5385c4bc3a5802e1caec979d0d3a6bc7af3e970f to 21d876e105f452b3eb099c4d23fcfa41afedef99

Changes are shown as if the source revision was merged into the target revision.

Source: recommender_system/recomsys @ 21d876e105f452b3eb099c4d23fcfa41afedef99
Target: recommender_system/recomsys @ 5385c4bc3a5802e1caec979d0d3a6bc7af3e970f
Commits on source (3)
inutile · 0a240ae3 · Nathanaël Kindidi authored 1 year ago
OK · 12c8b310 · Nathanaël Kindidi authored 1 year ago
Hmm · 21d876e1 · Nathanaël Kindidi authored 1 year ago
Showing 4 changed files with 210 additions and 104481 deletions:

configs.py: +59 -0 (59 additions, 0 deletions)
content_based.ipynb: +23 -9 (23 additions, 9 deletions)
data/hackathon/evidence/ratings_test.csv: +0 -104469 (0 additions, 104469 deletions)
models.py: +128 -3 (128 additions, 3 deletions)
configs.py
View file @ 21d876e1
@@ -43,6 +43,65 @@ class EvalConfig:
        ("baseline_2", ModelBaseline2, {}),
        ("baseline_3", ModelBaseline3, {}),
        ("baseline_4", ModelBaseline4, {}),
<<<<<<< HEAD
        ("title_length_ContentBased_sample", ContentBased, {"title_length", "random_sample"}),
        ("title_length_ContentBased_score", ContentBased, {"features_method" : ["title_length"], "regressor_method" : "random_score"}),
        ("title_length_ContentBased_Lr", ContentBased, {"features_method" : ["title_length"], "regressor_method" : "linear_regression"}),
        ("title_length_ContentBased_Lr", ContentBased, {"features_method" : ["title_length"], "regressor_method" : "svr_regression"}),
        ("title_length_ContentBased_Lr", ContentBased, {"features_method" : ["title_length"], "regressor_method" : "gradient_boosting"}),
        ("title_length_ContentBased_Lr", ContentBased, {"features_method" : ["title_length"], "regressor_method" : "random_forest"}),
        ("movie_year_ContentBased_sample", ContentBased, {"features_method" : "movie_year", "regressor_method" : "random_sample"}),
        ("movie_year_ContentBased_score", ContentBased, {"features_method" : "movie_year", "regressor_method" : "random_score"}),
        #("movie_year_ContentBased_Lr", ContentBased, {"features_method" : "movie_year", "regressor_method" : "linear_regression"}),
        #("movie_year_ContentBased_Lr", ContentBased, {"features_method" : "movie_year", "regressor_method" : "svr_regression"}),
        #("movie_year_ContentBased_Lr", ContentBased, {"features_method" : "movie_year", "regressor_method" : "gradient_boosting"}),
        #("movie_year_ContentBased_Lr", ContentBased, {"features_method" : "movie_year", "regressor_method" : "random_forest"}),
        ("genres_ContentBased_sample", ContentBased, {"features_method" : "genres", "regressor_method" : "random_sample"}),
        ("genres_ContentBased_score", ContentBased, {"features_method" : "genres", "regressor_method" : "random_score"}),
        #("genres_ContentBased_Lr", ContentBased, {"features_method" : "genres", "regressor_method" : "linear_regression"}),
        #("genres_ContentBased_Lr", ContentBased, {"features_method" : "genres", "regressor_method" : "svr_regression"}),
        #("genres_ContentBased_Lr", ContentBased, {"features_method" : "genres", "regressor_method" : "gradient_boosting"}),
        #("genres_ContentBased_Lr", ContentBased, {"features_method" : "genres", "regressor_method" : "random_forest"}),
        ("rating_ContentBased_sample", ContentBased, {"features_method" : "rating", "regressor_method" : "random_sample"}),
        ("rating_ContentBased_score", ContentBased, {"features_method" : "rating", "regressor_method" : "random_score"}),
        #("rating_ContentBased_Lr", ContentBased, {"features_method" : "rating", "regressor_method" : "linear_regression"}),
        #("rating_ContentBased_Lr", ContentBased, {"features_method" : "rating", "regressor_method" : "svr_regression"}),
        #("rating_ContentBased_Lr", ContentBased, {"features_method" : "rating", "regressor_method" : "gradient_boosting"}),
        #("rating_ContentBased_Lr", ContentBased, {"features_method" : "rating", "regressor_method" : "random_forest"}),
        ("tags_ContentBased_sample", ContentBased, {"features_method" : "tags", "regressor_method" : "random_sample"}),
        ("tags_ContentBased_score", ContentBased, {"features_method" : "tags", "regressor_method" : "random_score"}),
        #("tags_ContentBased_Lr", ContentBased, {"features_method" : "tags", "regressor_method" : "linear_regression"}),
        #("tags_ContentBased_Lr", ContentBased, {"features_method" : "tags", "regressor_method" : "svr_regression"}),
        #("tags_ContentBased_Lr", ContentBased, {"features_method" : "tags", "regressor_method" : "gradient_boosting"}),
        #("tags_ContentBased_Lr", ContentBased, {"features_method" : "tags", "regressor_method" : "random_forest"}),
        ("tags_length_ContentBased_sample", ContentBased, {"features_method" : "tags_length", "regressor_method" : "random_sample"}),
        ("tags_length_ContentBased_score", ContentBased, {"features_method" : "tags_length", "regressor_method" : "random_score"}),
        #("tags_length_ContentBased_Lr", ContentBased, {"features_method" : "tags_length", "regressor_method" : "linear_regression"}),
        #("tags_length_ContentBased_Lr", ContentBased, {"features_method" : "tags_length", "regressor_method" : "svr_regression"}),
        #("tags_length_ContentBased_Lr", ContentBased, {"features_method" : "tags_length", "regressor_method" : "gradient_boosting"}),
        #("tags_length_ContentBased_Lr", ContentBased, {"features_method" : "tags_length", "regressor_method" : "random_forest"}),
        ("timestamp_ContentBased_sample", ContentBased, {"features_method" : "timestamp", "regressor_method" : "random_sample"}),
        ("timestamp_ContentBased_score", ContentBased, {"features_method" : "timestamp", "regressor_method" : "random_score"}),
        #("timestamp_ContentBased_Lr", ContentBased, {"features_method" : "timestamp", "regressor_method" : "linear_regression"})
        #("timestamp_ContentBased_Lr", ContentBased, {"features_method" : "timestamp", "regressor_method" : "svr_regression"})
        #("timestamp_ContentBased_Lr", ContentBased, {"features_method" : "timestamp", "regressor_method" : "gradient_boosting"})
        #("timestamp_ContentBased_Lr", ContentBased, {"features_method" : "timestamp", "regressor_method" : "random_forest"})
        # model_name, model class, model parameters (dict)
=======
>>>>>>> 5385c4bc3a5802e1caec979d0d3a6bc7af3e970f
    ]
    # Add the combinations of ContentBased models to the list of models
This diff is collapsed.
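Each entry added to EvalConfig.models above is a (model_name, model class, model parameters) tuple. The sketch below shows one plausible way such a tuple could be consumed, assuming the evaluation loop simply unpacks it and passes the params dict as keyword arguments; DummyModel and the loop itself are illustrative stand-ins, not code from this repository.

# Hypothetical consumption of an EvalConfig-style entry (illustrative only).
class DummyModel:
    def __init__(self, features_method=None, regressor_method=None):
        self.features_method = features_method
        self.regressor_method = regressor_method

models = [
    ("genres_ContentBased_score", DummyModel,
     {"features_method": "genres", "regressor_method": "random_score"}),
]

for model_name, model_class, model_params in models:
    algo = model_class(**model_params)  # the params dict becomes keyword arguments
    print(model_name, algo.features_method, algo.regressor_method)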
content_based.ipynb
View file @ 21d876e1
@@ -10,10 +10,19 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 8,
"id": "277473a3",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The autoreload extension is already loaded. To reload it, use:\n",
"  %reload_ext autoreload\n"
]
}
],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
@@ -36,8 +45,7 @@
"from sklearn.neighbors import KNeighborsRegressor\n",
"from sklearn.tree import DecisionTreeRegressor\n",
"from sklearn.ensemble import AdaBoostRegressor\n",
"from xgboost import XGBRegressor\n",
"from lightgbm import LGBMRegressor"
"from xgboost import XGBRegressor"
]
},
{
@@ -50,7 +58,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 9,
"id": "e8378976",
"metadata": {},
"outputs": [
@@ -166,7 +174,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 10,
"id": "16b0a602",
"metadata": {},
"outputs": [
@@ -278,8 +286,7 @@
" 'knn_regression': KNeighborsRegressor(n_neighbors=1),\n",
" 'decision_tree': DecisionTreeRegressor(max_depth=5),\n",
" 'adaboost': AdaBoostRegressor(n_estimators=50),\n",
" 'xgboost': XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3),\n",
" 'lightgbm': LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)\n",
" 'xgboost': XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)\n",
" }\n",
"\n",
" if self.regressor_method not in regressor_models:\n",
@@ -288,7 +295,10 @@
" for u in self.user_profile:\n",
" user_ratings = [rating for (_, rating) in trainset.ur[u]]\n",
" item_ids = [iid for (iid, _) in trainset.ur[u]]\n",
" # raw_item_ids = [trainset.to_raw_iid(iid) for iid in item_ids]\n",
" raw_item_ids = [trainset.to_raw_iid(iid) for iid in item_ids]\n",
" filtered_item_ids = [item_id for item_id in raw_item_ids if item_id in df_features.index]\n",
" feature_values = self.content_features.loc[filtered_item_ids].values\n",
"\n",
" df_user = pd.DataFrame({'item_id': raw_item_ids, 'user_ratings': user_ratings})\n",
" df_user = df_user.merge(self.content_features, left_on=\"item_id\", right_index=True, how='left')\n",
@@ -361,7 +371,11 @@
},
{
"cell_type": "code",
<<<<<<< HEAD
"execution_count": 11,
=======
"execution_count": 4,
>>>>>>> 5385c4bc3a5802e1caec979d0d3a6bc7af3e970f
"id": "69d12f7d",
"metadata": {},
"outputs": [
@@ -406,7 +420,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
"version": "3.12.0"
}
},
"nbformat": 4,
This diff is collapsed.
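The notebook hunks above maintain a regressor_models dictionary that maps a regressor_method string to a scikit-learn estimator (and drop the LGBMRegressor entry on one side). A minimal, self-contained sketch of that dispatch pattern follows, with toy data and a reduced set of estimators; none of it is taken from the notebook.

# Sketch of the string-to-estimator dispatch used by regressor_models (toy example).
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

regressor_models = {
    'linear_regression': LinearRegression(),
    'gradient_boosting': GradientBoostingRegressor(),
    'random_forest': RandomForestRegressor(n_estimators=10),
}

regressor_method = 'linear_regression'
if regressor_method not in regressor_models:
    raise ValueError(f"Unknown regressor_method: {regressor_method}")

X = np.array([[110.0], [95.0], [130.0]])   # toy item features (e.g. title length)
y = np.array([4.0, 3.5, 5.0])              # one user's ratings for those items
model = regressor_models[regressor_method]
model.fit(X, y)
print(model.predict(np.array([[120.0]])))  # predicted rating for an unseen item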
data/hackathon/evidence/ratings_test.csv
deleted 100644 → 0
View file @ 5385c4bc
This diff is collapsed.
models.py
View file @ 21d876e1
@@ -7,13 +7,19 @@ import numpy as np
import random as rd
from surprise import AlgoBase, SVD
from surprise import PredictionImpossible
from sklearn.metrics import mean_squared_error
from pprint import pprint as pp

# import local
from sklearn.feature_extraction.text import TfidfVectorizer
from loaders import load_items, load_ratings
from constants import Constant as C
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
<<<<<<< HEAD
from sklearn.ensemble import BaggingRegressor
=======
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
@@ -22,10 +28,30 @@ from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
>>>>>>> 5385c4bc3a5802e1caec979d0d3a6bc7af3e970f
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

# All the dataframes
df_items = load_items()
df_ratings = load_ratings()
df_tag = pd.read_csv(C.CONTENT_PATH / C.TAGS_FILENAME)

# Example 1 : create title_length features
df_features = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('n_character_title')
df_features = df_tag[C.TAG]

df_genome_score = pd.read_csv("data/hackathon/content/genome-scores.csv")
df_genome_tag = pd.read_csv("data/hackathon/content/genome-tags.csv")
@@ -83,7 +109,6 @@ class ModelBaseline2(AlgoBase):
    def estimate(self, u, i):
        return rd.uniform(self.trainset.rating_scale[0], self.trainset.rating_scale[1])

# Third algorithm
class ModelBaseline3(AlgoBase):
    def __init__(self):
@@ -111,7 +136,10 @@ class ContentBased(AlgoBase):
        AlgoBase.__init__(self)
        self.regressor_method = regressor_method
        self.features_methods = features_method
<<<<<<< HEAD
=======
        self.is_hackathon = is_hackathon
>>>>>>> 5385c4bc3a5802e1caec979d0d3a6bc7af3e970f
        self.content_features = self.create_content_features(features_method)
        self.user_profile = {}
        self.user_profile_explain = {}
@@ -152,15 +180,53 @@ class ContentBased(AlgoBase):
        df_features.fillna(0, inplace=True)
        return df_features
<<<<<<< HEAD
=======
>>>>>>> 5385c4bc3a5802e1caec979d0d3a6bc7af3e970f

    def fit(self, trainset):
        """Profile Learner"""
        AlgoBase.fit(self, trainset)

        # Preallocate user profiles
        self.user_profile = {u: None for u in trainset.all_users()}
        self.user_profile_explain = {}

        epsilon = 1e-10  # Small value to prevent division by zero

        for u in trainset.all_users():
            raw_user_id = trainset.to_raw_uid(u)
            self.user_profile_explain[raw_user_id] = {}

            user_ratings = np.array([rating for (_, rating) in trainset.ur[u]])
            item_ids = [iid for (iid, _) in trainset.ur[u]]
            raw_item_ids = [trainset.to_raw_iid(iid) for iid in item_ids]
            # filtered_item_ids = [item_id for item_id in raw_item_ids if item_id in df_features.index and item_id in df_items.index]
            # feature_values = self.content_features.loc[filtered_item_ids].values
            feature_values = self.content_features.loc[raw_item_ids].values

            norms = np.linalg.norm(feature_values, axis=0) + epsilon
            weighted_features = feature_values / norms
            feature_importance = user_ratings @ weighted_features

            print(feature_values)
            print("---------------\n")
            pp(weighted_features.T)
            print(type(weighted_features))
            print(f"\n--------------- Nb lignes = {len(weighted_features)} --> {len(weighted_features)} x {len(user_ratings)}\n--------------- Nb colonnes = 17\n")
            print(user_ratings.reshape(1, -1))
            print(user_ratings[1])
            print(type(user_ratings))
            print("#########################################\n")
            print(feature_importance)
            print("•••••••••••••••••")

            feature_importance /= np.sum(user_ratings)

            self.user_profile_explain[raw_user_id] = dict(zip(self.content_features.columns, feature_importance))
<<<<<<< HEAD
=======
        self.user_profile_explain = {}

        # Loop over all internal user IDs in the trainset
@@ -183,6 +249,7 @@ class ContentBased(AlgoBase):
            self.user_profile_explain[raw_user_id] = dict(zip(self.content_features.columns, feature_importance))
>>>>>>> 5385c4bc3a5802e1caec979d0d3a6bc7af3e970f

        if self.regressor_method == 'random_score':
            for u in self.user_profile:
                self.user_profile[u] = rd.uniform(0.5, 5)
@@ -203,8 +270,12 @@ class ContentBased(AlgoBase):
                'knn_regression': KNeighborsRegressor(n_neighbors=1),
                'decision_tree': DecisionTreeRegressor(max_depth=5),
                'adaboost': AdaBoostRegressor(n_estimators=50),
<<<<<<< HEAD
                'xgboost': XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)
=======
                'xgboost': XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3),
                'lightgbm': LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)
>>>>>>> 5385c4bc3a5802e1caec979d0d3a6bc7af3e970f
            }

            if self.regressor_method not in regressor_models:
@@ -250,7 +321,60 @@ class ContentBased(AlgoBase):
        else:
            return None

    def rmse(self, testset):
        """Compute RMSE on the testset"""
        predictions = []
        true_ratings = []
        for (uid, iid, true_r) in testset:
            try:
                pred_r = self.estimate(self.trainset.to_inner_uid(uid), self.trainset.to_inner_iid(iid))
                predictions.append(pred_r)
                true_ratings.append(true_r)
            except PredictionImpossible:
                continue
        mse = mean_squared_error(true_ratings, predictions)
        rmse_value = np.sqrt(mse)
        return rmse_value

<<<<<<< HEAD
# Example usage:
cb = ContentBased(["title_length", "movie_year", "genre", "avg_rating"], "ridge_regression")
surprise_data = load_ratings(surprise_format=True)
trainset = surprise_data.build_full_trainset()
testset = trainset.build_anti_testset()

# print(cb.fit(trainset))
# print("RMSE: ", cb.rmse(testset))

# # Example explanations for users:
# #print(cb.explain(11))
# #print(cb.explain(13))
# print(cb.explain(17))
#print(cb.explain(23))
#print(cb.explain(27))
#print(cb.explain(73))

# # Obtenir les meilleures recommandations pour chaque utilisateur
# top_n_recommendations = get_top_n(predictions, n=10)

# # Afficher les recommandations pour quelques utilisateurs spécifiques
# for user_id, user_recommendations in top_n_recommendations.items():
#     print(f"Utilisateur {user_id}:")
#     for item_id, rating in user_recommendations:
#         print(f" - Item {item_id}, estimation de note : {rating}")
=======
def test_contentbased_class(feature_method, regressor_method):
    """Test the ContentBased class.
    Tries to make a prediction on the first (user, item) tuple of the anti_test_set
@@ -261,4 +385,5 @@ def test_contentbased_class(feature_method, regressor_method):
    content_algo.fit(train_set)
    anti_test_set_first = train_set.build_anti_testset()[0]
    prediction = content_algo.predict(anti_test_set_first[0], anti_test_set_first[1])
    print(prediction)
\ No newline at end of file
    print(prediction)
>>>>>>> 5385c4bc3a5802e1caec979d0d3a6bc7af3e970f
This diff is collapsed.
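The fit method added to ContentBased in models.py builds a per-user explanation by column-normalising the item feature matrix, weighting it with the user's ratings, and dividing by the total rating mass. Below is a toy, self-contained sketch of just that arithmetic; the arrays are made up, only the formula mirrors the diff.

# Toy reproduction of the feature-importance arithmetic from ContentBased.fit.
import numpy as np

epsilon = 1e-10                                   # avoids division by zero, as in fit()
user_ratings = np.array([4.0, 2.0, 5.0])          # one user's ratings
feature_values = np.array([[1.0, 0.0],            # one row of features per rated item
                           [0.0, 1.0],
                           [1.0, 1.0]])

norms = np.linalg.norm(feature_values, axis=0) + epsilon
weighted_features = feature_values / norms        # column-normalised features
feature_importance = user_ratings @ weighted_features
feature_importance /= np.sum(user_ratings)        # scale by total rating mass
print(feature_importance)                         # one importance score per feature column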