Skip to content
GitLab
Explorer
Connexion
S'inscrire
Navigation principale
Rechercher ou aller à…
Projet
R
recomsys
Gestion
Activité
Membres
Labels
Programmation
Tickets
Tableaux des tickets
Jalons
Wiki
Code
Requêtes de fusion
Dépôt
Branches
Validations
Étiquettes
Graphe du dépôt
Comparer les révisions
Extraits de code
Compilation
Pipelines
Jobs
Planifications de pipeline
Artéfacts
Déploiement
Releases
Registre de paquets
Registre de conteneur
Registre de modèles
Opération
Environnements
Modules Terraform
Surveillance
Incidents
Analyse
Données d'analyse des chaînes de valeur
Analyse des contributeurs
Données d'analyse CI/CD
Données d'analyse du dépôt
Expériences du modèle
Aide
Aide
Support
Documentation de GitLab
Comparer les forfaits GitLab
Forum de la communauté
Contribuer à GitLab
Donner votre avis
Conditions générales et politique de confidentialité
Raccourcis clavier
?
Extraits de code
Groupes
Projets
Afficher davantage de fils d'Ariane
recommender_system
recomsys
Validations
ad331907
Valider
ad331907
rédigé
1 year ago
par
Adrien Payen
Parcourir les fichiers
Options
Téléchargements
Correctifs
Plain Diff
update files
parent
aa44f259
Aucune branche associée trouvée
Branches contenant la validation
Aucune étiquette associée trouvée
Aucune requête de fusion associée trouvée
Modifications
2
Masquer les modifications d'espaces
En ligne
Côte à côte
Affichage de
2 fichiers modifiés
content_based copy.ipynb
+427
-0
427 ajouts, 0 suppression
content_based copy.ipynb
recommender.py
+27
-29
27 ajouts, 29 suppressions
recommender.py
avec
454 ajouts
et
29 suppressions
content_based copy.ipynb
0 → 100644
+
427
−
0
Voir le fichier @
ad331907
{
"cells": [
{
"cell_type": "markdown",
"id": "82d5ca82",
"metadata": {},
"source": [
"# Packages"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "277473a3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The autoreload extension is already loaded. To reload it, use:\n",
" %reload_ext autoreload\n"
]
}
],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"\n",
"\n",
"# third parties imports\n",
"import pandas as pd\n",
"import numpy as np\n",
"import random as rd\n",
"from surprise import AlgoBase, SVD\n",
"from surprise import PredictionImpossible\n",
"\n",
"# import local\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from loaders import load_items, load_ratings\n",
"from constants import Constant as C\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor\n",
"from sklearn.svm import SVR\n",
"\n",
"from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet\n",
"from sklearn.svm import SVR\n",
"from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, AdaBoostRegressor\n",
"from sklearn.tree import DecisionTreeRegressor\n",
"from sklearn.neighbors import KNeighborsRegressor\n",
"from xgboost import XGBRegressor\n",
"from lightgbm import LGBMRegressor\n",
"\n"
]
},
{
"cell_type": "markdown",
"id": "a42c16bf",
"metadata": {},
"source": [
"# Explore and select content features"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "e8378976",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>n_character_title</th>\n",
" </tr>\n",
" <tr>\n",
" <th>movieId</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>16</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>14</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>23</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>24</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>34</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" n_character_title\n",
"movieId \n",
"1 16\n",
"2 14\n",
"3 23\n",
"4 24\n",
"5 34"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"0 sandra 'boring' bullock\n",
"1 dentist\n",
"2 Cambodia\n",
"3 Russian\n",
"4 forgettable\n",
"Name: tag, dtype: object"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# All the dataframes\n",
"df_items = load_items()\n",
"df_ratings = load_ratings()\n",
"df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME)\n",
"#df_genome_score = pd.read_csv(\"data/hackathon/content/genome-scores.csv\")\n",
"# df_genome_tag = pd.read_csv(\"data/hackathon/content/genome-tags.csv\")\n",
"\n",
"\n",
"# Example 1 : create title_length features\n",
"df_features = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('n_character_title')\n",
"display(df_features.head())\n",
"\n",
"df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME)\n",
"df_features = df_tag[C.TAG]\n",
"display(df_features.head())\n",
"\n",
"# (explore here other features)\n"
]
},
{
"cell_type": "markdown",
"id": "a2c9a2b6",
"metadata": {},
"source": [
"# Build a content-based model\n",
"When ready, move the following class in the *models.py* script"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "16b0a602",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'title_length': 0.1497645139703848, 'movie_year': 0.16218667420100635, '(no genres listed)': 0.0, 'action': 0.09449072815753193, 'adventure': 0.08778978776313201, 'animation': 0.0, 'children': 0.038431411145366176, 'comedy': 0.07268129109348041, 'crime': 0.09469516433772891, 'documentary': 0.0611428358670058, 'drama': 0.10494783392380302, 'fantasy': 0.025806451608591505, 'film-noir': 0.025806451609512046, 'horror': 0.018342712153336858, 'imax': 0.06947533670577526, 'musical': 0.0, 'mystery': 0.06234903350217154, 'romance': 0.036771716124540825, 'sci-fi': 0.059571001735546115, 'thriller': 0.0993122803165238, 'war': 0.04002978709072218, 'western': 0.04547648227079719, 'avg_rating': 0.16263357553020436}\n",
"{'title_length': 0.12975573389578626, 'movie_year': 0.13738555574364605, '(no genres listed)': 0.0, 'action': 0.0640388318396414, 'adventure': 0.0827515664964472, 'animation': 0.05686854568650957, 'children': 0.06799492283569505, 'comedy': 0.07354182680364503, 'crime': 0.05543740962624167, 'documentary': 0.0, 'drama': 0.09170589087803577, 'fantasy': 0.061481521263689595, 'film-noir': 0.0, 'horror': 0.015113350123518238, 'imax': 0.04592205020685974, 'musical': 0.03201459126079391, 'mystery': 0.03412706135338736, 'romance': 0.05989121250223656, 'sci-fi': 0.04370793816378273, 'thriller': 0.045800659191095036, 'war': 0.04907194751877139, 'western': 0.027287416762806844, 'avg_rating': 0.13740560847192132}\n",
"{'title_length': 0.04702378569892371, 'movie_year': 0.052440003628289225, '(no genres listed)': 0.0, 'action': 0.020439581335728367, 'adventure': 0.015593308332521032, 'animation': 0.004256286923052558, 'children': 0.003520723090188317, 'comedy': 0.018972762464944913, 'crime': 0.028340544273099223, 'documentary': 0.005823989517206729, 'drama': 0.037415345194166824, 'fantasy': 0.013643903080149476, 'film-noir': 0.015390183296279798, 'horror': 0.01926898253629829, 'imax': 0.0014716703456143566, 'musical': 0.0061519348279224124, 'mystery': 0.02847033164163413, 'romance': 0.019827342468818163, 'sci-fi': 0.022573488552024915, 'thriller': 0.03522231545147593, 'war': 0.010339617301415098, 'western': 0.005663885036293055, 'avg_rating': 0.05327750989412312}\n",
"{'title_length': 0.033402138126294736, 'movie_year': 0.03710065977291947, '(no genres listed)': 0.0, 'action': 0.014528522669579273, 'adventure': 0.013963913494241694, 'animation': 0.005764814103226412, 'children': 0.006513197483932152, 'comedy': 0.017763201411495646, 'crime': 0.016002513666599556, 'documentary': 0.004292962983778595, 'drama': 0.027458210593047847, 'fantasy': 0.009302633945770895, 'film-noir': 0.006823368830454359, 'horror': 0.007391689869010394, 'imax': 0.004855154663168369, 'musical': 0.0058909467772061425, 'mystery': 0.012191560732760487, 'romance': 0.01723631022081761, 'sci-fi': 0.010817269433255231, 'thriller': 0.01658593988724716, 'war': 0.010193212979882352, 'western': 0.0052038255339472966, 'avg_rating': 0.03742403427834079}\n",
"{'title_length': 0.20154225634108316, 'movie_year': 0.20848962267389695, '(no genres listed)': 0.0, 'action': 0.04545454544645529, 'adventure': 0.04545454544730129, 'animation': 0.0, 'children': 0.0, 'comedy': 0.07177284969293253, 'crime': 0.1145252645738102, 'documentary': 0.0, 'drama': 0.16778172557550536, 'fantasy': 0.0, 'film-noir': 0.0, 'horror': 0.06315936177961773, 'imax': 0.0, 'musical': 0.0, 'mystery': 0.08510520557533159, 'romance': 0.09754755529442835, 'sci-fi': 0.045454545449454146, 'thriller': 0.12542163704872258, 'war': 0.08035304331050673, 'western': 0.0, 'avg_rating': 0.21152969571139305}\n",
"{'title_length': 0.021927486954368552, 'movie_year': 0.02488786702116846, '(no genres listed)': 0.0007363092498113207, 'action': 0.013836432470735639, 'adventure': 0.011610617815573265, 'animation': 0.007520799115717832, 'children': 0.006287966766754299, 'comedy': 0.012951125615087338, 'crime': 0.011084119744598393, 'documentary': 0.0018287715645832062, 'drama': 0.015221252640276463, 'fantasy': 0.008631010164284143, 'film-noir': 0.0024629052522566544, 'horror': 0.008816299251739122, 'imax': 0.005347204099216887, 'musical': 0.0038827346462235236, 'mystery': 0.0068652812039576095, 'romance': 0.008086664541950757, 'sci-fi': 0.010304269379559203, 'thriller': 0.013200133984104478, 'war': 0.005127335699821772, 'western': 0.0036215200349232765, 'avg_rating': 0.025470698706944836}\n"
]
}
],
"source": [
"\n",
"# ContetnBased\n",
"class ContentBased(AlgoBase):\n",
" def __init__(self, features_method, regressor_method):\n",
" AlgoBase.__init__(self)\n",
" self.regressor_method = regressor_method\n",
" self.features_methods = features_method\n",
" self.content_features = self.create_content_features(features_method)\n",
" self.user_profile = {}\n",
" self.user_profile_explain = {}\n",
"\n",
" def create_content_features(self, features_methods):\n",
" \"\"\"Content Analyzer\"\"\"\n",
" df_items = load_items()\n",
" df_ratings = load_ratings()\n",
" df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME)\n",
" df_genome_score = pd.read_csv(\"data/hackathon/content/genome-scores.csv\")\n",
" df_genome_tag = pd.read_csv(\"data/hackathon/content/genome-tags.csv\")\n",
"\n",
" df_features = pd.DataFrame(index=df_items.index)\n",
"\n",
" for method in features_methods:\n",
" if method == \"title_length\":\n",
" df_title_length = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('title_length')\n",
" df_features = pd.concat([df_features, df_title_length], axis=1)\n",
" \n",
" elif method == \"movie_year\":\n",
" df_movie_year = df_items['title'].str.extract(r'\\((\\d{4})\\)', expand=False).to_frame('movie_year')\n",
" df_features = pd.concat([df_features, df_movie_year.astype(float).fillna(0)], axis=1)\n",
" \n",
" elif method == \"genre\":\n",
" tfidf_vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split('|'), token_pattern=None)\n",
" tfidf_matrix = tfidf_vectorizer.fit_transform(df_items['genres'])\n",
" df_tfidf_genres = pd.DataFrame(tfidf_matrix.toarray(), index=df_items.index, columns=tfidf_vectorizer.get_feature_names_out())\n",
" df_features = pd.concat([df_features, df_tfidf_genres], axis=1)\n",
"\n",
" elif method == \"avg_rating\":\n",
" df_avg_rating = df_ratings.groupby('movieId')['rating'].mean().to_frame('avg_rating')\n",
" df_features = df_features.join(df_avg_rating, on='movieId')\n",
"\n",
" else:\n",
" raise NotImplementedError(f'Feature method {method} not yet implemented')\n",
"\n",
" # Handle missing values in df_features\n",
" df_features.fillna(0, inplace=True)\n",
"\n",
" return df_features\n",
"\n",
" def fit(self, trainset):\n",
" \"\"\"Profile Learner\"\"\"\n",
" AlgoBase.fit(self, trainset)\n",
"\n",
" # Preallocate user profiles\n",
" self.user_profile = {u: None for u in trainset.all_users()}\n",
" self.user_profile_explain = {}\n",
"\n",
" epsilon = 1e-10 # Small value to prevent division by zero\n",
"\n",
" for u in trainset.all_users():\n",
" raw_user_id = trainset.to_raw_uid(u)\n",
" self.user_profile_explain[raw_user_id] = {}\n",
"\n",
" user_ratings = np.array([rating for (_, rating) in trainset.ur[u]])\n",
" item_ids = [iid for (iid, _) in trainset.ur[u]]\n",
" raw_item_ids = [trainset.to_raw_iid(iid) for iid in item_ids]\n",
"\n",
" feature_values = self.content_features.loc[raw_item_ids].values\n",
" norms = np.linalg.norm(feature_values, axis=0) + epsilon\n",
" weighted_features = feature_values / norms\n",
" feature_importance = weighted_features.T @ user_ratings\n",
" feature_importance /= np.sum(user_ratings)\n",
"\n",
" self.user_profile_explain[raw_user_id] = dict(zip(self.content_features.columns, feature_importance))\n",
"\n",
" if self.regressor_method == 'random_score':\n",
" for u in self.user_profile:\n",
" self.user_profile[u] = rd.uniform(0.5, 5)\n",
"\n",
" elif self.regressor_method == 'random_sample':\n",
" for u in self.user_profile:\n",
" self.user_profile[u] = [rating for (_, rating) in trainset.ur[u]]\n",
"\n",
" else:\n",
" regressor_models = {\n",
" 'linear_regression': LinearRegression(fit_intercept=False),\n",
" 'svr_regression': SVR(kernel='rbf', C=10, epsilon=0.2),\n",
" 'gradient_boosting': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3),\n",
" 'random_forest': RandomForestRegressor(n_estimators=100),\n",
" 'lasso_regression': Lasso(alpha=0.1),\n",
" 'ridge_regression': Ridge(alpha=1.0),\n",
" 'elastic_net': ElasticNet(alpha=1.0, l1_ratio=0.5),\n",
" 'knn_regression': KNeighborsRegressor(n_neighbors=1),\n",
" 'decision_tree': DecisionTreeRegressor(max_depth=5),\n",
" 'adaboost': AdaBoostRegressor(n_estimators=50),\n",
" 'xgboost': XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3),\n",
" 'lightgbm': LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)\n",
" }\n",
"\n",
" if self.regressor_method not in regressor_models:\n",
" raise NotImplementedError(f'Regressor method {self.regressor_method} not yet implemented')\n",
"\n",
" for u in self.user_profile:\n",
" user_ratings = [rating for (_, rating) in trainset.ur[u]]\n",
" item_ids = [iid for (iid, _) in trainset.ur[u]]\n",
" raw_item_ids = [trainset.to_raw_iid(iid) for iid in item_ids]\n",
"\n",
" df_user = pd.DataFrame({'item_id': raw_item_ids, 'user_ratings': user_ratings})\n",
" df_user = df_user.merge(self.content_features, left_on=\"item_id\", right_index=True, how='left')\n",
"\n",
" X = df_user.drop(columns=['item_id', 'user_ratings'])\n",
" y = df_user['user_ratings']\n",
"\n",
" regressor = regressor_models[self.regressor_method]\n",
" regressor.fit(X, y)\n",
"\n",
" self.user_profile[u] = regressor\n",
"\n",
" def estimate(self, u, i):\n",
" \"\"\"Scoring component used for item filtering\"\"\"\n",
" if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):\n",
" raise PredictionImpossible('User and/or item is unknown.')\n",
"\n",
" if self.regressor_method == 'random_score':\n",
" return rd.uniform(0.5, 5)\n",
"\n",
" elif self.regressor_method == 'random_sample':\n",
" return rd.choice(self.user_profile[u])\n",
"\n",
" else:\n",
" raw_item_id = self.trainset.to_raw_iid(i)\n",
" item_features = self.content_features.loc[raw_item_id, :].values.reshape(1, -1)\n",
" regressor = self.user_profile[u]\n",
" item_features_df = pd.DataFrame(item_features, columns=self.content_features.columns)\n",
" return regressor.predict(item_features_df)[0]\n",
"\n",
" def explain(self, u):\n",
" if u in self.user_profile_explain:\n",
" return self.user_profile_explain[u]\n",
" else:\n",
" return None\n",
"\n",
"\n",
"#Example usage:\n",
"cb = ContentBased([\"title_length\", \"movie_year\",\"genre\",\"avg_rating\"], \"ridge_regression\")\n",
"surprise_data = load_ratings(surprise_format=True)\n",
"trainset = surprise_data.build_full_trainset()\n",
"testset = trainset.build_anti_testset()\n",
"cb.fit(trainset)\n",
"\n",
"\n",
"#print(\"RMSE: \", cb.rmse(testset))\n",
"\n",
"\n",
"#Example explanations for users:\n",
"print(cb.explain(11))\n",
"\n",
"print(cb.explain(13))\n",
"\n",
"print(cb.explain(17))\n",
"\n",
"print(cb.explain(23))\n",
"\n",
"print(cb.explain(27))\n",
"\n",
"print(cb.explain(73))\n"
]
},
{
"cell_type": "markdown",
"id": "ffd75b7e",
"metadata": {},
"source": [
"The following script test the ContentBased class"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "69d12f7d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"user: 1 item: 10 r_ui = None est = 0.72 {'was_impossible': False}\n"
]
}
],
"source": [
"def test_contentbased_class(feature_method, regressor_method):\n",
" \"\"\"Test the ContentBased class.\n",
" Tries to make a prediction on the first (user,item ) tuple of the anti_test_set\n",
" \"\"\"\n",
" sp_ratings = load_ratings(surprise_format=True)\n",
" train_set = sp_ratings.build_full_trainset()\n",
" content_algo = ContentBased(feature_method, regressor_method)\n",
" content_algo.fit(train_set)\n",
" anti_test_set_first = train_set.build_anti_testset()[0]\n",
" prediction = content_algo.predict(anti_test_set_first[0], anti_test_set_first[1])\n",
" print(prediction)\n",
"\n",
"test_contentbased_class([\"title_length\", \"movie_year\",\"genre\",\"avg_rating\"], \"ridge_regression\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
%% Cell type:markdown id:82d5ca82 tags:
# Packages
%% Cell type:code id:277473a3 tags:
```
python
%
load_ext
autoreload
%
autoreload
2
# third parties imports
import
pandas
as
pd
import
numpy
as
np
import
random
as
rd
from
surprise
import
AlgoBase
,
SVD
from
surprise
import
PredictionImpossible
# import local
from
sklearn.feature_extraction.text
import
TfidfVectorizer
from
loaders
import
load_items
,
load_ratings
from
constants
import
Constant
as
C
from
sklearn.linear_model
import
LinearRegression
from
sklearn.ensemble
import
GradientBoostingRegressor
,
RandomForestRegressor
from
sklearn.svm
import
SVR
from
sklearn.linear_model
import
LinearRegression
,
Lasso
,
Ridge
,
ElasticNet
from
sklearn.svm
import
SVR
from
sklearn.ensemble
import
GradientBoostingRegressor
,
RandomForestRegressor
,
AdaBoostRegressor
from
sklearn.tree
import
DecisionTreeRegressor
from
sklearn.neighbors
import
KNeighborsRegressor
from
xgboost
import
XGBRegressor
from
lightgbm
import
LGBMRegressor
```
%% Output
The autoreload extension is already loaded. To reload it, use:
%reload_ext autoreload
%% Cell type:markdown id:a42c16bf tags:
# Explore and select content features
%% Cell type:code id:e8378976 tags:
```
python
# Load every dataframe used by this exploration cell exactly once.
df_items = load_items()
df_ratings = load_ratings()
df_tag = pd.read_csv(C.CONTENT_PATH / C.TAGS_FILENAME)
# Genome files are intentionally not loaded here; uncomment to explore them.
#df_genome_score = pd.read_csv("data/hackathon/content/genome-scores.csv")
# df_genome_tag = pd.read_csv("data/hackathon/content/genome-tags.csv")

# Example 1: number of characters in each item's label as a feature.
df_features = df_items[C.LABEL_COL].apply(len).to_frame('n_character_title')
display(df_features.head())

# Example 2: the raw tag column. `df_tag` was already read above, so the
# CSV is not parsed a second time.
df_features = df_tag[C.TAG]
display(df_features.head())

# (explore here other features)
```
%% Output
%% Cell type:markdown id:a2c9a2b6 tags:
# Build a content-based model
When ready, move the following class into the
*models.py*
script
%% Cell type:code id:16b0a602 tags:
```
python
# ContentBased
class ContentBased(AlgoBase):
    """Content-based rating predictor implemented as a Surprise algorithm.

    Item content features are computed once at construction time. ``fit``
    then learns one profile per user (a random value, the user's own ratings,
    or a regressor trained on the features of the items the user rated) and
    ``estimate`` scores a (user, item) pair from that profile.

    Args:
        features_method: Iterable of feature names to build. Supported values
            are ``"title_length"``, ``"movie_year"``, ``"genre"`` and
            ``"avg_rating"``.
        regressor_method: ``"random_score"``, ``"random_sample"`` or one of
            the regressor names accepted by ``_regressor_factory``.
    """

    def __init__(self, features_method, regressor_method):
        AlgoBase.__init__(self)
        self.regressor_method = regressor_method
        self.features_methods = features_method
        self.content_features = self.create_content_features(features_method)
        # Keyed by inner user id; filled in by fit().
        self.user_profile = {}
        # Keyed by raw user id; filled in by fit().
        self.user_profile_explain = {}

    def create_content_features(self, features_methods):
        """Content Analyzer: build the item feature table.

        Args:
            features_methods: Iterable of feature names (see class docstring).

        Returns:
            A DataFrame indexed like ``load_items()`` with one or more columns
            per requested feature; missing values are replaced by 0.

        Raises:
            NotImplementedError: If a requested feature name is not supported.
        """
        df_items = load_items()
        df_features = pd.DataFrame(index=df_items.index)

        for method in features_methods:
            if method == "title_length":
                df_title_length = df_items[C.LABEL_COL].apply(len).to_frame('title_length')
                df_features = pd.concat([df_features, df_title_length], axis=1)

            elif method == "movie_year":
                # First four-digit group in parentheses in the title;
                # titles without one get 0.
                df_movie_year = (
                    df_items['title']
                    .str.extract(r'\((\d{4})\)', expand=False)
                    .to_frame('movie_year')
                )
                df_features = pd.concat(
                    [df_features, df_movie_year.astype(float).fillna(0)], axis=1
                )

            elif method == "genre":
                # Each '|'-separated genre is one token; token_pattern=None
                # because a custom tokenizer is supplied.
                tfidf_vectorizer = TfidfVectorizer(
                    tokenizer=lambda x: x.split('|'), token_pattern=None
                )
                tfidf_matrix = tfidf_vectorizer.fit_transform(df_items['genres'])
                df_tfidf_genres = pd.DataFrame(
                    tfidf_matrix.toarray(),
                    index=df_items.index,
                    columns=tfidf_vectorizer.get_feature_names_out(),
                )
                df_features = pd.concat([df_features, df_tfidf_genres], axis=1)

            elif method == "avg_rating":
                # Ratings are only needed for this feature, so load them lazily.
                df_ratings = load_ratings()
                df_avg_rating = (
                    df_ratings.groupby('movieId')['rating'].mean().to_frame('avg_rating')
                )
                df_features = df_features.join(df_avg_rating, on='movieId')

            else:
                raise NotImplementedError(f'Feature method {method} not yet implemented')

        # Handle missing values (e.g. items without any rating).
        return df_features.fillna(0)

    def _regressor_factory(self):
        """Return a zero-argument callable building a fresh regressor.

        Using factories (instead of ready-made instances) guarantees that
        every user gets an independent model and that only the selected
        regressor class is ever instantiated.

        Raises:
            NotImplementedError: If ``self.regressor_method`` is unknown.
        """
        factories = {
            'linear_regression': lambda: LinearRegression(fit_intercept=False),
            'svr_regression': lambda: SVR(kernel='rbf', C=10, epsilon=0.2),
            'gradient_boosting': lambda: GradientBoostingRegressor(
                n_estimators=100, learning_rate=0.1, max_depth=3
            ),
            'random_forest': lambda: RandomForestRegressor(n_estimators=100),
            'lasso_regression': lambda: Lasso(alpha=0.1),
            'ridge_regression': lambda: Ridge(alpha=1.0),
            'elastic_net': lambda: ElasticNet(alpha=1.0, l1_ratio=0.5),
            'knn_regression': lambda: KNeighborsRegressor(n_neighbors=1),
            'decision_tree': lambda: DecisionTreeRegressor(max_depth=5),
            'adaboost': lambda: AdaBoostRegressor(n_estimators=50),
            'xgboost': lambda: XGBRegressor(
                n_estimators=100, learning_rate=0.1, max_depth=3
            ),
            'lightgbm': lambda: LGBMRegressor(
                n_estimators=100, learning_rate=0.1, max_depth=3
            ),
        }
        if self.regressor_method not in factories:
            raise NotImplementedError(
                f'Regressor method {self.regressor_method} not yet implemented'
            )
        return factories[self.regressor_method]

    def _feature_importance(self, raw_item_ids, ratings):
        """Return a {feature name: score} dict for one user's rated items.

        Each feature column (restricted to the user's items) is scaled by its
        L2 norm, weighted by the user's ratings, and divided by the sum of
        those ratings.
        """
        epsilon = 1e-10  # Small value to prevent division by zero
        ratings = np.array(ratings)
        feature_values = self.content_features.loc[raw_item_ids].values
        norms = np.linalg.norm(feature_values, axis=0) + epsilon
        importance = (feature_values / norms).T @ ratings
        importance /= np.sum(ratings)
        return dict(zip(self.content_features.columns, importance))

    def _fit_user_regressor(self, factory, raw_item_ids, ratings):
        """Train and return a new regressor on one user's rated items."""
        df_user = pd.DataFrame({'item_id': raw_item_ids, 'user_ratings': ratings})
        df_user = df_user.merge(
            self.content_features, left_on="item_id", right_index=True, how='left'
        )
        X = df_user.drop(columns=['item_id', 'user_ratings'])
        y = df_user['user_ratings']

        regressor = factory()
        regressor.fit(X, y)
        return regressor

    def fit(self, trainset):
        """Profile Learner: build one profile per user of ``trainset``.

        Populates ``self.user_profile`` (keyed by inner user id) and
        ``self.user_profile_explain`` (keyed by raw user id).

        Returns:
            ``self``, following the Surprise ``AlgoBase.fit`` convention.

        Raises:
            NotImplementedError: If ``self.regressor_method`` is unknown.
        """
        AlgoBase.fit(self, trainset)

        method = self.regressor_method
        factory = None
        if method not in ('random_score', 'random_sample'):
            # Resolve (and validate) the regressor before any per-user work.
            factory = self._regressor_factory()

        self.user_profile = {}
        self.user_profile_explain = {}

        for u in trainset.all_users():
            history = trainset.ur[u]  # list of (inner item id, rating)
            ratings = [rating for (_, rating) in history]
            raw_item_ids = [trainset.to_raw_iid(iid) for (iid, _) in history]

            self.user_profile_explain[trainset.to_raw_uid(u)] = self._feature_importance(
                raw_item_ids, ratings
            )

            if method == 'random_score':
                self.user_profile[u] = rd.uniform(0.5, 5)
            elif method == 'random_sample':
                self.user_profile[u] = ratings
            else:
                # A fresh model per user: re-fitting a shared instance would
                # leave every user pointing at the last user's model.
                self.user_profile[u] = self._fit_user_regressor(
                    factory, raw_item_ids, ratings
                )

        return self

    def estimate(self, u, i):
        """Scoring component used for item filtering.

        Args:
            u: Inner user id.
            i: Inner item id.

        Returns:
            The estimated rating for the pair.

        Raises:
            PredictionImpossible: If the user or the item is not in the
                trainset.
        """
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unknown.')

        if self.regressor_method == 'random_score':
            return rd.uniform(0.5, 5)

        if self.regressor_method == 'random_sample':
            return rd.choice(self.user_profile[u])

        raw_item_id = self.trainset.to_raw_iid(i)
        item_features = self.content_features.loc[raw_item_id, :].values.reshape(1, -1)
        # Wrap in a DataFrame so the column names match those seen at fit time.
        item_features_df = pd.DataFrame(item_features, columns=self.content_features.columns)
        return self.user_profile[u].predict(item_features_df)[0]

    def explain(self, u):
        """Return the feature-importance dict for raw user id ``u``.

        Returns ``None`` when no explanation was computed for that user.
        """
        return self.user_profile_explain.get(u)
# Example usage: fit a ridge-regression content-based model on the full
# trainset, then show the feature explanations of a few users.
cb = ContentBased(
    ["title_length", "movie_year", "genre", "avg_rating"],
    "ridge_regression",
)
surprise_data = load_ratings(surprise_format=True)
trainset = surprise_data.build_full_trainset()
testset = trainset.build_anti_testset()
cb.fit(trainset)

# Evaluation is currently disabled:
#print("RMSE: ", cb.rmse(testset))

# Example explanations, looked up by raw user id.
for raw_user_id in (11, 13, 17, 23, 27, 73):
    print(cb.explain(raw_user_id))
```
%% Output
{'title_length': 0.1497645139703848, 'movie_year': 0.16218667420100635, '(no genres listed)': 0.0, 'action': 0.09449072815753193, 'adventure': 0.08778978776313201, 'animation': 0.0, 'children': 0.038431411145366176, 'comedy': 0.07268129109348041, 'crime': 0.09469516433772891, 'documentary': 0.0611428358670058, 'drama': 0.10494783392380302, 'fantasy': 0.025806451608591505, 'film-noir': 0.025806451609512046, 'horror': 0.018342712153336858, 'imax': 0.06947533670577526, 'musical': 0.0, 'mystery': 0.06234903350217154, 'romance': 0.036771716124540825, 'sci-fi': 0.059571001735546115, 'thriller': 0.0993122803165238, 'war': 0.04002978709072218, 'western': 0.04547648227079719, 'avg_rating': 0.16263357553020436}
{'title_length': 0.12975573389578626, 'movie_year': 0.13738555574364605, '(no genres listed)': 0.0, 'action': 0.0640388318396414, 'adventure': 0.0827515664964472, 'animation': 0.05686854568650957, 'children': 0.06799492283569505, 'comedy': 0.07354182680364503, 'crime': 0.05543740962624167, 'documentary': 0.0, 'drama': 0.09170589087803577, 'fantasy': 0.061481521263689595, 'film-noir': 0.0, 'horror': 0.015113350123518238, 'imax': 0.04592205020685974, 'musical': 0.03201459126079391, 'mystery': 0.03412706135338736, 'romance': 0.05989121250223656, 'sci-fi': 0.04370793816378273, 'thriller': 0.045800659191095036, 'war': 0.04907194751877139, 'western': 0.027287416762806844, 'avg_rating': 0.13740560847192132}
{'title_length': 0.04702378569892371, 'movie_year': 0.052440003628289225, '(no genres listed)': 0.0, 'action': 0.020439581335728367, 'adventure': 0.015593308332521032, 'animation': 0.004256286923052558, 'children': 0.003520723090188317, 'comedy': 0.018972762464944913, 'crime': 0.028340544273099223, 'documentary': 0.005823989517206729, 'drama': 0.037415345194166824, 'fantasy': 0.013643903080149476, 'film-noir': 0.015390183296279798, 'horror': 0.01926898253629829, 'imax': 0.0014716703456143566, 'musical': 0.0061519348279224124, 'mystery': 0.02847033164163413, 'romance': 0.019827342468818163, 'sci-fi': 0.022573488552024915, 'thriller': 0.03522231545147593, 'war': 0.010339617301415098, 'western': 0.005663885036293055, 'avg_rating': 0.05327750989412312}
{'title_length': 0.033402138126294736, 'movie_year': 0.03710065977291947, '(no genres listed)': 0.0, 'action': 0.014528522669579273, 'adventure': 0.013963913494241694, 'animation': 0.005764814103226412, 'children': 0.006513197483932152, 'comedy': 0.017763201411495646, 'crime': 0.016002513666599556, 'documentary': 0.004292962983778595, 'drama': 0.027458210593047847, 'fantasy': 0.009302633945770895, 'film-noir': 0.006823368830454359, 'horror': 0.007391689869010394, 'imax': 0.004855154663168369, 'musical': 0.0058909467772061425, 'mystery': 0.012191560732760487, 'romance': 0.01723631022081761, 'sci-fi': 0.010817269433255231, 'thriller': 0.01658593988724716, 'war': 0.010193212979882352, 'western': 0.0052038255339472966, 'avg_rating': 0.03742403427834079}
{'title_length': 0.20154225634108316, 'movie_year': 0.20848962267389695, '(no genres listed)': 0.0, 'action': 0.04545454544645529, 'adventure': 0.04545454544730129, 'animation': 0.0, 'children': 0.0, 'comedy': 0.07177284969293253, 'crime': 0.1145252645738102, 'documentary': 0.0, 'drama': 0.16778172557550536, 'fantasy': 0.0, 'film-noir': 0.0, 'horror': 0.06315936177961773, 'imax': 0.0, 'musical': 0.0, 'mystery': 0.08510520557533159, 'romance': 0.09754755529442835, 'sci-fi': 0.045454545449454146, 'thriller': 0.12542163704872258, 'war': 0.08035304331050673, 'western': 0.0, 'avg_rating': 0.21152969571139305}
{'title_length': 0.021927486954368552, 'movie_year': 0.02488786702116846, '(no genres listed)': 0.0007363092498113207, 'action': 0.013836432470735639, 'adventure': 0.011610617815573265, 'animation': 0.007520799115717832, 'children': 0.006287966766754299, 'comedy': 0.012951125615087338, 'crime': 0.011084119744598393, 'documentary': 0.0018287715645832062, 'drama': 0.015221252640276463, 'fantasy': 0.008631010164284143, 'film-noir': 0.0024629052522566544, 'horror': 0.008816299251739122, 'imax': 0.005347204099216887, 'musical': 0.0038827346462235236, 'mystery': 0.0068652812039576095, 'romance': 0.008086664541950757, 'sci-fi': 0.010304269379559203, 'thriller': 0.013200133984104478, 'war': 0.005127335699821772, 'western': 0.0036215200349232765, 'avg_rating': 0.025470698706944836}
%% Cell type:markdown id:ffd75b7e tags:
The following script tests the ContentBased class
%% Cell type:code id:69d12f7d tags:
```
python
def test_contentbased_class(feature_method, regressor_method):
    """Smoke-test the ContentBased class.

    Fits the algorithm on the full trainset and prints its prediction for
    the first (user, item) tuple of the anti-testset.
    """
    full_trainset = load_ratings(surprise_format=True).build_full_trainset()

    algo = ContentBased(feature_method, regressor_method)
    algo.fit(full_trainset)

    first_pair = full_trainset.build_anti_testset()[0]
    raw_uid, raw_iid = first_pair[0], first_pair[1]
    print(algo.predict(raw_uid, raw_iid))


test_contentbased_class(
    ["title_length", "movie_year", "genre", "avg_rating"],
    "ridge_regression",
)
```
%% Output
user: 1 item: 10 r_ui = None est = 0.72 {'was_impossible': False}
Ce diff est replié.
Cliquez pour l'agrandir.
recommender.py
+
27
−
29
Voir le fichier @
ad331907
...
@@ -683,9 +683,8 @@ def compare_similarity_measures(trainset,testset):
...
@@ -683,9 +683,8 @@ def compare_similarity_measures(trainset,testset):
results
[
'
KNN_Pearson_RMSE
'
]
=
rmse_pearson
results
[
'
KNN_Pearson_RMSE
'
]
=
rmse_pearson
results
[
'
KNN_Pearson_MAE
'
]
=
mae_pearson
results
[
'
KNN_Pearson_MAE
'
]
=
mae_pearson
# Train and evaluate KNN model with Jaccard similarity
# Train and evaluate KNN model with Jaccard similarity
sim_options_jaccard
=
{
'
name
'
:
'
jaccard
'
,
'
user_based
'
:
True
}
sim_options_jaccard
=
{
'
name
'
:
''
,
'
user_based
'
:
True
}
user_based_jaccard
=
KNNWithMeans
(
sim_options
=
sim_options_jaccard
)
user_based_jaccard
=
KNNWithMeans
(
sim_options
=
sim_options_jaccard
)
user_based_jaccard
.
fit
(
trainset
)
user_based_jaccard
.
fit
(
trainset
)
predictions_jaccard
=
user_based_jaccard
.
test
(
testset
)
predictions_jaccard
=
user_based_jaccard
.
test
(
testset
)
...
@@ -771,33 +770,33 @@ def evaluate_inter_user_diversity(user_based_model, ratings_path, other_user_bas
...
@@ -771,33 +770,33 @@ def evaluate_inter_user_diversity(user_based_model, ratings_path, other_user_bas
inter_user_diversity_scores
[
'
UserBased
'
]
=
user_based_model
.
inter_user_diversity
(
all_top_n_recommendations_ub
)
inter_user_diversity_scores
[
'
UserBased
'
]
=
user_based_model
.
inter_user_diversity
(
all_top_n_recommendations_ub
)
#
#
#KNN model
# #KNN model
#
knn_model = RecommenderSystem_KNN(ratings_path)
knn_model
=
RecommenderSystem_KNN
(
ratings_path
)
#
knn_model.train_knn_model()
knn_model
.
train_knn_model
()
#
all_top_n_recommendations_knn = {}
all_top_n_recommendations_knn
=
{}
#
for user_id in range(knn_model.trainset.n_users):
for
user_id
in
range
(
knn_model
.
trainset
.
n_users
):
#
try:
try
:
#
trainset_user_id = knn_model.trainset.to_raw_uid(user_id)
trainset_user_id
=
knn_model
.
trainset
.
to_raw_uid
(
user_id
)
#
top_n_recommendations_knn = knn_model.get_top_n_recommendations(trainset_user_id, n=10)
top_n_recommendations_knn
=
knn_model
.
get_top_n_recommendations
(
trainset_user_id
,
n
=
10
)
#
all_top_n_recommendations_knn[trainset_user_id] = top_n_recommendations_knn
all_top_n_recommendations_knn
[
trainset_user_id
]
=
top_n_recommendations_knn
#
except ValueError:
except
ValueError
:
#
print(f"User {trainset_user_id} is not part of the training set for KNN model. Skipping...")
print
(
f
"
User
{
trainset_user_id
}
is not part of the training set for KNN model. Skipping...
"
)
#
inter_user_diversity_scores['KNN'] = knn_model.inter_user_diversity(all_top_n_recommendations_knn)
inter_user_diversity_scores
[
'
KNN
'
]
=
knn_model
.
inter_user_diversity
(
all_top_n_recommendations_knn
)
#
#
Other user-based models
# Other user-based models
#
for other_model in other_user_based_models:
for
other_model
in
other_user_based_models
:
#
other_model.load_model()
other_model
.
load_model
()
#
all_top_n_recommendations_other = {}
all_top_n_recommendations_other
=
{}
#
# Get predictions for all users in the test set
# Get predictions for all users in the test set
#
all_user_ids = set(user for user, _, _ in testset)
all_user_ids
=
set
(
user
for
user
,
_
,
_
in
testset
)
#
for user_id in all_user_ids:
for
user_id
in
all_user_ids
:
#
other_model.user_id = user_id # Update the user ID for the model
other_model
.
user_id
=
user_id
# Update the user ID for the model
#
top_n_predictions = other_model.get_top_n_predictions_for_user(ratings_path, n=10)
top_n_predictions
=
other_model
.
get_top_n_predictions_for_user
(
ratings_path
,
n
=
10
)
#
all_top_n_recommendations_other[user_id] = top_n_predictions
all_top_n_recommendations_other
[
user_id
]
=
top_n_predictions
#
inter_user_diversity_scores[f'Other_{other_model.user_name}'] = other_model.inter_user_diversity(all_top_n_recommendations_other)
inter_user_diversity_scores
[
f
'
Other_
{
other_model
.
user_name
}
'
]
=
other_model
.
inter_user_diversity
(
all_top_n_recommendations_other
)
return
inter_user_diversity_scores
return
inter_user_diversity_scores
...
@@ -983,8 +982,7 @@ class ContentBased(AlgoBase):
...
@@ -983,8 +982,7 @@ class ContentBased(AlgoBase):
'
knn_regression
'
:
KNeighborsRegressor
(
n_neighbors
=
1
),
'
knn_regression
'
:
KNeighborsRegressor
(
n_neighbors
=
1
),
'
decision_tree
'
:
DecisionTreeRegressor
(
max_depth
=
5
),
'
decision_tree
'
:
DecisionTreeRegressor
(
max_depth
=
5
),
'
adaboost
'
:
AdaBoostRegressor
(
n_estimators
=
50
),
'
adaboost
'
:
AdaBoostRegressor
(
n_estimators
=
50
),
'
xgboost
'
:
XGBRegressor
(
n_estimators
=
100
,
learning_rate
=
0.1
,
max_depth
=
3
),
'
xgboost
'
:
XGBRegressor
(
n_estimators
=
100
,
learning_rate
=
0.1
,
max_depth
=
3
)
'
lightgbm
'
:
LGBMRegressor
(
n_estimators
=
100
,
learning_rate
=
0.1
,
max_depth
=
3
)
}
}
if
self
.
regressor_method
not
in
regressor_models
:
if
self
.
regressor_method
not
in
regressor_models
:
...
...
Ce diff est replié.
Cliquez pour l'agrandir.
Aperçu
0%
Chargement en cours
Veuillez réessayer
ou
joindre un nouveau fichier
.
Annuler
You are about to add
0
people
to the discussion. Proceed with caution.
Terminez d'abord l'édition de ce message.
Enregistrer le commentaire
Annuler
Veuillez vous
inscrire
ou vous
connecter
pour commenter