Skip to content
GitLab
Explorer
Connexion
S'inscrire
Navigation principale
Rechercher ou aller à…
Projet
R
recomsys
Gestion
Activité
Membres
Labels
Programmation
Tickets
Tableaux des tickets
Jalons
Wiki
Code
Requêtes de fusion
Dépôt
Branches
Validations
Étiquettes
Graphe du dépôt
Comparer les révisions
Extraits de code
Compilation
Pipelines
Jobs
Planifications de pipeline
Artéfacts
Déploiement
Releases
Registre de paquets
Registre de conteneur
Registre de modèles
Opération
Environnements
Modules Terraform
Surveillance
Incidents
Analyse
Données d'analyse des chaînes de valeur
Analyse des contributeurs
Données d'analyse CI/CD
Données d'analyse du dépôt
Expériences du modèle
Aide
Aide
Support
Documentation de GitLab
Comparer les forfaits GitLab
Forum de la communauté
Contribuer à GitLab
Donner votre avis
Conditions générales et politique de confidentialité
Raccourcis clavier
?
Extraits de code
Groupes
Projets
Afficher davantage de fils d'Ariane
recommender_system
recomsys
Validations
c9d719a3
Valider
c9d719a3
rédigé
1 year ago
par
Adrien Payen
Parcourir les fichiers
Options
Téléchargements
Correctifs
Plain Diff
update content based
parent
8cdb6fca
Aucune branche associée trouvée
Branches contenant la validation
Aucune étiquette associée trouvée
Aucune requête de fusion associée trouvée
Modifications
1
Masquer les modifications d'espaces
En ligne
Côte à côte
Affichage de
1 fichier modifié
content_based.ipynb
+99
-45
99 ajouts, 45 suppressions
content_based.ipynb
avec
99 ajouts
et
45 suppressions
content_based.ipynb
+
99
−
45
Voir le fichier @
c9d719a3
...
@@ -10,19 +10,10 @@
...
@@ -10,19 +10,10 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count": 1
5
,
"execution_count": 1,
"id": "277473a3",
"id": "277473a3",
"metadata": {},
"metadata": {},
"outputs": [
"outputs": [],
{
"name": "stdout",
"output_type": "stream",
"text": [
"The autoreload extension is already loaded. To reload it, use:\n",
" %reload_ext autoreload\n"
]
}
],
"source": [
"source": [
"%load_ext autoreload\n",
"%load_ext autoreload\n",
"%autoreload 2\n",
"%autoreload 2\n",
...
@@ -40,7 +31,13 @@
...
@@ -40,7 +31,13 @@
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor\n",
"from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor\n",
"from sklearn.svm import SVR\n",
"from sklearn.svm import SVR\n",
"from sklearn.feature_extraction.text import TfidfVectorizer"
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.linear_model import Lasso, Ridge, ElasticNet\n",
"from sklearn.neighbors import KNeighborsRegressor\n",
"from sklearn.tree import DecisionTreeRegressor\n",
"from sklearn.ensemble import AdaBoostRegressor\n",
"from xgboost import XGBRegressor\n",
"from lightgbm import LGBMRegressor"
]
]
},
},
{
{
...
@@ -53,26 +50,89 @@
...
@@ -53,26 +50,89 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count":
16
,
"execution_count":
2
,
"id": "e8378976",
"id": "e8378976",
"metadata": {},
"metadata": {},
"outputs": [
"outputs": [
{
{
"ename": "FileNotFoundError",
"data": {
"evalue": "[Errno 2] No such file or directory: 'data/test/content/movies.csv'",
"text/html": [
"output_type": "error",
"<div>\n",
"traceback": [
"<style scoped>\n",
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
" .dataframe tbody tr th:only-of-type {\n",
"\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
" vertical-align: middle;\n",
"Cell \u001b[0;32mIn[16], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# All the dataframes\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m df_items \u001b[38;5;241m=\u001b[39m \u001b[43mload_items\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3\u001b[0m df_ratings \u001b[38;5;241m=\u001b[39m load_ratings()\n\u001b[1;32m 4\u001b[0m df_tag \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mread_csv(C\u001b[38;5;241m.\u001b[39mCONTENT_PATH\u001b[38;5;241m/\u001b[39mC\u001b[38;5;241m.\u001b[39mTAGS_FILENAME)\n",
" }\n",
"File \u001b[0;32m~/Desktop/Université/Recommender Systems/recomsys/loaders.py:34\u001b[0m, in \u001b[0;36mload_items\u001b[0;34m()\u001b[0m\n\u001b[1;32m 28\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mload_items\u001b[39m():\n\u001b[1;32m 29\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Loads items data.\u001b[39;00m\n\u001b[1;32m 30\u001b[0m \n\u001b[1;32m 31\u001b[0m \u001b[38;5;124;03m Returns:\u001b[39;00m\n\u001b[1;32m 32\u001b[0m \u001b[38;5;124;03m DataFrame: Items data.\u001b[39;00m\n\u001b[1;32m 33\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 34\u001b[0m df_items \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_csv\u001b[49m\u001b[43m(\u001b[49m\u001b[43mC\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mCONTENT_PATH\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m/\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mC\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mITEMS_FILENAME\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# ce qui se trouve dans le movie csv\u001b[39;00m\n\u001b[1;32m 35\u001b[0m df_items \u001b[38;5;241m=\u001b[39m df_items\u001b[38;5;241m.\u001b[39mset_index(C\u001b[38;5;241m.\u001b[39mITEM_ID_COL) \u001b[38;5;66;03m# movie id\u001b[39;00m\n\u001b[1;32m 36\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m df_items\n",
"\n",
"File \u001b[0;32m~/.pyenv/versions/3.12.0/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1026\u001b[0m, in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[0m\n\u001b[1;32m 1013\u001b[0m kwds_defaults \u001b[38;5;241m=\u001b[39m _refine_defaults_read(\n\u001b[1;32m 1014\u001b[0m dialect,\n\u001b[1;32m 1015\u001b[0m delimiter,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1022\u001b[0m dtype_backend\u001b[38;5;241m=\u001b[39mdtype_backend,\n\u001b[1;32m 1023\u001b[0m )\n\u001b[1;32m 1024\u001b[0m kwds\u001b[38;5;241m.\u001b[39mupdate(kwds_defaults)\n\u001b[0;32m-> 1026\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n",
" .dataframe tbody tr th {\n",
"File \u001b[0;32m~/.pyenv/versions/3.12.0/lib/python3.12/site-packages/pandas/io/parsers/readers.py:620\u001b[0m, in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 617\u001b[0m _validate_names(kwds\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnames\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[1;32m 619\u001b[0m \u001b[38;5;66;03m# Create the parser.\u001b[39;00m\n\u001b[0;32m--> 620\u001b[0m parser \u001b[38;5;241m=\u001b[39m \u001b[43mTextFileReader\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 622\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m chunksize \u001b[38;5;129;01mor\u001b[39;00m iterator:\n\u001b[1;32m 623\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m parser\n",
" vertical-align: top;\n",
"File \u001b[0;32m~/.pyenv/versions/3.12.0/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1620\u001b[0m, in \u001b[0;36mTextFileReader.__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m 1617\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m kwds[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 1619\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles: IOHandles \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 1620\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_engine\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mengine\u001b[49m\u001b[43m)\u001b[49m\n",
" }\n",
"File \u001b[0;32m~/.pyenv/versions/3.12.0/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1880\u001b[0m, in \u001b[0;36mTextFileReader._make_engine\u001b[0;34m(self, f, engine)\u001b[0m\n\u001b[1;32m 1878\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m mode:\n\u001b[1;32m 1879\u001b[0m mode \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m-> 1880\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;241m=\u001b[39m \u001b[43mget_handle\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1881\u001b[0m \u001b[43m \u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1882\u001b[0m \u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1883\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1884\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompression\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcompression\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1885\u001b[0m \u001b[43m \u001b[49m\u001b[43mmemory_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmemory_map\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1886\u001b[0m \u001b[43m \u001b[49m\u001b[43mis_text\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_text\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1887\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding_errors\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstrict\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1888\u001b[0m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstorage_options\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1889\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1890\u001b[0m 
\u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1891\u001b[0m f \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles\u001b[38;5;241m.\u001b[39mhandle\n",
"\n",
"File \u001b[0;32m~/.pyenv/versions/3.12.0/lib/python3.12/site-packages/pandas/io/common.py:873\u001b[0m, in \u001b[0;36mget_handle\u001b[0;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[1;32m 868\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(handle, \u001b[38;5;28mstr\u001b[39m):\n\u001b[1;32m 869\u001b[0m \u001b[38;5;66;03m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[1;32m 870\u001b[0m \u001b[38;5;66;03m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[1;32m 871\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mencoding \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mmode:\n\u001b[1;32m 872\u001b[0m \u001b[38;5;66;03m# Encoding\u001b[39;00m\n\u001b[0;32m--> 873\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[1;32m 874\u001b[0m \u001b[43m \u001b[49m\u001b[43mhandle\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 875\u001b[0m \u001b[43m \u001b[49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 876\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencoding\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 877\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 878\u001b[0m \u001b[43m \u001b[49m\u001b[43mnewline\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 879\u001b[0m \u001b[43m 
\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 880\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 881\u001b[0m \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[1;32m 882\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(handle, ioargs\u001b[38;5;241m.\u001b[39mmode)\n",
" .dataframe thead th {\n",
"\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'data/test/content/movies.csv'"
" text-align: right;\n",
]
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>n_character_title</th>\n",
" </tr>\n",
" <tr>\n",
" <th>movieId</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>4993</th>\n",
" <td>57</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5952</th>\n",
" <td>45</td>\n",
" </tr>\n",
" <tr>\n",
" <th>527</th>\n",
" <td>23</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2028</th>\n",
" <td>26</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4308</th>\n",
" <td>19</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" n_character_title\n",
"movieId \n",
"4993 57\n",
"5952 45\n",
"527 23\n",
"2028 26\n",
"4308 19"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"0 long\n",
"1 boring\n",
"2 long\n",
"3 romance\n",
"4 stupidity\n",
"Name: tag, dtype: object"
]
},
"metadata": {},
"output_type": "display_data"
}
}
],
],
"source": [
"source": [
...
@@ -106,26 +166,20 @@
...
@@ -106,26 +166,20 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count":
null
,
"execution_count":
3
,
"id": "16b0a602",
"id": "16b0a602",
"metadata": {},
"metadata": {},
"outputs": [
"outputs": [
{
{
"name": "stdout",
"ename": "NameError",
"output_type": "stream",
"evalue": "name 'Lasso' is not defined",
"text": [
"output_type": "error",
"0\n",
"traceback": [
"1\n",
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"2\n",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"3\n",
"Cell \u001b[0;32mIn[3], line 147\u001b[0m\n\u001b[1;32m 145\u001b[0m trainset \u001b[38;5;241m=\u001b[39m surprise_data\u001b[38;5;241m.\u001b[39mbuild_full_trainset()\n\u001b[1;32m 146\u001b[0m testset \u001b[38;5;241m=\u001b[39m trainset\u001b[38;5;241m.\u001b[39mbuild_anti_testset()\n\u001b[0;32m--> 147\u001b[0m \u001b[43mcb\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtrainset\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 150\u001b[0m \u001b[38;5;66;03m#print(\"RMSE: \", cb.rmse(testset))\u001b[39;00m\n\u001b[1;32m 151\u001b[0m \n\u001b[1;32m 152\u001b[0m \n\u001b[1;32m 153\u001b[0m \u001b[38;5;66;03m#Example explanations for users:\u001b[39;00m\n\u001b[1;32m 154\u001b[0m \u001b[38;5;28mprint\u001b[39m(cb\u001b[38;5;241m.\u001b[39mexplain(\u001b[38;5;241m11\u001b[39m))\n",
"4\n",
"Cell \u001b[0;32mIn[3], line 88\u001b[0m, in \u001b[0;36mContentBased.fit\u001b[0;34m(self, trainset)\u001b[0m\n\u001b[1;32m 80\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39muser_profile[u] \u001b[38;5;241m=\u001b[39m [rating \u001b[38;5;28;01mfor\u001b[39;00m (_, rating) \u001b[38;5;129;01min\u001b[39;00m trainset\u001b[38;5;241m.\u001b[39mur[u]]\n\u001b[1;32m 82\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 83\u001b[0m regressor_models \u001b[38;5;241m=\u001b[39m {\n\u001b[1;32m 84\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlinear_regression\u001b[39m\u001b[38;5;124m'\u001b[39m: LinearRegression(fit_intercept\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m),\n\u001b[1;32m 85\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124msvr_regression\u001b[39m\u001b[38;5;124m'\u001b[39m: SVR(kernel\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mrbf\u001b[39m\u001b[38;5;124m'\u001b[39m, C\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m10\u001b[39m, epsilon\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.2\u001b[39m),\n\u001b[1;32m 86\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mgradient_boosting\u001b[39m\u001b[38;5;124m'\u001b[39m: GradientBoostingRegressor(n_estimators\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m100\u001b[39m, learning_rate\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.1\u001b[39m, max_depth\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m3\u001b[39m),\n\u001b[1;32m 87\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mrandom_forest\u001b[39m\u001b[38;5;124m'\u001b[39m: RandomForestRegressor(n_estimators\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m100\u001b[39m),\n\u001b[0;32m---> 88\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlasso_regression\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[43mLasso\u001b[49m(alpha\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.1\u001b[39m),\n\u001b[1;32m 89\u001b[0m 
\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mridge_regression\u001b[39m\u001b[38;5;124m'\u001b[39m: Ridge(alpha\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1.0\u001b[39m),\n\u001b[1;32m 90\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124melastic_net\u001b[39m\u001b[38;5;124m'\u001b[39m: ElasticNet(alpha\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1.0\u001b[39m, l1_ratio\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.5\u001b[39m),\n\u001b[1;32m 91\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mknn_regression\u001b[39m\u001b[38;5;124m'\u001b[39m: KNeighborsRegressor(n_neighbors\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m),\n\u001b[1;32m 92\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdecision_tree\u001b[39m\u001b[38;5;124m'\u001b[39m: DecisionTreeRegressor(max_depth\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m5\u001b[39m),\n\u001b[1;32m 93\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124madaboost\u001b[39m\u001b[38;5;124m'\u001b[39m: AdaBoostRegressor(n_estimators\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m50\u001b[39m),\n\u001b[1;32m 94\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mxgboost\u001b[39m\u001b[38;5;124m'\u001b[39m: XGBRegressor(n_estimators\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m100\u001b[39m, learning_rate\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.1\u001b[39m, max_depth\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m3\u001b[39m),\n\u001b[1;32m 95\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlightgbm\u001b[39m\u001b[38;5;124m'\u001b[39m: LGBMRegressor(n_estimators\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m100\u001b[39m, learning_rate\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.1\u001b[39m, max_depth\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m3\u001b[39m)\n\u001b[1;32m 96\u001b[0m }\n\u001b[1;32m 98\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mregressor_method \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m regressor_models:\n\u001b[1;32m 99\u001b[0m 
\u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mNotImplementedError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mRegressor method \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mregressor_method\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m not yet implemented\u001b[39m\u001b[38;5;124m'\u001b[39m)\n",
"5\n",
"\u001b[0;31mNameError\u001b[0m: name 'Lasso' is not defined"
"None\n",
"{'n_character_title': array([0.03019692])}\n",
"{'n_character_title': array([0.04098154])}\n",
"{'n_character_title': array([0.02942264])}\n",
"{'n_character_title': array([0.08196307])}\n",
"{'n_character_title': array([0.02798739])}\n"
]
]
}
}
],
],
...
@@ -344,7 +398,7 @@
...
@@ -344,7 +398,7 @@
"name": "python",
"name": "python",
"nbconvert_exporter": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"pygments_lexer": "ipython3",
"version": "3.12.
0
"
"version": "3.12.
2
"
}
}
},
},
"nbformat": 4,
"nbformat": 4,
...
...
%% Cell type:markdown id:82d5ca82 tags:
%% Cell type:markdown id:82d5ca82 tags:
# Packages
# Packages
%% Cell type:code id:277473a3 tags:
%% Cell type:code id:277473a3 tags:
```
python
```
python
%
load_ext
autoreload
%
load_ext
autoreload
%
autoreload
2
%
autoreload
2
import
numpy
as
np
import
numpy
as
np
import
pandas
as
pd
import
pandas
as
pd
import
random
as
rd
import
random
as
rd
from
surprise
import
AlgoBase
from
surprise
import
AlgoBase
from
surprise.prediction_algorithms.predictions
import
PredictionImpossible
from
surprise.prediction_algorithms.predictions
import
PredictionImpossible
from
loaders
import
load_ratings
from
loaders
import
load_ratings
from
loaders
import
load_items
from
loaders
import
load_items
from
constants
import
Constant
as
C
from
constants
import
Constant
as
C
from
sklearn.linear_model
import
LinearRegression
from
sklearn.linear_model
import
LinearRegression
from
sklearn.ensemble
import
GradientBoostingRegressor
,
RandomForestRegressor
from
sklearn.ensemble
import
GradientBoostingRegressor
,
RandomForestRegressor
from
sklearn.svm
import
SVR
from
sklearn.svm
import
SVR
from
sklearn.feature_extraction.text
import
TfidfVectorizer
from
sklearn.feature_extraction.text
import
TfidfVectorizer
from
sklearn.linear_model
import
Lasso
,
Ridge
,
ElasticNet
from
sklearn.neighbors
import
KNeighborsRegressor
from
sklearn.tree
import
DecisionTreeRegressor
from
sklearn.ensemble
import
AdaBoostRegressor
from
xgboost
import
XGBRegressor
from
lightgbm
import
LGBMRegressor
```
```
%% Output
The autoreload extension is already loaded. To reload it, use:
%reload_ext autoreload
%% Cell type:markdown id:a42c16bf tags:
%% Cell type:markdown id:a42c16bf tags:
# Explore and select content features
# Explore and select content features
%% Cell type:code id:e8378976 tags:
%% Cell type:code id:e8378976 tags:
```
python
```
python
# All the dataframes
# Load the item (movie) metadata, the user ratings, and the user-supplied tags.
df_items = load_items()
df_ratings = load_ratings()
df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME)
#df_genome_score = pd.read_csv("data/hackathon/content/genome-scores.csv")
# df_genome_tag = pd.read_csv("data/hackathon/content/genome-tags.csv")

# Example 1 : create title_length features
# Candidate feature: number of characters in the movie title.
df_features = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('n_character_title')
display(df_features.head())

# Candidate feature: the raw user-supplied tag text.
# NOTE(review): this re-reads the tags file and overwrites df_features from
# Example 1 — intentional here, since this cell only previews candidates.
df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME)
df_features = df_tag[C.TAG]
display(df_features.head())
# (explore here other features)
```
```
%% Output
%% Output
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
Cell In[16], line 2
1 # All the dataframes
----> 2 df_items = load_items()
3 df_ratings = load_ratings()
4 df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME)
File ~/Desktop/Université/Recommender Systems/recomsys/loaders.py:34, in load_items()
28 def load_items():
29 """Loads items data.
30
31 Returns:
32 DataFrame: Items data.
33 """
---> 34 df_items = pd.read_csv(C.CONTENT_PATH / C.ITEMS_FILENAME) # ce qui se trouve dans le movie csv
35 df_items = df_items.set_index(C.ITEM_ID_COL) # movie id
36 return df_items
File ~/.pyenv/versions/3.12.0/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1026, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)
1013 kwds_defaults = _refine_defaults_read(
1014 dialect,
1015 delimiter,
(...)
1022 dtype_backend=dtype_backend,
1023 )
1024 kwds.update(kwds_defaults)
-> 1026 return _read(filepath_or_buffer, kwds)
File ~/.pyenv/versions/3.12.0/lib/python3.12/site-packages/pandas/io/parsers/readers.py:620, in _read(filepath_or_buffer, kwds)
617 _validate_names(kwds.get("names", None))
619 # Create the parser.
--> 620 parser = TextFileReader(filepath_or_buffer,
**
kwds)
622 if chunksize or iterator:
623 return parser
File ~/.pyenv/versions/3.12.0/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1620, in TextFileReader.__init__(self, f, engine,
**
kwds)
1617 self.options["has_index_names"] = kwds["has_index_names"]
1619 self.handles: IOHandles | None = None
-> 1620 self._engine = self._make_engine(f, self.engine)
File ~/.pyenv/versions/3.12.0/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1880, in TextFileReader._make_engine(self, f, engine)
1878 if "b" not in mode:
1879 mode += "b"
-> 1880 self.handles = get_handle(
1881 f,
1882 mode,
1883 encoding=self.options.get("encoding", None),
1884 compression=self.options.get("compression", None),
1885 memory_map=self.options.get("memory_map", False),
1886 is_text=is_text,
1887 errors=self.options.get("encoding_errors", "strict"),
1888 storage_options=self.options.get("storage_options", None),
1889 )
1890 assert self.handles is not None
1891 f = self.handles.handle
File ~/.pyenv/versions/3.12.0/lib/python3.12/site-packages/pandas/io/common.py:873, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
868 elif isinstance(handle, str):
869 # Check whether the filename is to be opened in binary mode.
870 # Binary mode does not support 'encoding' and 'newline'.
871 if ioargs.encoding and "b" not in ioargs.mode:
872 # Encoding
--> 873 handle = open(
874 handle,
875 ioargs.mode,
876 encoding=ioargs.encoding,
877 errors=errors,
878 newline="",
879 )
880 else:
881 # Binary mode
882 handle = open(handle, ioargs.mode)
FileNotFoundError: [Errno 2] No such file or directory: 'data/test/content/movies.csv'
%% Cell type:markdown id:a2c9a2b6 tags:
%% Cell type:markdown id:a2c9a2b6 tags:
# Build a content-based model
# Build a content-based model
When ready, move the following class in the
*models.py*
script
When ready, move the following class in the
*models.py*
script
%% Cell type:code id:16b0a602 tags:
%% Cell type:code id:16b0a602 tags:
```
python
```
python
# ContentBased
class ContentBased(AlgoBase):
    """Content-based recommender.

    Builds one feature vector per item (Content Analyzer), learns one model
    per user mapping item features to that user's ratings (Profile Learner),
    and scores unseen items with the per-user model.
    """

    def __init__(self, features_method, regressor_method):
        """
        Args:
            features_method: iterable of feature names to build, among
                "title_length", "movie_year", "genre", "avg_rating".
            regressor_method: per-user scoring strategy — 'random_score',
                'random_sample', or one of the regressor names in fit().
        """
        AlgoBase.__init__(self)
        self.regressor_method = regressor_method
        self.features_methods = features_method
        self.content_features = self.create_content_features(features_method)
        self.user_profile = {}          # inner user id -> fitted model / random profile
        self.user_profile_explain = {}  # raw user id -> {feature name: importance}

    def create_content_features(self, features_methods):
        """Content Analyzer

        Returns a DataFrame of item features indexed by raw item id.

        Raises:
            NotImplementedError: if an unknown feature name is requested.
        """
        df_items = load_items()
        df_ratings = load_ratings()
        # NOTE: the previous version also read the tags and genome CSVs here,
        # but never used them; the genome files are not present in every data
        # set and crashed with FileNotFoundError, so the reads were dropped.

        df_features = pd.DataFrame(index=df_items.index)

        for method in features_methods:
            if method == "title_length":
                # Number of characters in the movie title.
                df_title_length = df_items[C.LABEL_COL].apply(len).to_frame('title_length')
                df_features = pd.concat([df_features, df_title_length], axis=1)

            elif method == "movie_year":
                # Release year parsed from the "(YYYY)" suffix of the title;
                # 0 when the title carries no year.
                df_movie_year = df_items['title'].str.extract(r'\((\d{4})\)', expand=False).to_frame('movie_year')
                df_features = pd.concat([df_features, df_movie_year.astype(float).fillna(0)], axis=1)

            elif method == "genre":
                # TF-IDF over the pipe-separated genre list.
                tfidf_vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split('|'), token_pattern=None)
                tfidf_matrix = tfidf_vectorizer.fit_transform(df_items['genres'])
                df_tfidf_genres = pd.DataFrame(
                    tfidf_matrix.toarray(),
                    index=df_items.index,
                    columns=tfidf_vectorizer.get_feature_names_out(),
                )
                df_features = pd.concat([df_features, df_tfidf_genres], axis=1)

            elif method == "avg_rating":
                # Mean observed rating per movie (joined on the item index).
                df_avg_rating = df_ratings.groupby('movieId')['rating'].mean().to_frame('avg_rating')
                df_features = df_features.join(df_avg_rating, on='movieId')

            else:
                raise NotImplementedError(f'Feature method {method} not yet implemented')

        # Handle missing values in df_features (e.g. items with no rating).
        df_features.fillna(0, inplace=True)
        return df_features

    def fit(self, trainset):
        """Profile Learner

        Fits one regressor (or random profile) per user and precomputes the
        per-user feature-importance dict used by explain().
        """
        AlgoBase.fit(self, trainset)

        # Preallocate user profiles
        self.user_profile = {u: None for u in trainset.all_users()}
        self.user_profile_explain = {}

        epsilon = 1e-10  # Small value to prevent division by zero

        # Explanation weights: ratings-weighted, column-normalised feature sums.
        for u in trainset.all_users():
            raw_user_id = trainset.to_raw_uid(u)
            user_ratings = np.array([rating for (_, rating) in trainset.ur[u]])
            raw_item_ids = [trainset.to_raw_iid(iid) for (iid, _) in trainset.ur[u]]

            feature_values = self.content_features.loc[raw_item_ids].values
            norms = np.linalg.norm(feature_values, axis=0) + epsilon
            weighted_features = feature_values / norms
            feature_importance = weighted_features.T @ user_ratings
            feature_importance /= np.sum(user_ratings)

            self.user_profile_explain[raw_user_id] = dict(
                zip(self.content_features.columns, feature_importance))

        if self.regressor_method == 'random_score':
            for u in self.user_profile:
                self.user_profile[u] = rd.uniform(0.5, 5)

        elif self.regressor_method == 'random_sample':
            for u in self.user_profile:
                self.user_profile[u] = [rating for (_, rating) in trainset.ur[u]]

        else:
            # Factories, not instances: each user must get a FRESH model.
            # (The previous version fitted one shared instance in-place, so
            # every user profile ended up aliasing the model fitted on the
            # LAST user of the loop.)
            regressor_factories = {
                'linear_regression': lambda: LinearRegression(fit_intercept=False),
                'svr_regression': lambda: SVR(kernel='rbf', C=10, epsilon=0.2),
                'gradient_boosting': lambda: GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3),
                'random_forest': lambda: RandomForestRegressor(n_estimators=100),
                'lasso_regression': lambda: Lasso(alpha=0.1),
                'ridge_regression': lambda: Ridge(alpha=1.0),
                'elastic_net': lambda: ElasticNet(alpha=1.0, l1_ratio=0.5),
                'knn_regression': lambda: KNeighborsRegressor(n_neighbors=1),
                'decision_tree': lambda: DecisionTreeRegressor(max_depth=5),
                'adaboost': lambda: AdaBoostRegressor(n_estimators=50),
                'xgboost': lambda: XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3),
                'lightgbm': lambda: LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=3),
            }

            if self.regressor_method not in regressor_factories:
                raise NotImplementedError(f'Regressor method {self.regressor_method} not yet implemented')

            for u in self.user_profile:
                user_ratings = [rating for (_, rating) in trainset.ur[u]]
                raw_item_ids = [trainset.to_raw_iid(iid) for (iid, _) in trainset.ur[u]]

                df_user = pd.DataFrame({'item_id': raw_item_ids, 'user_ratings': user_ratings})
                df_user = df_user.merge(self.content_features, left_on="item_id",
                                        right_index=True, how='left')

                X = df_user.drop(columns=['item_id', 'user_ratings'])
                y = df_user['user_ratings']

                regressor = regressor_factories[self.regressor_method]()
                regressor.fit(X, y)
                self.user_profile[u] = regressor

    def estimate(self, u, i):
        """Scoring component used for item filtering

        Raises:
            PredictionImpossible: when the user or item is not in the trainset.
        """
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unknown.')

        if self.regressor_method == 'random_score':
            return rd.uniform(0.5, 5)
        elif self.regressor_method == 'random_sample':
            return rd.choice(self.user_profile[u])
        else:
            raw_item_id = self.trainset.to_raw_iid(i)
            item_features = self.content_features.loc[raw_item_id, :].values.reshape(1, -1)
            regressor = self.user_profile[u]
            # Wrap in a DataFrame with the training column names so sklearn
            # does not warn about missing feature names.
            item_features_df = pd.DataFrame(item_features, columns=self.content_features.columns)
            return regressor.predict(item_features_df)[0]

    def explain(self, u):
        """Return the feature-importance dict for raw user id u, or None if unknown."""
        if u in self.user_profile_explain:
            return self.user_profile_explain[u]
        else:
            return None
# Example usage:
cb = ContentBased(["title_length", "movie_year", "genre", "avg_rating"], "ridge_regression")

surprise_data = load_ratings(surprise_format=True)
trainset = surprise_data.build_full_trainset()
testset = trainset.build_anti_testset()
cb.fit(trainset)

# print("RMSE: ", cb.rmse(testset))

# Example explanations for users:
for user_id in (11, 13, 17, 23, 27, 73):
    print(cb.explain(user_id))
```
```
%% Output
%% Output
0
---------------------------------------------------------------------------
1
NameError Traceback (most recent call last)
2
Cell In[3], line 147
3
145 trainset = surprise_data.build_full_trainset()
4
146 testset = trainset.build_anti_testset()
5
--> 147 cb.fit(trainset)
None
150 #print("RMSE: ", cb.rmse(testset))
{'n_character_title': array([0.03019692])}
151
{'n_character_title': array([0.04098154])}
152
{'n_character_title': array([0.02942264])}
153 #Example explanations for users:
{'n_character_title': array([0.08196307])}
154 print(cb.explain(11))
{'n_character_title': array([0.02798739])}
Cell In[3], line 88, in ContentBased.fit(self, trainset)
80 self.user_profile[u] = [rating for (_, rating) in trainset.ur[u]]
82 else:
83 regressor_models = {
84 'linear_regression': LinearRegression(fit_intercept=False),
85 'svr_regression': SVR(kernel='rbf', C=10, epsilon=0.2),
86 'gradient_boosting': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3),
87 'random_forest': RandomForestRegressor(n_estimators=100),
---> 88 'lasso_regression': Lasso(alpha=0.1),
89 'ridge_regression': Ridge(alpha=1.0),
90 'elastic_net': ElasticNet(alpha=1.0, l1_ratio=0.5),
91 'knn_regression': KNeighborsRegressor(n_neighbors=1),
92 'decision_tree': DecisionTreeRegressor(max_depth=5),
93 'adaboost': AdaBoostRegressor(n_estimators=50),
94 'xgboost': XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3),
95 'lightgbm': LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)
96 }
98 if self.regressor_method not in regressor_models:
99 raise NotImplementedError(f'Regressor method {self.regressor_method} not yet implemented')
NameError: name 'Lasso' is not defined
%% Cell type:markdown id:ffd75b7e tags:
%% Cell type:markdown id:ffd75b7e tags:
The following script test the ContentBased class
The following script test the ContentBased class
%% Cell type:code id:69d12f7d tags:
%% Cell type:code id:69d12f7d tags:
```
python
```
python
def test_contentbased_class(feature_method, regressor_method):
    """Test the ContentBased class.
    Tries to make a prediction on the first (user, item) tuple of the anti_test_set.
    """
    ratings = load_ratings(surprise_format=True)
    full_train = ratings.build_full_trainset()

    algo = ContentBased(feature_method, regressor_method)
    algo.fit(full_train)

    # First (user, item, fill) entry of the anti-test set.
    first_entry = full_train.build_anti_testset()[0]
    print(algo.predict(first_entry[0], first_entry[1]))


test_contentbased_class(["title_length", "movie_year", "genre", "avg_rating"], "ridge_regression")
```
```
...
...
Ce diff est replié.
Cliquez pour l'agrandir.
Aperçu
0%
Chargement en cours
Veuillez réessayer
ou
joindre un nouveau fichier
.
Annuler
You are about to add
0
people
to the discussion. Proceed with caution.
Terminez d'abord l'édition de ce message.
Enregistrer le commentaire
Annuler
Veuillez vous
inscrire
ou vous
connecter
pour commenter