Skip to content
GitLab
Explorer
Connexion
S'inscrire
Navigation principale
Rechercher ou aller à…
Projet
R
recomsys
Gestion
Activité
Membres
Labels
Programmation
Tickets
Tableaux des tickets
Jalons
Wiki
Code
Requêtes de fusion
Dépôt
Branches
Validations
Étiquettes
Graphe du dépôt
Comparer les révisions
Extraits de code
Compilation
Pipelines
Jobs
Planifications de pipeline
Artéfacts
Déploiement
Releases
Registre de paquets
Registre de conteneur
Registre de modèles
Opération
Environnements
Modules Terraform
Surveillance
Incidents
Analyse
Données d'analyse des chaînes de valeur
Analyse des contributeurs
Données d'analyse CI/CD
Données d'analyse du dépôt
Expériences du modèle
Aide
Aide
Support
Documentation de GitLab
Comparer les forfaits GitLab
Forum de la communauté
Contribuer à GitLab
Donner votre avis
Conditions générales et politique de confidentialité
Raccourcis clavier
?
Extraits de code
Groupes
Projets
Afficher davantage de fils d'Ariane
recommender_system
recomsys
Validations
6456e17a
Valider
6456e17a
rédigé
1 year ago
par
Nathanaël Kindidi
Parcourir les fichiers
Options
Téléchargements
Correctifs
Plain Diff
modif class content based
parent
d549c086
Aucune branche associée trouvée
Branches contenant la validation
Aucune étiquette associée trouvée
Aucune requête de fusion associée trouvée
Modifications
1
Masquer les modifications d'espaces
En ligne
Côte à côte
Affichage de
1 fichier modifié
content_based.ipynb
+121
-775
121 ajouts, 775 suppressions
content_based.ipynb
avec
121 ajouts
et
775 suppressions
content_based.ipynb
+
121
−
775
Voir le fichier @
6456e17a
...
@@ -10,7 +10,7 @@
...
@@ -10,7 +10,7 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count":
756
,
"execution_count":
15
,
"id": "277473a3",
"id": "277473a3",
"metadata": {},
"metadata": {},
"outputs": [
"outputs": [
...
@@ -53,89 +53,26 @@
...
@@ -53,89 +53,26 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count":
757
,
"execution_count":
16
,
"id": "e8378976",
"id": "e8378976",
"metadata": {},
"metadata": {},
"outputs": [
"outputs": [
{
{
"data": {
"ename": "FileNotFoundError",
"text/html": [
"evalue": "[Errno 2] No such file or directory: 'data/test/content/movies.csv'",
"<div>\n",
"output_type": "error",
"<style scoped>\n",
"traceback": [
" .dataframe tbody tr th:only-of-type {\n",
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
" vertical-align: middle;\n",
"\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
" }\n",
"Cell \u001b[0;32mIn[16], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# All the dataframes\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m df_items \u001b[38;5;241m=\u001b[39m \u001b[43mload_items\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3\u001b[0m df_ratings \u001b[38;5;241m=\u001b[39m load_ratings()\n\u001b[1;32m 4\u001b[0m df_tag \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mread_csv(C\u001b[38;5;241m.\u001b[39mCONTENT_PATH\u001b[38;5;241m/\u001b[39mC\u001b[38;5;241m.\u001b[39mTAGS_FILENAME)\n",
"\n",
"File \u001b[0;32m~/Desktop/Université/Recommender Systems/recomsys/loaders.py:34\u001b[0m, in \u001b[0;36mload_items\u001b[0;34m()\u001b[0m\n\u001b[1;32m 28\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mload_items\u001b[39m():\n\u001b[1;32m 29\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Loads items data.\u001b[39;00m\n\u001b[1;32m 30\u001b[0m \n\u001b[1;32m 31\u001b[0m \u001b[38;5;124;03m Returns:\u001b[39;00m\n\u001b[1;32m 32\u001b[0m \u001b[38;5;124;03m DataFrame: Items data.\u001b[39;00m\n\u001b[1;32m 33\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 34\u001b[0m df_items \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_csv\u001b[49m\u001b[43m(\u001b[49m\u001b[43mC\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mCONTENT_PATH\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m/\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mC\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mITEMS_FILENAME\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# ce qui se trouve dans le movie csv\u001b[39;00m\n\u001b[1;32m 35\u001b[0m df_items \u001b[38;5;241m=\u001b[39m df_items\u001b[38;5;241m.\u001b[39mset_index(C\u001b[38;5;241m.\u001b[39mITEM_ID_COL) \u001b[38;5;66;03m# movie id\u001b[39;00m\n\u001b[1;32m 36\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m df_items\n",
" .dataframe tbody tr th {\n",
"File \u001b[0;32m~/.pyenv/versions/3.12.0/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1026\u001b[0m, in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[0m\n\u001b[1;32m 1013\u001b[0m kwds_defaults \u001b[38;5;241m=\u001b[39m _refine_defaults_read(\n\u001b[1;32m 1014\u001b[0m dialect,\n\u001b[1;32m 1015\u001b[0m delimiter,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1022\u001b[0m dtype_backend\u001b[38;5;241m=\u001b[39mdtype_backend,\n\u001b[1;32m 1023\u001b[0m )\n\u001b[1;32m 1024\u001b[0m kwds\u001b[38;5;241m.\u001b[39mupdate(kwds_defaults)\n\u001b[0;32m-> 1026\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n",
" vertical-align: top;\n",
"File \u001b[0;32m~/.pyenv/versions/3.12.0/lib/python3.12/site-packages/pandas/io/parsers/readers.py:620\u001b[0m, in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 617\u001b[0m _validate_names(kwds\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnames\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[1;32m 619\u001b[0m \u001b[38;5;66;03m# Create the parser.\u001b[39;00m\n\u001b[0;32m--> 620\u001b[0m parser \u001b[38;5;241m=\u001b[39m \u001b[43mTextFileReader\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 622\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m chunksize \u001b[38;5;129;01mor\u001b[39;00m iterator:\n\u001b[1;32m 623\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m parser\n",
" }\n",
"File \u001b[0;32m~/.pyenv/versions/3.12.0/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1620\u001b[0m, in \u001b[0;36mTextFileReader.__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m 1617\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m kwds[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 1619\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles: IOHandles \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 1620\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_engine\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mengine\u001b[49m\u001b[43m)\u001b[49m\n",
"\n",
"File \u001b[0;32m~/.pyenv/versions/3.12.0/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1880\u001b[0m, in \u001b[0;36mTextFileReader._make_engine\u001b[0;34m(self, f, engine)\u001b[0m\n\u001b[1;32m 1878\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m mode:\n\u001b[1;32m 1879\u001b[0m mode \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m-> 1880\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;241m=\u001b[39m \u001b[43mget_handle\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1881\u001b[0m \u001b[43m \u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1882\u001b[0m \u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1883\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1884\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompression\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcompression\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1885\u001b[0m \u001b[43m \u001b[49m\u001b[43mmemory_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmemory_map\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1886\u001b[0m \u001b[43m \u001b[49m\u001b[43mis_text\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_text\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1887\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding_errors\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstrict\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1888\u001b[0m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstorage_options\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1889\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1890\u001b[0m 
\u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1891\u001b[0m f \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles\u001b[38;5;241m.\u001b[39mhandle\n",
" .dataframe thead th {\n",
"File \u001b[0;32m~/.pyenv/versions/3.12.0/lib/python3.12/site-packages/pandas/io/common.py:873\u001b[0m, in \u001b[0;36mget_handle\u001b[0;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[1;32m 868\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(handle, \u001b[38;5;28mstr\u001b[39m):\n\u001b[1;32m 869\u001b[0m \u001b[38;5;66;03m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[1;32m 870\u001b[0m \u001b[38;5;66;03m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[1;32m 871\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mencoding \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mmode:\n\u001b[1;32m 872\u001b[0m \u001b[38;5;66;03m# Encoding\u001b[39;00m\n\u001b[0;32m--> 873\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[1;32m 874\u001b[0m \u001b[43m \u001b[49m\u001b[43mhandle\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 875\u001b[0m \u001b[43m \u001b[49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 876\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencoding\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 877\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 878\u001b[0m \u001b[43m \u001b[49m\u001b[43mnewline\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 879\u001b[0m \u001b[43m 
\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 880\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 881\u001b[0m \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[1;32m 882\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(handle, ioargs\u001b[38;5;241m.\u001b[39mmode)\n",
" text-align: right;\n",
"\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'data/test/content/movies.csv'"
" }\n",
]
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>n_character_title</th>\n",
" </tr>\n",
" <tr>\n",
" <th>movieId</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>4993</th>\n",
" <td>57</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5952</th>\n",
" <td>45</td>\n",
" </tr>\n",
" <tr>\n",
" <th>527</th>\n",
" <td>23</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2028</th>\n",
" <td>26</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4308</th>\n",
" <td>19</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" n_character_title\n",
"movieId \n",
"4993 57\n",
"5952 45\n",
"527 23\n",
"2028 26\n",
"4308 19"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"0 long\n",
"1 boring\n",
"2 long\n",
"3 romance\n",
"4 stupidity\n",
"Name: tag, dtype: object"
]
},
"metadata": {},
"output_type": "display_data"
}
}
],
],
"source": [
"source": [
...
@@ -169,7 +106,7 @@
...
@@ -169,7 +106,7 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count":
758
,
"execution_count":
null
,
"id": "16b0a602",
"id": "16b0a602",
"metadata": {},
"metadata": {},
"outputs": [
"outputs": [
...
@@ -193,697 +130,171 @@
...
@@ -193,697 +130,171 @@
}
}
],
],
"source": [
"source": [
"\n",
"# ContetnBased\n",
"class ContentBased(AlgoBase):\n",
"class ContentBased(AlgoBase):\n",
" def __init__(self, features_method, regressor_method):\n",
" def __init__(self, features_method, regressor_method):\n",
" AlgoBase.__init__(self)\n",
" AlgoBase.__init__(self)\n",
" self.regressor_method = regressor_method\n",
" self.regressor_method = regressor_method\n",
" self.features_methods = features_method\n",
" self.content_features = self.create_content_features(features_method)\n",
" self.content_features = self.create_content_features(features_method)\n",
" self.user_profile = {}\n",
" self.user_profile_explain = {}\n",
" self.user_profile_explain = {}\n",
"\n",
"\n",
" def create_content_features(self, features_method):\n",
" def create_content_features(self, features_method
s
):\n",
" \"\"\"Content Analyzer\"\"\"\n",
" \"\"\"Content Analyzer\"\"\"\n",
" df_items = load_items()\n",
" df_items = load_items()\n",
" df_ratings = load_ratings()\n",
" df_ratings = load_ratings()\n",
" df_tag =
df_tag =
pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME)\n",
" df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME)\n",
" df_genome_score = pd.read_csv(\"data/hackathon/content/genome-scores.csv\")\n",
" df_genome_score = pd.read_csv(\"data/hackathon/content/genome-scores.csv\")\n",
" df_genome_tag = pd.read_csv(\"data/hackathon/content/genome-tags.csv\")\n",
" df_genome_tag = pd.read_csv(\"data/hackathon/content/genome-tags.csv\")\n",
"\n",
"\n",
" if features_method is None:\n",
" df_features = pd.DataFrame(index=df_items.index)\n",
" df_features = None\n",
"\n",
" elif features_method == \"relevance\" :\n",
" df_features = df_genome_score.groupby('movieId')[\"relevance\"].transform('mean').to_frame('avg_relevance')\n",
"\n",
"\n",
" elif features_method == \"title_length\": # a naive method that creates only 1 feature based on title length\n",
" for method in features_methods:\n",
" df_features = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('n_character_title')\n",
" if method == \"title_length\":\n",
" df_title_length = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('title_length')\n",
" df_features = pd.concat([df_features, df_title_length], axis=1)\n",
" \n",
" \n",
" elif features_method == \"movie_year\" :\n",
" elif method == \"movie_year\":\n",
" df_features = df_items['movie_year'] = df_items['title'].str.extract(r'\\((\\d{4})\\)', expand=False).to_frame('movie_year')\n",
" df_movie_year = df_items['title'].str.extract(r'\\((\\d{4})\\)', expand=False).to_frame('movie_year')\n",
"\n",
" df_features = pd.concat([df_features, df_movie_year.astype(float).fillna(0)], axis=1)\n",
" elif features_method == \"genres\" :\n",
" genres_list = df_items['genres'].str.split('|').explode().unique()\n",
" for genre in genres_list:\n",
" df_features = df_items['genres'].str.contains(genre).astype(int).to_frame('genres')\n",
" \n",
" elif features_method == \"combination\": \n",
" df_length = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('n_character_title')\n",
" df_movie = df_items['title'].str.extract(r'\\((\\d{4})\\)', expand=False).to_frame('movie_year')\n",
" genres_list = df_items['genres'].str.split('|').explode().unique()\n",
" for genre in genres_list:\n",
" df_genre = df_items['genres'].str.contains(genre).astype(int).to_frame('genres')\n",
" \n",
" df_features = pd.concat([df_genre, df_length, df_movie], axis=1)\n",
" \n",
" elif features_method == \"rating\" :\n",
" df_features = df_ratings.groupby('movieId')['rating'].transform('mean').to_frame('avg_rating')\n",
"\n",
" elif features_method == \"tags\" :\n",
" df_features = df_tag['tag'].apply(lambda x: len(x.split(',')) if isinstance(x, str) else 0).to_frame('tags')\n",
"\n",
" elif features_method == \"tags_length\" :\n",
" \n",
" \n",
" df_features = df_tag['tag'].apply(lambda x: sum(len(tag) for tag in x.split(','))if isinstance(x, str) else 0).to_frame('n_character_tags')\n",
" elif method == \"genre\":\n",
" tfidf_vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split('|'), token_pattern=None)\n",
" tfidf_matrix = tfidf_vectorizer.fit_transform(df_items['genres'])\n",
" df_tfidf_genres = pd.DataFrame(tfidf_matrix.toarray(), index=df_items.index, columns=tfidf_vectorizer.get_feature_names_out())\n",
" df_features = pd.concat([df_features, df_tfidf_genres], axis=1)\n",
"\n",
"\n",
" elif method == \"avg_rating\":\n",
" df_avg_rating = df_ratings.groupby('movieId')['rating'].mean().to_frame('avg_rating')\n",
" df_features = df_features.join(df_avg_rating, on='movieId')\n",
"\n",
" else:\n",
" raise NotImplementedError(f'Feature method {method} not yet implemented')\n",
"\n",
" # Handle missing values in df_features\n",
" df_features.fillna(0, inplace=True)\n",
"\n",
"\n",
" else: # (implement other feature creations here)\n",
" raise NotImplementedError(f'Feature method {features_method} not yet implemented')\n",
" return df_features\n",
" return df_features\n",
" \n",
"\n",
"\n",
" def fit(self, trainset):\n",
" def fit(self, trainset):\n",
" \"\"\"Profile Learner\"\"\"\n",
" \"\"\"Profile Learner\"\"\"\n",
" AlgoBase.fit(self, trainset)\n",
" AlgoBase.fit(self, trainset)\n",
"
\n",
"\n",
" # Preallocate user profiles\n",
" # Preallocate user profiles\n",
" self.user_profile = {u: None for u in trainset.all_users()}\n",
" self.user_profile = {u: None for u in trainset.all_users()}\n",
" self.user_profile_explain = {}\n",
"\n",
"\n",
" self.user_profile_explain = {u: {} for u in trainset.all_users()}\n",
" epsilon = 1e-10 # Small value to prevent division by zero\n",
"\n",
" for u in self.user_profile_explain :\n",
" print(u)\n",
" user_ratings = np.array([rating for _, rating in trainset.ur[u]])\n",
"\n",
" feature_values = self.content_features.values\n",
"\n",
" fv = feature_values.astype(int)\n",
"\n",
" weighted_features = fv/np.linalg.norm(fv)\n",
"\n",
" feature_importance = weighted_features / np.sum(user_ratings)\n",
"\n",
"\n",
" self.user_profile_explain[u] = dict(zip(self.content_features.columns, feature_importance))\n",
" for u in trainset.all_users():\n",
" \n",
" raw_user_id = trainset.to_raw_uid(u)\n",
" self.user_profile_explain[raw_user_id] = {}\n",
"\n",
"\n",
" user_ratings = np.array([rating for (_, rating) in trainset.ur[u]])\n",
" item_ids = [iid for (iid, _) in trainset.ur[u]]\n",
" raw_item_ids = [trainset.to_raw_iid(iid) for iid in item_ids]\n",
"\n",
"\n",
" feature_values = self.content_features.loc[raw_item_ids].values\n",
" norms = np.linalg.norm(feature_values, axis=0) + epsilon\n",
" weighted_features = feature_values / norms\n",
" feature_importance = weighted_features.T @ user_ratings\n",
" feature_importance /= np.sum(user_ratings)\n",
"\n",
"\n",
" self.user_profile_explain[raw_user_id] = dict(zip(self.content_features.columns, feature_importance))\n",
"\n",
"\n",
" if self.regressor_method == 'random_score':\n",
" if self.regressor_method == 'random_score':\n",
" for u in self.user_profile :\n",
" self.user_profile[u] = rd.uniform(0.5,5)\n",
" \n",
" elif self.regressor_method == 'random_sample':\n",
" for u in self.user_profile:\n",
" for u in self.user_profile:\n",
" self.user_profile[u] =
[rating for _, rating in self.trainset.ur[u]]
\n",
" self.user_profile[u] =
rd.uniform(0.5, 5)
\n",
"\n",
"\n",
" elif self.regressor_method == '
linear_regression'
:\n",
" elif self.regressor_method == '
random_sample'
:\n",
" for u in self.user_profile:\n",
" for u in self.user_profile:\n",
" self.user_profile[u] = [rating for (_, rating) in trainset.ur[u]]\n",
"\n",
" else:\n",
" regressor_models = {\n",
" 'linear_regression': LinearRegression(fit_intercept=False),\n",
" 'svr_regression': SVR(kernel='rbf', C=10, epsilon=0.2),\n",
" 'gradient_boosting': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3),\n",
" 'random_forest': RandomForestRegressor(n_estimators=100),\n",
" 'lasso_regression': Lasso(alpha=0.1),\n",
" 'ridge_regression': Ridge(alpha=1.0),\n",
" 'elastic_net': ElasticNet(alpha=1.0, l1_ratio=0.5),\n",
" 'knn_regression': KNeighborsRegressor(n_neighbors=1),\n",
" 'decision_tree': DecisionTreeRegressor(max_depth=5),\n",
" 'adaboost': AdaBoostRegressor(n_estimators=50),\n",
" 'xgboost': XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3),\n",
" 'lightgbm': LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)\n",
" }\n",
"\n",
" if self.regressor_method not in regressor_models:\n",
" raise NotImplementedError(f'Regressor method {self.regressor_method} not yet implemented')\n",
"\n",
"\n",
" user_ratings = [rating for _, rating in trainset.ur[u]]\n",
" item_ids = [iid for iid, _ in trainset.ur[u]]\n",
"\n",
" df_user = pd.DataFrame({'item_id': item_ids, 'user_ratings': user_ratings})\n",
"\n",
" df_user[\"item_id\"] = df_user[\"item_id\"].map(trainset.to_raw_iid)\n",
"\n",
" df_user = df_user.merge(self.content_features, left_on = \"item_id\", right_index = True, how = 'left')\n",
" \n",
" if 'n_character_title' in df_user.columns:\n",
" # Si 'n_character_title' est disponible comme caractéristique\n",
" X = df_user['n_character_title'].values.reshape(-1, 1)\n",
"\n",
" elif 'avg_relevance' in df_user.columns:\n",
" # Si 'n_character_title' est disponible comme caractéristique\n",
" X = df_user['avg_relevance'].values.reshape(-1, 1)\n",
" \n",
" elif 'movie_year' in df_user.columns:\n",
" # Si 'n_character_title' est disponible comme caractéristique\n",
" X = df_user['movie_year'].values.reshape(-1, 1)\n",
" \n",
" elif 'genres' in df_user.columns:\n",
" # Si 'n_character_title' est disponible comme caractéristique\n",
" X = df_user['genres'].values.reshape(-1, 1)\n",
" \n",
" elif 'combination' in df_user.columns:\n",
" # Si 'n_character_title' est disponible comme caractéristique\n",
" X = df_user['combination'].values.reshape(-1, 1)\n",
" \n",
" elif 'avg_rating' in df_user.columns:\n",
" # Si 'n_character_title' est disponible comme caractéristique\n",
" X = df_user['avg_rating'].values.reshape(-1, 1)\n",
"\n",
" elif 'tags' in df_user.columns:\n",
" # Si une autre caractéristique est disponible (remplace 'other_feature' par le nom de ta caractéristique)\n",
" X = df_user['tags'].values.reshape(-1, 1)\n",
"\n",
" elif 'n_character_tags' in df_user.columns:\n",
" # Si une autre caractéristique est disponible (remplace 'other_feature' par le nom de ta caractéristique)\n",
" X = df_user['n_character_tags'].values.reshape(-1, 1)\n",
"\n",
" else:\n",
" # Si aucune caractéristique appropriée n'est disponible\n",
" continue # Ou gère le cas d'erreur/exception ici\n",
"\n",
" y = df_user['user_ratings'].values\n",
"\n",
" linear_regressor = LinearRegression(fit_intercept = False)\n",
"\n",
" linear_regressor.fit(X,y)\n",
" \n",
" # Store the computed user profile\n",
" self.user_profile[u] = linear_regressor\n",
"\n",
" elif self.regressor_method == 'svr_regression':\n",
" for u in self.user_profile:\n",
" for u in self.user_profile:\n",
" user_ratings = [rating for (_, rating) in trainset.ur[u]]\n",
" item_ids = [iid for (iid, _) in trainset.ur[u]]\n",
" raw_item_ids = [trainset.to_raw_iid(iid) for iid in item_ids]\n",
"\n",
"\n",
" user_ratings = [rating for _, rating in trainset.ur[u]]\n",
" df_user = pd.DataFrame({'item_id': raw_item_ids, 'user_ratings': user_ratings})\n",
" item_ids = [iid for iid, _ in trainset.ur[u]]\n",
" df_user = df_user.merge(self.content_features, left_on=\"item_id\", right_index=True, how='left')\n",
"\n",
" df_user = pd.DataFrame({'item_id': item_ids, 'user_ratings': user_ratings})\n",
"\n",
" df_user[\"item_id\"] = df_user[\"item_id\"].map(trainset.to_raw_iid)\n",
"\n",
" df_user = df_user.merge(self.content_features, left_on = \"item_id\", right_index = True, how = 'left')\n",
"\n",
" if 'n_character_title' in df_user.columns:\n",
" # Si 'n_character_title' est disponible comme caractéristique\n",
" X = df_user['n_character_title'].values.reshape(-1, 1)\n",
"\n",
" elif 'avg_relevance' in df_user.columns:\n",
" # Si 'n_character_title' est disponible comme caractéristique\n",
" X = df_user['avg_relevance'].values.reshape(-1, 1)\n",
" \n",
" elif 'movie_year' in df_user.columns:\n",
" # Si 'n_character_title' est disponible comme caractéristique\n",
" X = df_user['movie_year'].values.reshape(-1, 1)\n",
" \n",
" elif 'genres' in df_user.columns:\n",
" # Si 'n_character_title' est disponible comme caractéristique\n",
" X = df_user['genres'].values.reshape(-1, 1)\n",
" \n",
" elif 'combination' in df_user.columns:\n",
" # Si 'n_character_title' est disponible comme caractéristique\n",
" X = df_user['combination'].values.reshape(-1, 1)\n",
" \n",
" elif 'avg_rating' in df_user.columns:\n",
" # Si 'n_character_title' est disponible comme caractéristique\n",
" X = df_user['avg_rating'].values.reshape(-1, 1)\n",
"\n",
" elif 'tags' in df_user.columns:\n",
" # Si une autre caractéristique est disponible (remplace 'other_feature' par le nom de ta caractéristique)\n",
" X = df_user['tags'].values.reshape(-1, 1)\n",
"\n",
" elif 'n_character_tags' in df_user.columns:\n",
" # Si une autre caractéristique est disponible (remplace 'other_feature' par le nom de ta caractéristique)\n",
" X = df_user['n_character_tags'].values.reshape(-1, 1)\n",
"\n",
" else:\n",
" # Si aucune caractéristique appropriée n'est disponible\n",
" continue # Ou gère le cas d'erreur/exception ici\n",
" \n",
" y = df_user['user_ratings'].values\n",
" svr_regressor = SVR(kernel='rbf', C=10, epsilon=0.2)\n",
" svr_regressor.fit(X, y)\n",
" self.user_profile[u] = svr_regressor\n",
"\n",
" elif self.regressor_method == 'gradient_boosting':\n",
" for u in self.user_profile:\n",
"\n",
"\n",
" user_ratings = [rating for _, rating in trainset.ur[u]]\n",
" X = df_user.drop(columns=['item_id', 'user_ratings'])\n",
" item_ids = [iid for iid, _ in trainset.ur[u]]\n",
" y = df_user['user_ratings']\n",
"\n",
" df_user = pd.DataFrame({'item_id': item_ids, 'user_ratings': user_ratings})\n",
"\n",
" df_user[\"item_id\"] = df_user[\"item_id\"].map(trainset.to_raw_iid)\n",
"\n",
" df_user = df_user.merge(self.content_features, left_on = \"item_id\", right_index = True, how = 'left')\n",
"\n",
" if 'n_character_title' in df_user.columns:\n",
" # Si 'n_character_title' est disponible comme caractéristique\n",
" X = df_user['n_character_title'].values.reshape(-1, 1)\n",
"\n",
" elif 'avg_relevance' in df_user.columns:\n",
" # Si 'n_character_title' est disponible comme caractéristique\n",
" X = df_user['avg_relevance'].values.reshape(-1, 1)\n",
" \n",
" elif 'movie_year' in df_user.columns:\n",
" # Si 'n_character_title' est disponible comme caractéristique\n",
" X = df_user['movie_year'].values.reshape(-1, 1)\n",
" \n",
" elif 'genres' in df_user.columns:\n",
" # Si 'n_character_title' est disponible comme caractéristique\n",
" X = df_user['genres'].values.reshape(-1, 1)\n",
" \n",
" elif 'combination' in df_user.columns:\n",
" # Si 'n_character_title' est disponible comme caractéristique\n",
" X = df_user['combination'].values.reshape(-1, 1)\n",
" \n",
" elif 'avg_rating' in df_user.columns:\n",
" # Si 'n_character_title' est disponible comme caractéristique\n",
" X = df_user['avg_rating'].values.reshape(-1, 1)\n",
"\n",
" elif 'tags' in df_user.columns:\n",
" # Si une autre caractéristique est disponible (remplace 'other_feature' par le nom de ta caractéristique)\n",
" X = df_user['tags'].values.reshape(-1, 1)\n",
"\n",
" elif 'n_character_tags' in df_user.columns:\n",
" # Si une autre caractéristique est disponible (remplace 'other_feature' par le nom de ta caractéristique)\n",
" X = df_user['n_character_tags'].values.reshape(-1, 1)\n",
"\n",
" else:\n",
" # Si aucune caractéristique appropriée n'est disponible\n",
" continue # Ou gère le cas d'erreur/exception ici\n",
" \n",
" y = df_user['user_ratings'].values\n",
" gb_regressor = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)\n",
" gb_regressor.fit(X, y)\n",
" self.user_profile[u] = gb_regressor\n",
"\n",
"\n",
" regressor = regressor_models[self.regressor_method]\n",
" regressor.fit(X, y)\n",
"\n",
"\n",
" elif self.regressor_method == 'random_forest':\n",
" self.user_profile[u] = regressor\n",
" for u in self.user_profile:\n",
"\n",
"\n",
" user_ratings = [rating for _, rating in trainset.ur[u]]\n",
" item_ids = [iid for iid, _ in trainset.ur[u]]\n",
"\n",
" df_user = pd.DataFrame({'item_id': item_ids, 'user_ratings': user_ratings})\n",
"\n",
" df_user[\"item_id\"] = df_user[\"item_id\"].map(trainset.to_raw_iid)\n",
"\n",
" df_user = df_user.merge(self.content_features, left_on = \"item_id\", right_index = True, how = 'left')\n",
"\n",
" if 'n_character_title' in df_user.columns:\n",
" # Si 'n_character_title' est disponible comme caractéristique\n",
" X = df_user['n_character_title'].values.reshape(-1, 1)\n",
"\n",
" elif 'avg_relevance' in df_user.columns:\n",
" # Si 'n_character_title' est disponible comme caractéristique\n",
" X = df_user['avg_relevance'].values.reshape(-1, 1)\n",
" \n",
" elif 'movie_year' in df_user.columns:\n",
" # Si 'n_character_title' est disponible comme caractéristique\n",
" X = df_user['movie_year'].values.reshape(-1, 1)\n",
" \n",
" elif 'genres' in df_user.columns:\n",
" # Si 'n_character_title' est disponible comme caractéristique\n",
" X = df_user['genres'].values.reshape(-1, 1)\n",
" \n",
" elif 'combination' in df_user.columns:\n",
" # Si 'n_character_title' est disponible comme caractéristique\n",
" X = df_user['combination'].values.reshape(-1, 1)\n",
" \n",
" elif 'avg_rating' in df_user.columns:\n",
" # Si 'n_character_title' est disponible comme caractéristique\n",
" X = df_user['avg_rating'].values.reshape(-1, 1)\n",
"\n",
" elif 'tags' in df_user.columns:\n",
" # Si une autre caractéristique est disponible (remplace 'other_feature' par le nom de ta caractéristique)\n",
" X = df_user['tags'].values.reshape(-1, 1)\n",
"\n",
" elif 'n_character_tags' in df_user.columns:\n",
" # Si une autre caractéristique est disponible (remplace 'other_feature' par le nom de ta caractéristique)\n",
" X = df_user['n_character_tags'].values.reshape(-1, 1)\n",
"\n",
" else:\n",
" # Si aucune caractéristique appropriée n'est disponible\n",
" continue # Ou gère le cas d'erreur/exception ici\n",
"\n",
" y = df_user['user_ratings'].values\n",
" rf_regressor = RandomForestRegressor(n_estimators=100)\n",
" rf_regressor.fit(X, y)\n",
" self.user_profile[u] = rf_regressor\n",
"\n",
" else : \n",
" pass\n",
"\n",
" # (implement here the regressor fitting) \n",
" \n",
" def estimate(self, u, i):\n",
" def estimate(self, u, i):\n",
" \"\"\"Scoring component used for item filtering\"\"\"\n",
" \"\"\"Scoring component used for item filtering\"\"\"\n",
" # First, handle cases for unknown users and items\n",
" if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):\n",
" if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):\n",
" raise PredictionImpossible('User and/or item is unkown.')\n",
" raise PredictionImpossible('User and/or item is unknown.')\n",
"\n",
"\n",
"\n",
" if self.regressor_method == 'random_score':\n",
" if self.regressor_method == 'random_score':\n",
" rd.seed()\n",
" return rd.uniform(0.5, 5)\n",
" score = rd.uniform(0.5,5)\n",
"\n",
"\n",
" elif self.regressor_method == 'random_sample':\n",
" elif self.regressor_method == 'random_sample':\n",
" rd.seed()\n",
" return rd.choice(self.user_profile[u])\n",
" score = rd.choice(self.user_profile[u])\n",
" \n",
" elif self.regressor_method == 'linear_regression':\n",
"\n",
" raw_item_id = self.trainset.to_raw_iid(i)\n",
"\n",
" item_features = self.content_features.loc[raw_item_id:raw_item_id, :].values\n",
"\n",
" linear_regressor = self.user_profile[u]\n",
"\n",
" score= linear_regressor.predict(item_features)[0]\n",
" \n",
" elif self.regressor_method == 'svr_regression':\n",
"\n",
"\n",
" else:\n",
" raw_item_id = self.trainset.to_raw_iid(i)\n",
" raw_item_id = self.trainset.to_raw_iid(i)\n",
" item_features = self.content_features.loc[raw_item_id, :].values.reshape(1, -1)\n",
" regressor = self.user_profile[u]\n",
" item_features_df = pd.DataFrame(item_features, columns=self.content_features.columns)\n",
" return regressor.predict(item_features_df)[0]\n",
"\n",
"\n",
" item_features = self.content_features.loc[raw_item_id:raw_item_id, :].values\n",
" def explain(self, u):\n",
"\n",
" if u in self.user_profile_explain:\n",
" svr_regressor = self.user_profile[u]\n",
" score = svr_regressor.predict(item_features)[0]\n",
" \n",
" elif self.regressor_method == 'gradient_boosting':\n",
"\n",
" raw_item_id = self.trainset.to_raw_iid(i)\n",
"\n",
" item_features = self.content_features.loc[raw_item_id:raw_item_id, :].values\n",
"\n",
" gradient_boosting = self.user_profile[u]\n",
" score = gradient_boosting.predict(item_features)[0]\n",
" \n",
" elif self.regressor_method == 'random_forest':\n",
"\n",
" raw_item_id = self.trainset.to_raw_iid(i)\n",
"\n",
" item_features = self.content_features.loc[raw_item_id:raw_item_id, :].values\n",
"\n",
" randomforest = self.user_profile[u]\n",
" score = randomforest.predict(item_features)[0]\n",
" \n",
" else : \n",
" score = None\n",
"\n",
" # (implement here the regressor prediction)\n",
"\n",
" return score\n",
"\n",
" def explain(self, u) : \n",
" if u in self.user_profile_explain :\n",
" return self.user_profile_explain[u]\n",
" return self.user_profile_explain[u]\n",
" else
:\n",
" else:\n",
" return None\n",
" return None\n",
"\n",
"\n",
"\n",
"\n",
"cb = ContentBased(\"title_length\", \"random_sample\")\n",
"#Example usage:\n",
"sp_ratings = load_ratings(surprise_format=True)\n",
"cb = ContentBased([\"title_length\", \"movie_year\",\"genre\",\"avg_rating\"], \"ridge_regression\")\n",
"train_set = sp_ratings.build_full_trainset()\n",
"surprise_data = load_ratings(surprise_format=True)\n",
"print(cb.fit(train_set))\n",
"trainset = surprise_data.build_full_trainset()\n",
"testset = trainset.build_anti_testset()\n",
"cb.fit(trainset)\n",
"\n",
"\n",
"print(cb.explain(0))\n",
"\n",
"\n",
"print(
cb.explain(1
))\n",
"
#
print(
\"RMSE: \", cb.rmse(testset
))\n",
"\n",
"\n",
"print(cb.explain(2))\n",
"\n",
"\n",
"print(cb.explain(3))\n",
"#Example explanations for users:\n",
"print(cb.explain(11))\n",
"\n",
"\n",
"print(cb.explain(4))\n"
"print(cb.explain(13))\n",
]
},
{
"cell_type": "code",
"execution_count": 759,
"id": "baab88b7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Matrice TF-IDF des genres :\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>action</th>\n",
" <th>adventure</th>\n",
" <th>animation</th>\n",
" <th>children</th>\n",
" <th>comedy</th>\n",
" <th>drama</th>\n",
" <th>fantasy</th>\n",
" <th>fi</th>\n",
" <th>horror</th>\n",
" <th>imax</th>\n",
" <th>musical</th>\n",
" <th>mystery</th>\n",
" <th>romance</th>\n",
" <th>sci</th>\n",
" <th>war</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.000000</td>\n",
" <td>0.658454</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.752621</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.000000</td>\n",
" <td>0.658454</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.752621</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.572658</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.819795</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.694164</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.412209</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.590102</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.465343</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.582818</td>\n",
" <td>0.000000</td>\n",
" <td>0.666168</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.572658</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.819795</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.647689</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.761905</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.606043</td>\n",
" <td>0.515192</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.606043</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.563507</td>\n",
" <td>0.000000</td>\n",
" <td>0.662879</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.493002</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>0.000000</td>\n",
" <td>0.363703</td>\n",
" <td>0.415716</td>\n",
" <td>0.489026</td>\n",
" <td>0.000000</td>\n",
" <td>0.290394</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.489026</td>\n",
" <td>0.363703</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" action adventure animation children comedy drama fantasy \\\n",
"0 0.000000 0.658454 0.000000 0.000000 0.000000 0.000000 0.752621 \n",
"1 0.000000 0.658454 0.000000 0.000000 0.000000 0.000000 0.752621 \n",
"2 0.000000 0.000000 0.000000 0.000000 0.000000 0.572658 0.000000 \n",
"3 0.694164 0.000000 0.000000 0.000000 0.000000 0.412209 0.000000 \n",
"4 0.000000 0.000000 0.000000 0.000000 0.000000 0.465343 0.000000 \n",
"5 0.000000 0.000000 0.000000 0.000000 0.000000 0.572658 0.000000 \n",
"6 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
"7 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
"8 0.000000 0.000000 0.563507 0.000000 0.662879 0.000000 0.000000 \n",
"9 0.000000 0.363703 0.415716 0.489026 0.000000 0.290394 0.000000 \n",
"\n",
" fi horror imax musical mystery romance sci \\\n",
"0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
"1 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
"2 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
"3 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
"4 0.000000 0.000000 0.000000 0.582818 0.000000 0.666168 0.000000 \n",
"5 0.000000 0.000000 0.000000 0.000000 0.000000 0.819795 0.000000 \n",
"6 0.000000 0.647689 0.000000 0.000000 0.761905 0.000000 0.000000 \n",
"7 0.606043 0.515192 0.000000 0.000000 0.000000 0.000000 0.606043 \n",
"8 0.000000 0.000000 0.000000 0.493002 0.000000 0.000000 0.000000 \n",
"9 0.000000 0.000000 0.489026 0.363703 0.000000 0.000000 0.000000 \n",
"\n",
" war \n",
"0 0.000000 \n",
"1 0.000000 \n",
"2 0.819795 \n",
"3 0.590102 \n",
"4 0.000000 \n",
"5 0.000000 \n",
"6 0.000000 \n",
"7 0.000000 \n",
"8 0.000000 \n",
"9 0.000000 "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from pprint import pprint\n",
"\n",
"\n",
"# Créer une instance de TfidfVectorizer pour les genres\n",
"print(cb.explain(17))\n",
"tfidf_vectorizer = TfidfVectorizer()\n",
"\n",
"\n",
"# Fit et transform pour calculer la matrice TF-IDF des genres\n",
"print(cb.explain(23))\n",
"tfidf_matrix = tfidf_vectorizer.fit_transform(df_items['genres'])\n",
"\n",
"\n",
"# Obtenir les noms des genres (features)\n",
"print(cb.explain(27))\n",
"genre_names = tfidf_vectorizer.get_feature_names_out()\n",
"\n",
"\n",
"# Créer un DataFrame à partir de la matrice TF-IDF des genres\n",
"print(cb.explain(73))\n"
"df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=genre_names)\n",
"\n",
"print(\"Matrice TF-IDF des genres :\")\n",
"display(df_tfidf)"
]
]
},
},
{
{
...
@@ -896,7 +307,7 @@
...
@@ -896,7 +307,7 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count":
760
,
"execution_count":
null
,
"id": "69d12f7d",
"id": "69d12f7d",
"metadata": {},
"metadata": {},
"outputs": [],
"outputs": [],
...
@@ -913,72 +324,7 @@
...
@@ -913,72 +324,7 @@
" prediction = content_algo.predict(anti_test_set_first[0], anti_test_set_first[1])\n",
" prediction = content_algo.predict(anti_test_set_first[0], anti_test_set_first[1])\n",
" print(prediction)\n",
" print(prediction)\n",
"\n",
"\n",
"\n",
"test_contentbased_class([\"title_length\", \"movie_year\",\"genre\",\"avg_rating\"], \"ridge_regression\")"
"\n",
"# print(\"title_length :\")\n",
"# test_contentbased_class(feature_method = \"title_length\" , regressor_method = \"random_score\")\n",
"# test_contentbased_class(feature_method = \"title_length\" , regressor_method = \"random_sample\")\n",
"# test_contentbased_class(feature_method = \"title_length\" , regressor_method = \"linear_regression\")\n",
"# test_contentbased_class(feature_method= \"title_length\", regressor_method= \"svr_regression\")\n",
"# test_contentbased_class(feature_method= \"title_length\", regressor_method= \"gradient_boosting\")\n",
"# test_contentbased_class(feature_method= \"title_length\", regressor_method= \"random_forest\")\n",
"# print(\"\\n\")\n",
"# print(\"movie_year : \")\n",
"# test_contentbased_class(feature_method= \"movie_year\", regressor_method= \"random_score\")\n",
"# test_contentbased_class(feature_method= \"movie_year\", regressor_method= \"random_sample\")\n",
"# test_contentbased_class(feature_method= \"movie_year\", regressor_method= \"linear_regression\")\n",
"# test_contentbased_class(feature_method= \"movie_year\", regressor_method= \"svr_regression\")\n",
"# test_contentbased_class(feature_method= \"movie_year\", regressor_method= \"gradient_boosting\")\n",
"# test_contentbased_class(feature_method= \"movie_year\", regressor_method= \"random_forest\")\n",
"# print(\"\\n\")\n",
"# print(\"relevance : \") \n",
"# test_contentbased_class(feature_method= \"relevance\", regressor_method= \"random_score\")\n",
"# test_contentbased_class(feature_method= \"relevance\", regressor_method= \"random_sample\")\n",
"# test_contentbased_class(feature_method= \"relevance\", regressor_method= \"linear_regression\")\n",
"# test_contentbased_class(feature_method= \"relevance\", regressor_method= \"svr_regression\")\n",
"# test_contentbased_class(feature_method= \"relevance\", regressor_method= \"gradient_boosting\")\n",
"# test_contentbased_class(feature_method= \"relevance\", regressor_method= \"random_forest\")\n",
"# print(\"\\n\")\n",
"# print(\"genres : \") \n",
"# test_contentbased_class(feature_method= \"genres\", regressor_method= \"random_score\")\n",
"# test_contentbased_class(feature_method= \"genres\", regressor_method= \"random_sample\")\n",
"# test_contentbased_class(feature_method= \"genres\", regressor_method= \"linear_regression\")\n",
"# test_contentbased_class(feature_method= \"genres\", regressor_method= \"svr_regression\")\n",
"# test_contentbased_class(feature_method= \"genres\", regressor_method= \"gradient_boosting\")\n",
"# test_contentbased_class(feature_method= \"genres\", regressor_method= \"random_forest\")\n",
"# print(\"\\n\")\n",
"# print(\"rating : \")\n",
"# test_contentbased_class(feature_method= \"rating\", regressor_method=\"random_score\")\n",
"# test_contentbased_class(feature_method= \"rating\", regressor_method=\"random_sample\")\n",
"# # test_contentbased_class(feature_method= \"rating\", regressor_method=\"linear_regression\")\n",
"# #test_contentbased_class(feature_method=\"rating\", regressor_method=\"svr_regression\")\n",
"# #test_contentbased_class(feature_method=\"rating\", regressor_method=\"gradient_boosting\")\n",
"# #test_contentbased_class(feature_method=\"rating\", regressor_method=\"random_forest\")\n",
"# print(\"\\n\")\n",
"# print(\"tags : \")\n",
"# test_contentbased_class(feature_method=\"tags\", regressor_method=\"random_score\")\n",
"# test_contentbased_class(feature_method=\"tags\", regressor_method=\"random_sample\")\n",
"# #test_contentbased_class(feature_method=\"tags\", regressor_method=\"linear_regression\")\n",
"# # test_contentbased_class(feature_method=\"tags\", regressor_method=\"svr_regression\")\n",
"# # test_contentbased_class(feature_method=\"tags\", regressor_method=\"gradient_boosting\")\n",
"# # test_contentbased_class(feature_method=\"tags\", regressor_method=\"random_forest\")\n",
"# print(\"\\n\")\n",
"# print(\"tags_length : \")\n",
"# test_contentbased_class(feature_method=\"tags_length\", regressor_method=\"random_score\")\n",
"# test_contentbased_class(feature_method=\"tags_length\", regressor_method=\"random_sample\")\n",
"# test_contentbased_class(feature_method=\"tags_length\", regressor_method=\"linear_regression\")\n",
"# test_contentbased_class(feature_method=\"tags_length\", regressor_method=\"svr_regression\")\n",
"# test_contentbased_class(feature_method=\"tags_length\", regressor_method=\"gradient_boosting\")\n",
"# test_contentbased_class(feature_method=\"tags_length\", regressor_method=\"random_forest\")\n",
"\n",
"# print(\"\\n\")\n",
"# print(\"combination : \")\n",
"# test_contentbased_class(feature_method=\"combination\", regressor_method=\"random_score\")\n",
"# test_contentbased_class(feature_method=\"combination\", regressor_method=\"random_sample\")\n",
"# test_contentbased_class(feature_method=\"combination\", regressor_method=\"linear_regression\")\n",
"# test_contentbased_class(feature_method=\"combination\", regressor_method=\"svr_regression\")\n",
"# test_contentbased_class(feature_method=\"combination\", regressor_method=\"gradient_boosting\")\n",
"# test_contentbased_class(feature_method=\"combination\", regressor_method=\"random_forest\")\n"
]
]
}
}
],
],
...
@@ -998,7 +344,7 @@
...
@@ -998,7 +344,7 @@
"name": "python",
"name": "python",
"nbconvert_exporter": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"pygments_lexer": "ipython3",
"version": "3.12.
2
"
"version": "3.12.
0
"
}
}
},
},
"nbformat": 4,
"nbformat": 4,
...
...
%% Cell type:markdown id:82d5ca82 tags:
%% Cell type:markdown id:82d5ca82 tags:
# Packages
# Packages
%% Cell type:code id:277473a3 tags:
%% Cell type:code id:277473a3 tags:
```
python
```
python
%
load_ext
autoreload
%
load_ext
autoreload
%
autoreload
2
%
autoreload
2
import
numpy
as
np
import
numpy
as
np
import
pandas
as
pd
import
pandas
as
pd
import
random
as
rd
import
random
as
rd
from
surprise
import
AlgoBase
from
surprise
import
AlgoBase
from
surprise.prediction_algorithms.predictions
import
PredictionImpossible
from
surprise.prediction_algorithms.predictions
import
PredictionImpossible
from
loaders
import
load_ratings
from
loaders
import
load_ratings
from
loaders
import
load_items
from
loaders
import
load_items
from
constants
import
Constant
as
C
from
constants
import
Constant
as
C
from
sklearn.linear_model
import
LinearRegression
from
sklearn.linear_model
import
LinearRegression
from
sklearn.ensemble
import
GradientBoostingRegressor
,
RandomForestRegressor
from
sklearn.ensemble
import
GradientBoostingRegressor
,
RandomForestRegressor
from
sklearn.svm
import
SVR
from
sklearn.svm
import
SVR
from
sklearn.feature_extraction.text
import
TfidfVectorizer
from
sklearn.feature_extraction.text
import
TfidfVectorizer
```
```
%% Output
%% Output
The autoreload extension is already loaded. To reload it, use:
The autoreload extension is already loaded. To reload it, use:
%reload_ext autoreload
%reload_ext autoreload
%% Cell type:markdown id:a42c16bf tags:
%% Cell type:markdown id:a42c16bf tags:
# Explore and select content features
# Explore and select content features
%% Cell type:code id:e8378976 tags:
%% Cell type:code id:e8378976 tags:
```
python
```
python
# All the dataframes
# All the dataframes
df_items
=
load_items
()
df_items
=
load_items
()
df_ratings
=
load_ratings
()
df_ratings
=
load_ratings
()
df_tag
=
pd
.
read_csv
(
C
.
CONTENT_PATH
/
C
.
TAGS_FILENAME
)
df_tag
=
pd
.
read_csv
(
C
.
CONTENT_PATH
/
C
.
TAGS_FILENAME
)
#df_genome_score = pd.read_csv("data/hackathon/content/genome-scores.csv")
#df_genome_score = pd.read_csv("data/hackathon/content/genome-scores.csv")
# df_genome_tag = pd.read_csv("data/hackathon/content/genome-tags.csv")
# df_genome_tag = pd.read_csv("data/hackathon/content/genome-tags.csv")
# Example 1 : create title_length features
# Example 1 : create title_length features
df_features
=
df_items
[
C
.
LABEL_COL
].
apply
(
lambda
x
:
len
(
x
)).
to_frame
(
'
n_character_title
'
)
df_features
=
df_items
[
C
.
LABEL_COL
].
apply
(
lambda
x
:
len
(
x
)).
to_frame
(
'
n_character_title
'
)
display
(
df_features
.
head
())
display
(
df_features
.
head
())
df_tag
=
pd
.
read_csv
(
C
.
CONTENT_PATH
/
C
.
TAGS_FILENAME
)
df_tag
=
pd
.
read_csv
(
C
.
CONTENT_PATH
/
C
.
TAGS_FILENAME
)
df_features
=
df_tag
[
C
.
TAG
]
df_features
=
df_tag
[
C
.
TAG
]
display
(
df_features
.
head
())
display
(
df_features
.
head
())
# (explore here other features)
# (explore here other features)
```
```
%% Output
%% Output
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
Cell In[16], line 2
1 # All the dataframes
----> 2 df_items = load_items()
3 df_ratings = load_ratings()
4 df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME)
File ~/Desktop/Université/Recommender Systems/recomsys/loaders.py:34, in load_items()
28 def load_items():
29 """Loads items data.
30
31 Returns:
32 DataFrame: Items data.
33 """
---> 34 df_items = pd.read_csv(C.CONTENT_PATH / C.ITEMS_FILENAME) # ce qui se trouve dans le movie csv
35 df_items = df_items.set_index(C.ITEM_ID_COL) # movie id
36 return df_items
File ~/.pyenv/versions/3.12.0/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1026, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)
1013 kwds_defaults = _refine_defaults_read(
1014 dialect,
1015 delimiter,
(...)
1022 dtype_backend=dtype_backend,
1023 )
1024 kwds.update(kwds_defaults)
-> 1026 return _read(filepath_or_buffer, kwds)
File ~/.pyenv/versions/3.12.0/lib/python3.12/site-packages/pandas/io/parsers/readers.py:620, in _read(filepath_or_buffer, kwds)
617 _validate_names(kwds.get("names", None))
619 # Create the parser.
--> 620 parser = TextFileReader(filepath_or_buffer,
**
kwds)
622 if chunksize or iterator:
623 return parser
File ~/.pyenv/versions/3.12.0/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1620, in TextFileReader.__init__(self, f, engine,
**
kwds)
1617 self.options["has_index_names"] = kwds["has_index_names"]
1619 self.handles: IOHandles | None = None
-> 1620 self._engine = self._make_engine(f, self.engine)
File ~/.pyenv/versions/3.12.0/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1880, in TextFileReader._make_engine(self, f, engine)
1878 if "b" not in mode:
1879 mode += "b"
-> 1880 self.handles = get_handle(
1881 f,
1882 mode,
1883 encoding=self.options.get("encoding", None),
1884 compression=self.options.get("compression", None),
1885 memory_map=self.options.get("memory_map", False),
1886 is_text=is_text,
1887 errors=self.options.get("encoding_errors", "strict"),
1888 storage_options=self.options.get("storage_options", None),
1889 )
1890 assert self.handles is not None
1891 f = self.handles.handle
File ~/.pyenv/versions/3.12.0/lib/python3.12/site-packages/pandas/io/common.py:873, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
868 elif isinstance(handle, str):
869 # Check whether the filename is to be opened in binary mode.
870 # Binary mode does not support 'encoding' and 'newline'.
871 if ioargs.encoding and "b" not in ioargs.mode:
872 # Encoding
--> 873 handle = open(
874 handle,
875 ioargs.mode,
876 encoding=ioargs.encoding,
877 errors=errors,
878 newline="",
879 )
880 else:
881 # Binary mode
882 handle = open(handle, ioargs.mode)
FileNotFoundError: [Errno 2] No such file or directory: 'data/test/content/movies.csv'
%% Cell type:markdown id:a2c9a2b6 tags:
%% Cell type:markdown id:a2c9a2b6 tags:
# Build a content-based model
# Build a content-based model
When ready, move the following class in the
*models.py*
script
When ready, move the following class in the
*models.py*
script
%% Cell type:code id:16b0a602 tags:
%% Cell type:code id:16b0a602 tags:
```
python
```
python
# ContetnBased
class
ContentBased
(
AlgoBase
):
class
ContentBased
(
AlgoBase
):
def
__init__
(
self
,
features_method
,
regressor_method
):
def
__init__
(
self
,
features_method
,
regressor_method
):
AlgoBase
.
__init__
(
self
)
AlgoBase
.
__init__
(
self
)
self
.
regressor_method
=
regressor_method
self
.
regressor_method
=
regressor_method
self
.
features_methods
=
features_method
self
.
content_features
=
self
.
create_content_features
(
features_method
)
self
.
content_features
=
self
.
create_content_features
(
features_method
)
self
.
user_profile
=
{}
self
.
user_profile_explain
=
{}
self
.
user_profile_explain
=
{}
def
create_content_features
(
self
,
features_method
):
def
create_content_features
(
self
,
features_method
s
):
"""
Content Analyzer
"""
"""
Content Analyzer
"""
df_items
=
load_items
()
df_items
=
load_items
()
df_ratings
=
load_ratings
()
df_ratings
=
load_ratings
()
df_tag
=
df_tag
=
pd
.
read_csv
(
C
.
CONTENT_PATH
/
C
.
TAGS_FILENAME
)
df_tag
=
pd
.
read_csv
(
C
.
CONTENT_PATH
/
C
.
TAGS_FILENAME
)
df_genome_score
=
pd
.
read_csv
(
"
data/hackathon/content/genome-scores.csv
"
)
df_genome_score
=
pd
.
read_csv
(
"
data/hackathon/content/genome-scores.csv
"
)
df_genome_tag
=
pd
.
read_csv
(
"
data/hackathon/content/genome-tags.csv
"
)
df_genome_tag
=
pd
.
read_csv
(
"
data/hackathon/content/genome-tags.csv
"
)
if
features_method
is
None
:
df_features
=
pd
.
DataFrame
(
index
=
df_items
.
index
)
df_features
=
None
elif
features_method
==
"
relevance
"
:
df_features
=
df_genome_score
.
groupby
(
'
movieId
'
)[
"
relevance
"
].
transform
(
'
mean
'
).
to_frame
(
'
avg_relevance
'
)
elif
features_method
==
"
title_length
"
:
# a naive method that creates only 1 feature based on title length
df_features
=
df_items
[
C
.
LABEL_COL
].
apply
(
lambda
x
:
len
(
x
)).
to_frame
(
'
n_character_title
'
)
elif
features_method
==
"
movie_year
"
:
df_features
=
df_items
[
'
movie_year
'
]
=
df_items
[
'
title
'
].
str
.
extract
(
r
'
\((\d{4})\)
'
,
expand
=
False
).
to_frame
(
'
movie_year
'
)
elif
features_method
==
"
genres
"
:
genres_list
=
df_items
[
'
genres
'
].
str
.
split
(
'
|
'
).
explode
().
unique
()
for
genre
in
genres_list
:
df_features
=
df_items
[
'
genres
'
].
str
.
contains
(
genre
).
astype
(
int
).
to_frame
(
'
genres
'
)
elif
features_method
==
"
combination
"
:
df_length
=
df_items
[
C
.
LABEL_COL
].
apply
(
lambda
x
:
len
(
x
)).
to_frame
(
'
n_character_title
'
)
df_movie
=
df_items
[
'
title
'
].
str
.
extract
(
r
'
\((\d{4})\)
'
,
expand
=
False
).
to_frame
(
'
movie_year
'
)
genres_list
=
df_items
[
'
genres
'
].
str
.
split
(
'
|
'
).
explode
().
unique
()
for
genre
in
genres_list
:
df_genre
=
df_items
[
'
genres
'
].
str
.
contains
(
genre
).
astype
(
int
).
to_frame
(
'
genres
'
)
df_features
=
pd
.
concat
([
df_genre
,
df_length
,
df_movie
],
axis
=
1
)
elif
features_method
==
"
rating
"
:
for
method
in
features_methods
:
df_features
=
df_ratings
.
groupby
(
'
movieId
'
)[
'
rating
'
].
transform
(
'
mean
'
).
to_frame
(
'
avg_rating
'
)
if
method
==
"
title_length
"
:
df_title_length
=
df_items
[
C
.
LABEL_COL
].
apply
(
lambda
x
:
len
(
x
)).
to_frame
(
'
title_length
'
)
df_features
=
pd
.
concat
([
df_features
,
df_title_length
],
axis
=
1
)
elif
method
==
"
movie_year
"
:
df_movie_year
=
df_items
[
'
title
'
].
str
.
extract
(
r
'
\((\d{4})\)
'
,
expand
=
False
).
to_frame
(
'
movie_year
'
)
df_features
=
pd
.
concat
([
df_features
,
df_movie_year
.
astype
(
float
).
fillna
(
0
)],
axis
=
1
)
elif
method
==
"
genre
"
:
tfidf_vectorizer
=
TfidfVectorizer
(
tokenizer
=
lambda
x
:
x
.
split
(
'
|
'
),
token_pattern
=
None
)
tfidf_matrix
=
tfidf_vectorizer
.
fit_transform
(
df_items
[
'
genres
'
])
df_tfidf_genres
=
pd
.
DataFrame
(
tfidf_matrix
.
toarray
(),
index
=
df_items
.
index
,
columns
=
tfidf_vectorizer
.
get_feature_names_out
())
df_features
=
pd
.
concat
([
df_features
,
df_tfidf_genres
],
axis
=
1
)
elif
method
==
"
avg_rating
"
:
df_avg_rating
=
df_ratings
.
groupby
(
'
movieId
'
)[
'
rating
'
].
mean
().
to_frame
(
'
avg_rating
'
)
df_features
=
df_features
.
join
(
df_avg_rating
,
on
=
'
movieId
'
)
elif
features_method
==
"
tags
"
:
else
:
df_features
=
df_tag
[
'
tag
'
].
apply
(
lambda
x
:
len
(
x
.
split
(
'
,
'
))
if
isinstance
(
x
,
str
)
else
0
).
to_frame
(
'
tags
'
)
raise
NotImplementedError
(
f
'
Feature method
{
method
}
not yet implemented
'
)
elif
features_method
==
"
tags_length
"
:
# Handle missing values in df_features
df_features
.
fillna
(
0
,
inplace
=
True
)
df_features
=
df_tag
[
'
tag
'
].
apply
(
lambda
x
:
sum
(
len
(
tag
)
for
tag
in
x
.
split
(
'
,
'
))
if
isinstance
(
x
,
str
)
else
0
).
to_frame
(
'
n_character_tags
'
)
else
:
# (implement other feature creations here)
raise
NotImplementedError
(
f
'
Feature method
{
features_method
}
not yet implemented
'
)
return
df_features
return
df_features
def
fit
(
self
,
trainset
):
def
fit
(
self
,
trainset
):
"""
Profile Learner
"""
"""
Profile Learner
"""
AlgoBase
.
fit
(
self
,
trainset
)
AlgoBase
.
fit
(
self
,
trainset
)
# Preallocate user profiles
# Preallocate user profiles
self
.
user_profile
=
{
u
:
None
for
u
in
trainset
.
all_users
()}
self
.
user_profile
=
{
u
:
None
for
u
in
trainset
.
all_users
()}
self
.
user_profile_explain
=
{}
self
.
user_profile_explain
=
{
u
:
{}
for
u
in
trainset
.
all_users
()}
epsilon
=
1e-10
# Small value to prevent division by zero
for
u
in
self
.
user_profile_explain
:
print
(
u
)
user_ratings
=
np
.
array
([
rating
for
_
,
rating
in
trainset
.
ur
[
u
]])
feature_values
=
self
.
content_features
.
values
fv
=
feature_values
.
astype
(
int
)
weighted_features
=
fv
/
np
.
linalg
.
norm
(
fv
)
feature_importance
=
weighted_features
/
np
.
sum
(
user_ratings
)
self
.
user_profile_explain
[
u
]
=
dict
(
zip
(
self
.
content_features
.
columns
,
feature_importance
))
for
u
in
trainset
.
all_users
():
raw_user_id
=
trainset
.
to_raw_uid
(
u
)
self
.
user_profile_explain
[
raw_user_id
]
=
{}
user_ratings
=
np
.
array
([
rating
for
(
_
,
rating
)
in
trainset
.
ur
[
u
]])
item_ids
=
[
iid
for
(
iid
,
_
)
in
trainset
.
ur
[
u
]]
raw_item_ids
=
[
trainset
.
to_raw_iid
(
iid
)
for
iid
in
item_ids
]
feature_values
=
self
.
content_features
.
loc
[
raw_item_ids
].
values
norms
=
np
.
linalg
.
norm
(
feature_values
,
axis
=
0
)
+
epsilon
weighted_features
=
feature_values
/
norms
feature_importance
=
weighted_features
.
T
@
user_ratings
feature_importance
/=
np
.
sum
(
user_ratings
)
self
.
user_profile_explain
[
raw_user_id
]
=
dict
(
zip
(
self
.
content_features
.
columns
,
feature_importance
))
if
self
.
regressor_method
==
'
random_score
'
:
if
self
.
regressor_method
==
'
random_score
'
:
for
u
in
self
.
user_profile
:
self
.
user_profile
[
u
]
=
rd
.
uniform
(
0.5
,
5
)
elif
self
.
regressor_method
==
'
random_sample
'
:
for
u
in
self
.
user_profile
:
self
.
user_profile
[
u
]
=
[
rating
for
_
,
rating
in
self
.
trainset
.
ur
[
u
]]
elif
self
.
regressor_method
==
'
linear_regression
'
:
for
u
in
self
.
user_profile
:
for
u
in
self
.
user_profile
:
self
.
user_profile
[
u
]
=
rd
.
uniform
(
0.5
,
5
)
user_ratings
=
[
rating
for
_
,
rating
in
trainset
.
ur
[
u
]]
elif
self
.
regressor_method
==
'
random_sample
'
:
item_ids
=
[
iid
for
iid
,
_
in
trainset
.
ur
[
u
]]
df_user
=
pd
.
DataFrame
({
'
item_id
'
:
item_ids
,
'
user_ratings
'
:
user_ratings
})
df_user
[
"
item_id
"
]
=
df_user
[
"
item_id
"
].
map
(
trainset
.
to_raw_iid
)
df_user
=
df_user
.
merge
(
self
.
content_features
,
left_on
=
"
item_id
"
,
right_index
=
True
,
how
=
'
left
'
)
if
'
n_character_title
'
in
df_user
.
columns
:
# Si 'n_character_title' est disponible comme caractéristique
X
=
df_user
[
'
n_character_title
'
].
values
.
reshape
(
-
1
,
1
)
elif
'
avg_relevance
'
in
df_user
.
columns
:
# Si 'n_character_title' est disponible comme caractéristique
X
=
df_user
[
'
avg_relevance
'
].
values
.
reshape
(
-
1
,
1
)
elif
'
movie_year
'
in
df_user
.
columns
:
# Si 'n_character_title' est disponible comme caractéristique
X
=
df_user
[
'
movie_year
'
].
values
.
reshape
(
-
1
,
1
)
elif
'
genres
'
in
df_user
.
columns
:
# Si 'n_character_title' est disponible comme caractéristique
X
=
df_user
[
'
genres
'
].
values
.
reshape
(
-
1
,
1
)
elif
'
combination
'
in
df_user
.
columns
:
# Si 'n_character_title' est disponible comme caractéristique
X
=
df_user
[
'
combination
'
].
values
.
reshape
(
-
1
,
1
)
elif
'
avg_rating
'
in
df_user
.
columns
:
# Si 'n_character_title' est disponible comme caractéristique
X
=
df_user
[
'
avg_rating
'
].
values
.
reshape
(
-
1
,
1
)
elif
'
tags
'
in
df_user
.
columns
:
# Si une autre caractéristique est disponible (remplace 'other_feature' par le nom de ta caractéristique)
X
=
df_user
[
'
tags
'
].
values
.
reshape
(
-
1
,
1
)
elif
'
n_character_tags
'
in
df_user
.
columns
:
# Si une autre caractéristique est disponible (remplace 'other_feature' par le nom de ta caractéristique)
X
=
df_user
[
'
n_character_tags
'
].
values
.
reshape
(
-
1
,
1
)
else
:
# Si aucune caractéristique appropriée n'est disponible
continue
# Ou gère le cas d'erreur/exception ici
y
=
df_user
[
'
user_ratings
'
].
values
linear_regressor
=
LinearRegression
(
fit_intercept
=
False
)
linear_regressor
.
fit
(
X
,
y
)
# Store the computed user profile
self
.
user_profile
[
u
]
=
linear_regressor
elif
self
.
regressor_method
==
'
svr_regression
'
:
for
u
in
self
.
user_profile
:
for
u
in
self
.
user_profile
:
self
.
user_profile
[
u
]
=
[
rating
for
(
_
,
rating
)
in
trainset
.
ur
[
u
]]
user_ratings
=
[
rating
for
_
,
rating
in
trainset
.
ur
[
u
]]
else
:
item_ids
=
[
iid
for
iid
,
_
in
trainset
.
ur
[
u
]]
regressor_models
=
{
'
linear_regression
'
:
LinearRegression
(
fit_intercept
=
False
),
df_user
=
pd
.
DataFrame
({
'
item_id
'
:
item_ids
,
'
user_ratings
'
:
user_ratings
})
'
svr_regression
'
:
SVR
(
kernel
=
'
rbf
'
,
C
=
10
,
epsilon
=
0.2
),
'
gradient_boosting
'
:
GradientBoostingRegressor
(
n_estimators
=
100
,
learning_rate
=
0.1
,
max_depth
=
3
),
df_user
[
"
item_id
"
]
=
df_user
[
"
item_id
"
].
map
(
trainset
.
to_raw_iid
)
'
random_forest
'
:
RandomForestRegressor
(
n_estimators
=
100
),
'
lasso_regression
'
:
Lasso
(
alpha
=
0.1
),
df_user
=
df_user
.
merge
(
self
.
content_features
,
left_on
=
"
item_id
"
,
right_index
=
True
,
how
=
'
left
'
)
'
ridge_regression
'
:
Ridge
(
alpha
=
1.0
),
'
elastic_net
'
:
ElasticNet
(
alpha
=
1.0
,
l1_ratio
=
0.5
),
if
'
n_character_title
'
in
df_user
.
columns
:
'
knn_regression
'
:
KNeighborsRegressor
(
n_neighbors
=
1
),
# Si 'n_character_title' est disponible comme caractéristique
'
decision_tree
'
:
DecisionTreeRegressor
(
max_depth
=
5
),
X
=
df_user
[
'
n_character_title
'
].
values
.
reshape
(
-
1
,
1
)
'
adaboost
'
:
AdaBoostRegressor
(
n_estimators
=
50
),
'
xgboost
'
:
XGBRegressor
(
n_estimators
=
100
,
learning_rate
=
0.1
,
max_depth
=
3
),
'
lightgbm
'
:
LGBMRegressor
(
n_estimators
=
100
,
learning_rate
=
0.1
,
max_depth
=
3
)
}
elif
'
avg_relevance
'
in
df_user
.
columns
:
if
self
.
regressor_method
not
in
regressor_models
:
# Si 'n_character_title' est disponible comme caractéristique
raise
NotImplementedError
(
f
'
Regressor method
{
self
.
regressor_method
}
not yet implemented
'
)
X
=
df_user
[
'
avg_relevance
'
].
values
.
reshape
(
-
1
,
1
)
elif
'
movie_year
'
in
df_user
.
columns
:
# Si 'n_character_title' est disponible comme caractéristique
X
=
df_user
[
'
movie_year
'
].
values
.
reshape
(
-
1
,
1
)
elif
'
genres
'
in
df_user
.
columns
:
# Si 'n_character_title' est disponible comme caractéristique
X
=
df_user
[
'
genres
'
].
values
.
reshape
(
-
1
,
1
)
elif
'
combination
'
in
df_user
.
columns
:
# Si 'n_character_title' est disponible comme caractéristique
X
=
df_user
[
'
combination
'
].
values
.
reshape
(
-
1
,
1
)
elif
'
avg_rating
'
in
df_user
.
columns
:
# Si 'n_character_title' est disponible comme caractéristique
X
=
df_user
[
'
avg_rating
'
].
values
.
reshape
(
-
1
,
1
)
elif
'
tags
'
in
df_user
.
columns
:
# Si une autre caractéristique est disponible (remplace 'other_feature' par le nom de ta caractéristique)
X
=
df_user
[
'
tags
'
].
values
.
reshape
(
-
1
,
1
)
elif
'
n_character_tags
'
in
df_user
.
columns
:
# Si une autre caractéristique est disponible (remplace 'other_feature' par le nom de ta caractéristique)
X
=
df_user
[
'
n_character_tags
'
].
values
.
reshape
(
-
1
,
1
)
else
:
# Si aucune caractéristique appropriée n'est disponible
continue
# Ou gère le cas d'erreur/exception ici
y
=
df_user
[
'
user_ratings
'
].
values
svr_regressor
=
SVR
(
kernel
=
'
rbf
'
,
C
=
10
,
epsilon
=
0.2
)
svr_regressor
.
fit
(
X
,
y
)
self
.
user_profile
[
u
]
=
svr_regressor
elif
self
.
regressor_method
==
'
gradient_boosting
'
:
for
u
in
self
.
user_profile
:
for
u
in
self
.
user_profile
:
user_ratings
=
[
rating
for
(
_
,
rating
)
in
trainset
.
ur
[
u
]]
item_ids
=
[
iid
for
(
iid
,
_
)
in
trainset
.
ur
[
u
]]
raw_item_ids
=
[
trainset
.
to_raw_iid
(
iid
)
for
iid
in
item_ids
]
user_ratings
=
[
rating
for
_
,
rating
in
trainset
.
ur
[
u
]]
df_user
=
pd
.
DataFrame
({
'
item_id
'
:
raw_item_ids
,
'
user_ratings
'
:
user_ratings
})
item_ids
=
[
iid
for
iid
,
_
in
trainset
.
ur
[
u
]]
df_user
=
df_user
.
merge
(
self
.
content_features
,
left_on
=
"
item_id
"
,
right_index
=
True
,
how
=
'
left
'
)
df_user
=
pd
.
DataFrame
({
'
item_id
'
:
item_ids
,
'
user_ratings
'
:
user_ratings
})
df_user
[
"
item_id
"
]
=
df_user
[
"
item_id
"
].
map
(
trainset
.
to_raw_iid
)
df_user
=
df_user
.
merge
(
self
.
content_features
,
left_on
=
"
item_id
"
,
right_index
=
True
,
how
=
'
left
'
)
if
'
n_character_title
'
in
df_user
.
columns
:
# Si 'n_character_title' est disponible comme caractéristique
X
=
df_user
[
'
n_character_title
'
].
values
.
reshape
(
-
1
,
1
)
elif
'
avg_relevance
'
in
df_user
.
columns
:
# Si 'n_character_title' est disponible comme caractéristique
X
=
df_user
[
'
avg_relevance
'
].
values
.
reshape
(
-
1
,
1
)
elif
'
movie_year
'
in
df_user
.
columns
:
# Si 'n_character_title' est disponible comme caractéristique
X
=
df_user
[
'
movie_year
'
].
values
.
reshape
(
-
1
,
1
)
elif
'
genres
'
in
df_user
.
columns
:
# Si 'n_character_title' est disponible comme caractéristique
X
=
df_user
[
'
genres
'
].
values
.
reshape
(
-
1
,
1
)
elif
'
combination
'
in
df_user
.
columns
:
X
=
df_user
.
drop
(
columns
=
[
'
item_id
'
,
'
user_ratings
'
])
# Si 'n_character_title' est disponible comme caractéristique
y
=
df_user
[
'
user_ratings
'
]
X
=
df_user
[
'
combination
'
].
values
.
reshape
(
-
1
,
1
)
elif
'
avg_rating
'
in
df_user
.
columns
:
regressor
=
regressor_models
[
self
.
regressor_method
]
# Si 'n_character_title' est disponible comme caractéristique
regressor
.
fit
(
X
,
y
)
X
=
df_user
[
'
avg_rating
'
].
values
.
reshape
(
-
1
,
1
)
elif
'
tags
'
in
df_user
.
columns
:
self
.
user_profile
[
u
]
=
regressor
# Si une autre caractéristique est disponible (remplace 'other_feature' par le nom de ta caractéristique)
X
=
df_user
[
'
tags
'
].
values
.
reshape
(
-
1
,
1
)
elif
'
n_character_tags
'
in
df_user
.
columns
:
# Si une autre caractéristique est disponible (remplace 'other_feature' par le nom de ta caractéristique)
X
=
df_user
[
'
n_character_tags
'
].
values
.
reshape
(
-
1
,
1
)
else
:
# Si aucune caractéristique appropriée n'est disponible
continue
# Ou gère le cas d'erreur/exception ici
y
=
df_user
[
'
user_ratings
'
].
values
gb_regressor
=
GradientBoostingRegressor
(
n_estimators
=
100
,
learning_rate
=
0.1
,
max_depth
=
3
)
gb_regressor
.
fit
(
X
,
y
)
self
.
user_profile
[
u
]
=
gb_regressor
elif
self
.
regressor_method
==
'
random_forest
'
:
for
u
in
self
.
user_profile
:
user_ratings
=
[
rating
for
_
,
rating
in
trainset
.
ur
[
u
]]
item_ids
=
[
iid
for
iid
,
_
in
trainset
.
ur
[
u
]]
df_user
=
pd
.
DataFrame
({
'
item_id
'
:
item_ids
,
'
user_ratings
'
:
user_ratings
})
df_user
[
"
item_id
"
]
=
df_user
[
"
item_id
"
].
map
(
trainset
.
to_raw_iid
)
df_user
=
df_user
.
merge
(
self
.
content_features
,
left_on
=
"
item_id
"
,
right_index
=
True
,
how
=
'
left
'
)
if
'
n_character_title
'
in
df_user
.
columns
:
# Si 'n_character_title' est disponible comme caractéristique
X
=
df_user
[
'
n_character_title
'
].
values
.
reshape
(
-
1
,
1
)
elif
'
avg_relevance
'
in
df_user
.
columns
:
# Si 'n_character_title' est disponible comme caractéristique
X
=
df_user
[
'
avg_relevance
'
].
values
.
reshape
(
-
1
,
1
)
elif
'
movie_year
'
in
df_user
.
columns
:
# Si 'n_character_title' est disponible comme caractéristique
X
=
df_user
[
'
movie_year
'
].
values
.
reshape
(
-
1
,
1
)
elif
'
genres
'
in
df_user
.
columns
:
# Si 'n_character_title' est disponible comme caractéristique
X
=
df_user
[
'
genres
'
].
values
.
reshape
(
-
1
,
1
)
elif
'
combination
'
in
df_user
.
columns
:
# Si 'n_character_title' est disponible comme caractéristique
X
=
df_user
[
'
combination
'
].
values
.
reshape
(
-
1
,
1
)
elif
'
avg_rating
'
in
df_user
.
columns
:
# Si 'n_character_title' est disponible comme caractéristique
X
=
df_user
[
'
avg_rating
'
].
values
.
reshape
(
-
1
,
1
)
elif
'
tags
'
in
df_user
.
columns
:
# Si une autre caractéristique est disponible (remplace 'other_feature' par le nom de ta caractéristique)
X
=
df_user
[
'
tags
'
].
values
.
reshape
(
-
1
,
1
)
elif
'
n_character_tags
'
in
df_user
.
columns
:
# Si une autre caractéristique est disponible (remplace 'other_feature' par le nom de ta caractéristique)
X
=
df_user
[
'
n_character_tags
'
].
values
.
reshape
(
-
1
,
1
)
else
:
# Si aucune caractéristique appropriée n'est disponible
continue
# Ou gère le cas d'erreur/exception ici
y
=
df_user
[
'
user_ratings
'
].
values
rf_regressor
=
RandomForestRegressor
(
n_estimators
=
100
)
rf_regressor
.
fit
(
X
,
y
)
self
.
user_profile
[
u
]
=
rf_regressor
else
:
pass
# (implement here the regressor fitting)
def
estimate
(
self
,
u
,
i
):
def
estimate
(
self
,
u
,
i
):
"""
Scoring component used for item filtering
"""
"""
Scoring component used for item filtering
"""
# First, handle cases for unknown users and items
if
not
(
self
.
trainset
.
knows_user
(
u
)
and
self
.
trainset
.
knows_item
(
i
)):
if
not
(
self
.
trainset
.
knows_user
(
u
)
and
self
.
trainset
.
knows_item
(
i
)):
raise
PredictionImpossible
(
'
User and/or item is unkown.
'
)
raise
PredictionImpossible
(
'
User and/or item is unknown.
'
)
if
self
.
regressor_method
==
'
random_score
'
:
if
self
.
regressor_method
==
'
random_score
'
:
rd
.
seed
()
return
rd
.
uniform
(
0.5
,
5
)
score
=
rd
.
uniform
(
0.5
,
5
)
elif
self
.
regressor_method
==
'
random_sample
'
:
elif
self
.
regressor_method
==
'
random_sample
'
:
rd
.
seed
()
return
rd
.
choice
(
self
.
user_profile
[
u
])
score
=
rd
.
choice
(
self
.
user_profile
[
u
])
elif
self
.
regressor_method
==
'
linear_regression
'
:
raw_item_id
=
self
.
trainset
.
to_raw_iid
(
i
)
item_features
=
self
.
content_features
.
loc
[
raw_item_id
:
raw_item_id
,
:].
values
linear_regressor
=
self
.
user_profile
[
u
]
score
=
linear_regressor
.
predict
(
item_features
)[
0
]
elif
self
.
regressor_method
==
'
svr_regression
'
:
else
:
raw_item_id
=
self
.
trainset
.
to_raw_iid
(
i
)
raw_item_id
=
self
.
trainset
.
to_raw_iid
(
i
)
item_features
=
self
.
content_features
.
loc
[
raw_item_id
,
:].
values
.
reshape
(
1
,
-
1
)
regressor
=
self
.
user_profile
[
u
]
item_features_df
=
pd
.
DataFrame
(
item_features
,
columns
=
self
.
content_features
.
columns
)
return
regressor
.
predict
(
item_features_df
)[
0
]
item_features
=
self
.
content_features
.
loc
[
raw_item_id
:
raw_item_id
,
:].
values
def
explain
(
self
,
u
):
if
u
in
self
.
user_profile_explain
:
svr_regressor
=
self
.
user_profile
[
u
]
return
self
.
user_profile_explain
[
u
]
score
=
svr_regressor
.
predict
(
item_features
)[
0
]
else
:
return
None
elif
self
.
regressor_method
==
'
gradient_boosting
'
:
raw_item_id
=
self
.
trainset
.
to_raw_iid
(
i
)
item_features
=
self
.
content_features
.
loc
[
raw_item_id
:
raw_item_id
,
:].
values
gradient_boosting
=
self
.
user_profile
[
u
]
score
=
gradient_boosting
.
predict
(
item_features
)[
0
]
elif
self
.
regressor_method
==
'
random_forest
'
:
raw_item_id
=
self
.
trainset
.
to_raw_iid
(
i
)
item_features
=
self
.
content_features
.
loc
[
raw_item_id
:
raw_item_id
,
:].
values
randomforest
=
self
.
user_profile
[
u
]
score
=
randomforest
.
predict
(
item_features
)[
0
]
else
:
score
=
None
# (implement here the regressor prediction)
# Example usage: fit a ridge-regression content-based model on the full
# ratings trainset (reconstructed from the added side of the diff).
feature_set = ["title_length", "movie_year", "genre", "avg_rating"]
cb = ContentBased(feature_set, "ridge_regression")

surprise_data = load_ratings(surprise_format=True)
trainset = surprise_data.build_full_trainset()
# Anti-test set = all (user, item) pairs absent from the trainset.
testset = trainset.build_anti_testset()
cb.fit(trainset)
def explain(self, u):
    """Return the stored explanation (feature weights) for user ``u``.

    Parameters
    ----------
    u : user id used as a key into ``self.user_profile_explain``
        (presumably an inner/raw user id — confirm against the caller).

    Returns
    -------
    The explanation object built during ``fit`` for this user, or
    ``None`` when no profile exists for ``u``.
    """
    # dict.get() has exactly the original "None when absent" semantics
    # of the `if u in d: return d[u] else: return None` pattern, with a
    # single dictionary lookup instead of two.
    return self.user_profile_explain.get(u)
cb
=
ContentBased
(
"
title_length
"
,
"
random_sample
"
)
#Example explanations for users:
sp_ratings
=
load_ratings
(
surprise_format
=
True
)
print
(
cb
.
explain
(
11
))
train_set
=
sp_ratings
.
build_full_trainset
()
print
(
cb
.
fit
(
train_set
))
print
(
cb
.
explain
(
0
))
print
(
cb
.
explain
(
13
))
print
(
cb
.
explain
(
1
))
print
(
cb
.
explain
(
1
7
))
print
(
cb
.
explain
(
2
))
print
(
cb
.
explain
(
2
3
))
print
(
cb
.
explain
(
3
))
print
(
cb
.
explain
(
27
))
print
(
cb
.
explain
(
4
))
print
(
cb
.
explain
(
73
))
```
```
%% Output
%% Output
0
0
1
1
2
2
3
3
4
4
5
5
None
None
{'n_character_title': array([0.03019692])}
{'n_character_title': array([0.03019692])}
{'n_character_title': array([0.04098154])}
{'n_character_title': array([0.04098154])}
{'n_character_title': array([0.02942264])}
{'n_character_title': array([0.02942264])}
{'n_character_title': array([0.08196307])}
{'n_character_title': array([0.08196307])}
{'n_character_title': array([0.02798739])}
{'n_character_title': array([0.02798739])}
%% Cell type:code id:baab88b7 tags:
```
python
from pprint import pprint

# Vectorize each movie's genre string into a TF-IDF matrix.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df_items['genres'])

# One column per genre token learned by the vectorizer.
genre_names = tfidf_vectorizer.get_feature_names_out()
df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=genre_names)

print("Matrice TF-IDF des genres :")
display(df_tfidf)
```
%% Output
Matrice TF-IDF des genres :
%% Cell type:markdown id:ffd75b7e tags:
%% Cell type:markdown id:ffd75b7e tags:
The following script test the ContentBased class
The following script test the ContentBased class
%% Cell type:code id:69d12f7d tags:
%% Cell type:code id:69d12f7d tags:
```
python
```
python
def test_contentbased_class(feature_method, regressor_method):
    """
    Test the ContentBased class.
    Tries to make a prediction on the first (user,item ) tuple of the anti_test_set
    """
    ratings = load_ratings(surprise_format=True)
    full_trainset = ratings.build_full_trainset()

    algo = ContentBased(feature_method, regressor_method)
    algo.fit(full_trainset)

    # Predict the rating of the first (user, item) pair the model never saw.
    first_unseen = full_trainset.build_anti_testset()[0]
    result = algo.predict(first_unseen[0], first_unseen[1])
    print(result)
# Smoke-test the ridge-regression model on the combined feature set.
features = ["title_length", "movie_year", "genre", "avg_rating"]
test_contentbased_class(features, "ridge_regression")
# print("title_length :")
# test_contentbased_class(feature_method = "title_length" , regressor_method = "random_score")
# test_contentbased_class(feature_method = "title_length" , regressor_method = "random_sample")
# test_contentbased_class(feature_method = "title_length" , regressor_method = "linear_regression")
# test_contentbased_class(feature_method= "title_length", regressor_method= "svr_regression")
# test_contentbased_class(feature_method= "title_length", regressor_method= "gradient_boosting")
# test_contentbased_class(feature_method= "title_length", regressor_method= "random_forest")
# print("\n")
# print("movie_year : ")
# test_contentbased_class(feature_method= "movie_year", regressor_method= "random_score")
# test_contentbased_class(feature_method= "movie_year", regressor_method= "random_sample")
# test_contentbased_class(feature_method= "movie_year", regressor_method= "linear_regression")
# test_contentbased_class(feature_method= "movie_year", regressor_method= "svr_regression")
# test_contentbased_class(feature_method= "movie_year", regressor_method= "gradient_boosting")
# test_contentbased_class(feature_method= "movie_year", regressor_method= "random_forest")
# print("\n")
# print("relevance : ")
# test_contentbased_class(feature_method= "relevance", regressor_method= "random_score")
# test_contentbased_class(feature_method= "relevance", regressor_method= "random_sample")
# test_contentbased_class(feature_method= "relevance", regressor_method= "linear_regression")
# test_contentbased_class(feature_method= "relevance", regressor_method= "svr_regression")
# test_contentbased_class(feature_method= "relevance", regressor_method= "gradient_boosting")
# test_contentbased_class(feature_method= "relevance", regressor_method= "random_forest")
# print("\n")
# print("genres : ")
# test_contentbased_class(feature_method= "genres", regressor_method= "random_score")
# test_contentbased_class(feature_method= "genres", regressor_method= "random_sample")
# test_contentbased_class(feature_method= "genres", regressor_method= "linear_regression")
# test_contentbased_class(feature_method= "genres", regressor_method= "svr_regression")
# test_contentbased_class(feature_method= "genres", regressor_method= "gradient_boosting")
# test_contentbased_class(feature_method= "genres", regressor_method= "random_forest")
# print("\n")
# print("rating : ")
# test_contentbased_class(feature_method= "rating", regressor_method="random_score")
# test_contentbased_class(feature_method= "rating", regressor_method="random_sample")
# # test_contentbased_class(feature_method= "rating", regressor_method="linear_regression")
# #test_contentbased_class(feature_method="rating", regressor_method="svr_regression")
# #test_contentbased_class(feature_method="rating", regressor_method="gradient_boosting")
# #test_contentbased_class(feature_method="rating", regressor_method="random_forest")
# print("\n")
# print("tags : ")
# test_contentbased_class(feature_method="tags", regressor_method="random_score")
# test_contentbased_class(feature_method="tags", regressor_method="random_sample")
# #test_contentbased_class(feature_method="tags", regressor_method="linear_regression")
# # test_contentbased_class(feature_method="tags", regressor_method="svr_regression")
# # test_contentbased_class(feature_method="tags", regressor_method="gradient_boosting")
# # test_contentbased_class(feature_method="tags", regressor_method="random_forest")
# print("\n")
# print("tags_length : ")
# test_contentbased_class(feature_method="tags_length", regressor_method="random_score")
# test_contentbased_class(feature_method="tags_length", regressor_method="random_sample")
# test_contentbased_class(feature_method="tags_length", regressor_method="linear_regression")
# test_contentbased_class(feature_method="tags_length", regressor_method="svr_regression")
# test_contentbased_class(feature_method="tags_length", regressor_method="gradient_boosting")
# test_contentbased_class(feature_method="tags_length", regressor_method="random_forest")
# print("\n")
# print("combination : ")
# test_contentbased_class(feature_method="combination", regressor_method="random_score")
# test_contentbased_class(feature_method="combination", regressor_method="random_sample")
# test_contentbased_class(feature_method="combination", regressor_method="linear_regression")
# test_contentbased_class(feature_method="combination", regressor_method="svr_regression")
# test_contentbased_class(feature_method="combination", regressor_method="gradient_boosting")
# test_contentbased_class(feature_method="combination", regressor_method="random_forest")
```
```
...
...
Ce diff est replié.
Cliquez pour l'agrandir.
Aperçu
0%
Chargement en cours
Veuillez réessayer
ou
joindre un nouveau fichier
.
Annuler
You are about to add
0
people
to the discussion. Proceed with caution.
Terminez d'abord l'édition de ce message.
Enregistrer le commentaire
Annuler
Veuillez vous
inscrire
ou vous
se connecter
pour commenter