From 6456e17aad30d5649d8968a91b69b06f9484e7d3 Mon Sep 17 00:00:00 2001 From: Nathanael <nathanael.kindidi@student.uclouvain.be> Date: Thu, 23 May 2024 16:35:17 +0200 Subject: [PATCH] modif class content based --- content_based.ipynb | 896 ++++++-------------------------------------- 1 file changed, 121 insertions(+), 775 deletions(-) diff --git a/content_based.ipynb b/content_based.ipynb index 24950d71..e2c88030 100644 --- a/content_based.ipynb +++ b/content_based.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": 756, + "execution_count": 15, "id": "277473a3", "metadata": {}, "outputs": [ @@ -53,89 +53,26 @@ }, { "cell_type": "code", - "execution_count": 757, + "execution_count": 16, "id": "e8378976", "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - " .dataframe tbody tr th {\n", - " vertical-align: top;\n", - " }\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" class=\"dataframe\">\n", - " <thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>n_character_title</th>\n", - " </tr>\n", - " <tr>\n", - " <th>movieId</th>\n", - " <th></th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th>4993</th>\n", - " <td>57</td>\n", - " </tr>\n", - " <tr>\n", - " <th>5952</th>\n", - " <td>45</td>\n", - " </tr>\n", - " <tr>\n", - " <th>527</th>\n", - " <td>23</td>\n", - " </tr>\n", - " <tr>\n", - " <th>2028</th>\n", - " <td>26</td>\n", - " </tr>\n", - " <tr>\n", - " <th>4308</th>\n", - " <td>19</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "</div>" - ], - "text/plain": [ - " n_character_title\n", - "movieId \n", - "4993 57\n", - "5952 45\n", - "527 23\n", - "2028 26\n", - "4308 19" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "0 long\n", - "1 boring\n", - "2 long\n", - "3 romance\n", - "4 stupidity\n", - "Name: tag, dtype: object" - ] - }, - "metadata": {}, - "output_type": "display_data" + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: 'data/test/content/movies.csv'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[16], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# All the dataframes\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m df_items \u001b[38;5;241m=\u001b[39m \u001b[43mload_items\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3\u001b[0m df_ratings \u001b[38;5;241m=\u001b[39m load_ratings()\n\u001b[1;32m 4\u001b[0m df_tag \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mread_csv(C\u001b[38;5;241m.\u001b[39mCONTENT_PATH\u001b[38;5;241m/\u001b[39mC\u001b[38;5;241m.\u001b[39mTAGS_FILENAME)\n", + "File \u001b[0;32m~/Desktop/UniversiteÌ/Recommender Systems/recomsys/loaders.py:34\u001b[0m, in \u001b[0;36mload_items\u001b[0;34m()\u001b[0m\n\u001b[1;32m 28\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mload_items\u001b[39m():\n\u001b[1;32m 29\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Loads items data.\u001b[39;00m\n\u001b[1;32m 30\u001b[0m \n\u001b[1;32m 31\u001b[0m \u001b[38;5;124;03m Returns:\u001b[39;00m\n\u001b[1;32m 32\u001b[0m \u001b[38;5;124;03m DataFrame: Items data.\u001b[39;00m\n\u001b[1;32m 33\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 34\u001b[0m df_items \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_csv\u001b[49m\u001b[43m(\u001b[49m\u001b[43mC\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mCONTENT_PATH\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m/\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mC\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mITEMS_FILENAME\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# ce qui se trouve dans le movie csv\u001b[39;00m\n\u001b[1;32m 35\u001b[0m df_items \u001b[38;5;241m=\u001b[39m df_items\u001b[38;5;241m.\u001b[39mset_index(C\u001b[38;5;241m.\u001b[39mITEM_ID_COL) \u001b[38;5;66;03m# movie id\u001b[39;00m\n\u001b[1;32m 36\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m df_items\n", + "File \u001b[0;32m~/.pyenv/versions/3.12.0/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1026\u001b[0m, in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[0m\n\u001b[1;32m 1013\u001b[0m kwds_defaults \u001b[38;5;241m=\u001b[39m _refine_defaults_read(\n\u001b[1;32m 1014\u001b[0m dialect,\n\u001b[1;32m 1015\u001b[0m delimiter,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1022\u001b[0m dtype_backend\u001b[38;5;241m=\u001b[39mdtype_backend,\n\u001b[1;32m 1023\u001b[0m )\n\u001b[1;32m 1024\u001b[0m kwds\u001b[38;5;241m.\u001b[39mupdate(kwds_defaults)\n\u001b[0;32m-> 1026\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.pyenv/versions/3.12.0/lib/python3.12/site-packages/pandas/io/parsers/readers.py:620\u001b[0m, in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 617\u001b[0m _validate_names(kwds\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnames\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[1;32m 619\u001b[0m \u001b[38;5;66;03m# Create the parser.\u001b[39;00m\n\u001b[0;32m--> 620\u001b[0m parser \u001b[38;5;241m=\u001b[39m \u001b[43mTextFileReader\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 622\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m chunksize \u001b[38;5;129;01mor\u001b[39;00m iterator:\n\u001b[1;32m 623\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m parser\n", + "File \u001b[0;32m~/.pyenv/versions/3.12.0/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1620\u001b[0m, in \u001b[0;36mTextFileReader.__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m 1617\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m kwds[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 1619\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles: IOHandles \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 1620\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_engine\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mengine\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.pyenv/versions/3.12.0/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1880\u001b[0m, in \u001b[0;36mTextFileReader._make_engine\u001b[0;34m(self, f, engine)\u001b[0m\n\u001b[1;32m 1878\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m mode:\n\u001b[1;32m 1879\u001b[0m mode \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m-> 1880\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;241m=\u001b[39m \u001b[43mget_handle\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1881\u001b[0m \u001b[43m \u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1882\u001b[0m \u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1883\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1884\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompression\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcompression\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1885\u001b[0m \u001b[43m \u001b[49m\u001b[43mmemory_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmemory_map\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1886\u001b[0m \u001b[43m \u001b[49m\u001b[43mis_text\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_text\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1887\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding_errors\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstrict\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1888\u001b[0m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstorage_options\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1889\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1890\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1891\u001b[0m f \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles\u001b[38;5;241m.\u001b[39mhandle\n", + "File \u001b[0;32m~/.pyenv/versions/3.12.0/lib/python3.12/site-packages/pandas/io/common.py:873\u001b[0m, in \u001b[0;36mget_handle\u001b[0;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[1;32m 868\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(handle, \u001b[38;5;28mstr\u001b[39m):\n\u001b[1;32m 869\u001b[0m \u001b[38;5;66;03m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[1;32m 870\u001b[0m \u001b[38;5;66;03m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[1;32m 871\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mencoding \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mmode:\n\u001b[1;32m 872\u001b[0m \u001b[38;5;66;03m# Encoding\u001b[39;00m\n\u001b[0;32m--> 873\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[1;32m 874\u001b[0m \u001b[43m \u001b[49m\u001b[43mhandle\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 875\u001b[0m \u001b[43m \u001b[49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 876\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencoding\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 877\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 878\u001b[0m \u001b[43m \u001b[49m\u001b[43mnewline\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 879\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 880\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 881\u001b[0m \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[1;32m 882\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(handle, ioargs\u001b[38;5;241m.\u001b[39mmode)\n", + "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'data/test/content/movies.csv'" + ] } ], "source": [ @@ -169,7 +106,7 @@ }, { "cell_type": "code", - "execution_count": 758, + "execution_count": null, "id": "16b0a602", "metadata": {}, "outputs": [ @@ -193,697 +130,171 @@ } ], "source": [ + "\n", + "# ContetnBased\n", "class ContentBased(AlgoBase):\n", " def __init__(self, features_method, regressor_method):\n", " AlgoBase.__init__(self)\n", " self.regressor_method = regressor_method\n", + " self.features_methods = features_method\n", " self.content_features = self.create_content_features(features_method)\n", + " self.user_profile = {}\n", " self.user_profile_explain = {}\n", "\n", - " def create_content_features(self, features_method):\n", + " def create_content_features(self, features_methods):\n", " \"\"\"Content Analyzer\"\"\"\n", " df_items = load_items()\n", " df_ratings = load_ratings()\n", - " df_tag = df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME)\n", + " df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME)\n", " df_genome_score = pd.read_csv(\"data/hackathon/content/genome-scores.csv\")\n", " df_genome_tag = pd.read_csv(\"data/hackathon/content/genome-tags.csv\")\n", "\n", - " if features_method is None:\n", - " df_features = None\n", - "\n", - " elif features_method == \"relevance\" :\n", - " df_features = df_genome_score.groupby('movieId')[\"relevance\"].transform('mean').to_frame('avg_relevance')\n", + " df_features = pd.DataFrame(index=df_items.index)\n", "\n", - " elif features_method == \"title_length\": # a naive method that creates only 1 feature based on title length\n", - " df_features = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('n_character_title')\n", + " for method in features_methods:\n", + " if method == \"title_length\":\n", + " df_title_length = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('title_length')\n", + " df_features = pd.concat([df_features, df_title_length], axis=1)\n", " \n", - " elif features_method == \"movie_year\" :\n", - " df_features = df_items['movie_year'] = df_items['title'].str.extract(r'\\((\\d{4})\\)', expand=False).to_frame('movie_year')\n", - "\n", - " elif features_method == \"genres\" :\n", - " genres_list = df_items['genres'].str.split('|').explode().unique()\n", - " for genre in genres_list:\n", - " df_features = df_items['genres'].str.contains(genre).astype(int).to_frame('genres')\n", - " \n", - " elif features_method == \"combination\": \n", - " df_length = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('n_character_title')\n", - " df_movie = df_items['title'].str.extract(r'\\((\\d{4})\\)', expand=False).to_frame('movie_year')\n", - " genres_list = df_items['genres'].str.split('|').explode().unique()\n", - " for genre in genres_list:\n", - " df_genre = df_items['genres'].str.contains(genre).astype(int).to_frame('genres')\n", - " \n", - " df_features = pd.concat([df_genre, df_length, df_movie], axis=1)\n", - " \n", - " elif features_method == \"rating\" :\n", - " df_features = df_ratings.groupby('movieId')['rating'].transform('mean').to_frame('avg_rating')\n", - "\n", - " elif features_method == \"tags\" :\n", - " df_features = df_tag['tag'].apply(lambda x: len(x.split(',')) if isinstance(x, str) else 0).to_frame('tags')\n", - "\n", - " elif features_method == \"tags_length\" :\n", + " elif method == \"movie_year\":\n", + " df_movie_year = df_items['title'].str.extract(r'\\((\\d{4})\\)', expand=False).to_frame('movie_year')\n", + " df_features = pd.concat([df_features, df_movie_year.astype(float).fillna(0)], axis=1)\n", " \n", - " df_features = df_tag['tag'].apply(lambda x: sum(len(tag) for tag in x.split(','))if isinstance(x, str) else 0).to_frame('n_character_tags')\n", + " elif method == \"genre\":\n", + " tfidf_vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split('|'), token_pattern=None)\n", + " tfidf_matrix = tfidf_vectorizer.fit_transform(df_items['genres'])\n", + " df_tfidf_genres = pd.DataFrame(tfidf_matrix.toarray(), index=df_items.index, columns=tfidf_vectorizer.get_feature_names_out())\n", + " df_features = pd.concat([df_features, df_tfidf_genres], axis=1)\n", "\n", + " elif method == \"avg_rating\":\n", + " df_avg_rating = df_ratings.groupby('movieId')['rating'].mean().to_frame('avg_rating')\n", + " df_features = df_features.join(df_avg_rating, on='movieId')\n", + "\n", + " else:\n", + " raise NotImplementedError(f'Feature method {method} not yet implemented')\n", + "\n", + " # Handle missing values in df_features\n", + " df_features.fillna(0, inplace=True)\n", "\n", - " else: # (implement other feature creations here)\n", - " raise NotImplementedError(f'Feature method {features_method} not yet implemented')\n", " return df_features\n", - " \n", "\n", " def fit(self, trainset):\n", " \"\"\"Profile Learner\"\"\"\n", " AlgoBase.fit(self, trainset)\n", - " \n", + "\n", " # Preallocate user profiles\n", " self.user_profile = {u: None for u in trainset.all_users()}\n", + " self.user_profile_explain = {}\n", "\n", - " self.user_profile_explain = {u: {} for u in trainset.all_users()}\n", - "\n", - " for u in self.user_profile_explain :\n", - " print(u)\n", - " user_ratings = np.array([rating for _, rating in trainset.ur[u]])\n", - "\n", - " feature_values = self.content_features.values\n", - "\n", - " fv = feature_values.astype(int)\n", - "\n", - " weighted_features = fv/np.linalg.norm(fv)\n", - "\n", - " feature_importance = weighted_features / np.sum(user_ratings)\n", + " epsilon = 1e-10 # Small value to prevent division by zero\n", "\n", - " self.user_profile_explain[u] = dict(zip(self.content_features.columns, feature_importance))\n", - " \n", + " for u in trainset.all_users():\n", + " raw_user_id = trainset.to_raw_uid(u)\n", + " self.user_profile_explain[raw_user_id] = {}\n", "\n", + " user_ratings = np.array([rating for (_, rating) in trainset.ur[u]])\n", + " item_ids = [iid for (iid, _) in trainset.ur[u]]\n", + " raw_item_ids = [trainset.to_raw_iid(iid) for iid in item_ids]\n", "\n", + " feature_values = self.content_features.loc[raw_item_ids].values\n", + " norms = np.linalg.norm(feature_values, axis=0) + epsilon\n", + " weighted_features = feature_values / norms\n", + " feature_importance = weighted_features.T @ user_ratings\n", + " feature_importance /= np.sum(user_ratings)\n", "\n", + " self.user_profile_explain[raw_user_id] = dict(zip(self.content_features.columns, feature_importance))\n", "\n", " if self.regressor_method == 'random_score':\n", - " for u in self.user_profile :\n", - " self.user_profile[u] = rd.uniform(0.5,5)\n", - " \n", - " elif self.regressor_method == 'random_sample':\n", " for u in self.user_profile:\n", - " self.user_profile[u] = [rating for _, rating in self.trainset.ur[u]]\n", + " self.user_profile[u] = rd.uniform(0.5, 5)\n", "\n", - " elif self.regressor_method == 'linear_regression' :\n", + " elif self.regressor_method == 'random_sample':\n", " for u in self.user_profile:\n", + " self.user_profile[u] = [rating for (_, rating) in trainset.ur[u]]\n", + "\n", + " else:\n", + " regressor_models = {\n", + " 'linear_regression': LinearRegression(fit_intercept=False),\n", + " 'svr_regression': SVR(kernel='rbf', C=10, epsilon=0.2),\n", + " 'gradient_boosting': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3),\n", + " 'random_forest': RandomForestRegressor(n_estimators=100),\n", + " 'lasso_regression': Lasso(alpha=0.1),\n", + " 'ridge_regression': Ridge(alpha=1.0),\n", + " 'elastic_net': ElasticNet(alpha=1.0, l1_ratio=0.5),\n", + " 'knn_regression': KNeighborsRegressor(n_neighbors=1),\n", + " 'decision_tree': DecisionTreeRegressor(max_depth=5),\n", + " 'adaboost': AdaBoostRegressor(n_estimators=50),\n", + " 'xgboost': XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3),\n", + " 'lightgbm': LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)\n", + " }\n", + "\n", + " if self.regressor_method not in regressor_models:\n", + " raise NotImplementedError(f'Regressor method {self.regressor_method} not yet implemented')\n", "\n", - " user_ratings = [rating for _, rating in trainset.ur[u]]\n", - " item_ids = [iid for iid, _ in trainset.ur[u]]\n", - "\n", - " df_user = pd.DataFrame({'item_id': item_ids, 'user_ratings': user_ratings})\n", - "\n", - " df_user[\"item_id\"] = df_user[\"item_id\"].map(trainset.to_raw_iid)\n", - "\n", - " df_user = df_user.merge(self.content_features, left_on = \"item_id\", right_index = True, how = 'left')\n", - " \n", - " if 'n_character_title' in df_user.columns:\n", - " # Si 'n_character_title' est disponible comme caractéristique\n", - " X = df_user['n_character_title'].values.reshape(-1, 1)\n", - "\n", - " elif 'avg_relevance' in df_user.columns:\n", - " # Si 'n_character_title' est disponible comme caractéristique\n", - " X = df_user['avg_relevance'].values.reshape(-1, 1)\n", - " \n", - " elif 'movie_year' in df_user.columns:\n", - " # Si 'n_character_title' est disponible comme caractéristique\n", - " X = df_user['movie_year'].values.reshape(-1, 1)\n", - " \n", - " elif 'genres' in df_user.columns:\n", - " # Si 'n_character_title' est disponible comme caractéristique\n", - " X = df_user['genres'].values.reshape(-1, 1)\n", - " \n", - " elif 'combination' in df_user.columns:\n", - " # Si 'n_character_title' est disponible comme caractéristique\n", - " X = df_user['combination'].values.reshape(-1, 1)\n", - " \n", - " elif 'avg_rating' in df_user.columns:\n", - " # Si 'n_character_title' est disponible comme caractéristique\n", - " X = df_user['avg_rating'].values.reshape(-1, 1)\n", - "\n", - " elif 'tags' in df_user.columns:\n", - " # Si une autre caractéristique est disponible (remplace 'other_feature' par le nom de ta caractéristique)\n", - " X = df_user['tags'].values.reshape(-1, 1)\n", - "\n", - " elif 'n_character_tags' in df_user.columns:\n", - " # Si une autre caractéristique est disponible (remplace 'other_feature' par le nom de ta caractéristique)\n", - " X = df_user['n_character_tags'].values.reshape(-1, 1)\n", - "\n", - " else:\n", - " # Si aucune caractéristique appropriée n'est disponible\n", - " continue # Ou gère le cas d'erreur/exception ici\n", - "\n", - " y = df_user['user_ratings'].values\n", - "\n", - " linear_regressor = LinearRegression(fit_intercept = False)\n", - "\n", - " linear_regressor.fit(X,y)\n", - " \n", - " # Store the computed user profile\n", - " self.user_profile[u] = linear_regressor\n", - "\n", - " elif self.regressor_method == 'svr_regression':\n", " for u in self.user_profile:\n", + " user_ratings = [rating for (_, rating) in trainset.ur[u]]\n", + " item_ids = [iid for (iid, _) in trainset.ur[u]]\n", + " raw_item_ids = [trainset.to_raw_iid(iid) for iid in item_ids]\n", "\n", - " user_ratings = [rating for _, rating in trainset.ur[u]]\n", - " item_ids = [iid for iid, _ in trainset.ur[u]]\n", - "\n", - " df_user = pd.DataFrame({'item_id': item_ids, 'user_ratings': user_ratings})\n", - "\n", - " df_user[\"item_id\"] = df_user[\"item_id\"].map(trainset.to_raw_iid)\n", - "\n", - " df_user = df_user.merge(self.content_features, left_on = \"item_id\", right_index = True, how = 'left')\n", - "\n", - " if 'n_character_title' in df_user.columns:\n", - " # Si 'n_character_title' est disponible comme caractéristique\n", - " X = df_user['n_character_title'].values.reshape(-1, 1)\n", - "\n", - " elif 'avg_relevance' in df_user.columns:\n", - " # Si 'n_character_title' est disponible comme caractéristique\n", - " X = df_user['avg_relevance'].values.reshape(-1, 1)\n", - " \n", - " elif 'movie_year' in df_user.columns:\n", - " # Si 'n_character_title' est disponible comme caractéristique\n", - " X = df_user['movie_year'].values.reshape(-1, 1)\n", - " \n", - " elif 'genres' in df_user.columns:\n", - " # Si 'n_character_title' est disponible comme caractéristique\n", - " X = df_user['genres'].values.reshape(-1, 1)\n", - " \n", - " elif 'combination' in df_user.columns:\n", - " # Si 'n_character_title' est disponible comme caractéristique\n", - " X = df_user['combination'].values.reshape(-1, 1)\n", - " \n", - " elif 'avg_rating' in df_user.columns:\n", - " # Si 'n_character_title' est disponible comme caractéristique\n", - " X = df_user['avg_rating'].values.reshape(-1, 1)\n", - "\n", - " elif 'tags' in df_user.columns:\n", - " # Si une autre caractéristique est disponible (remplace 'other_feature' par le nom de ta caractéristique)\n", - " X = df_user['tags'].values.reshape(-1, 1)\n", - "\n", - " elif 'n_character_tags' in df_user.columns:\n", - " # Si une autre caractéristique est disponible (remplace 'other_feature' par le nom de ta caractéristique)\n", - " X = df_user['n_character_tags'].values.reshape(-1, 1)\n", - "\n", - " else:\n", - " # Si aucune caractéristique appropriée n'est disponible\n", - " continue # Ou gère le cas d'erreur/exception ici\n", - " \n", - " y = df_user['user_ratings'].values\n", - " svr_regressor = SVR(kernel='rbf', C=10, epsilon=0.2)\n", - " svr_regressor.fit(X, y)\n", - " self.user_profile[u] = svr_regressor\n", - "\n", - " elif self.regressor_method == 'gradient_boosting':\n", - " for u in self.user_profile:\n", + " df_user = pd.DataFrame({'item_id': raw_item_ids, 'user_ratings': user_ratings})\n", + " df_user = df_user.merge(self.content_features, left_on=\"item_id\", right_index=True, how='left')\n", "\n", - " user_ratings = [rating for _, rating in trainset.ur[u]]\n", - " item_ids = [iid for iid, _ in trainset.ur[u]]\n", - "\n", - " df_user = pd.DataFrame({'item_id': item_ids, 'user_ratings': user_ratings})\n", - "\n", - " df_user[\"item_id\"] = df_user[\"item_id\"].map(trainset.to_raw_iid)\n", - "\n", - " df_user = df_user.merge(self.content_features, left_on = \"item_id\", right_index = True, how = 'left')\n", - "\n", - " if 'n_character_title' in df_user.columns:\n", - " # Si 'n_character_title' est disponible comme caractéristique\n", - " X = df_user['n_character_title'].values.reshape(-1, 1)\n", - "\n", - " elif 'avg_relevance' in df_user.columns:\n", - " # Si 'n_character_title' est disponible comme caractéristique\n", - " X = df_user['avg_relevance'].values.reshape(-1, 1)\n", - " \n", - " elif 'movie_year' in df_user.columns:\n", - " # Si 'n_character_title' est disponible comme caractéristique\n", - " X = df_user['movie_year'].values.reshape(-1, 1)\n", - " \n", - " elif 'genres' in df_user.columns:\n", - " # Si 'n_character_title' est disponible comme caractéristique\n", - " X = df_user['genres'].values.reshape(-1, 1)\n", - " \n", - " elif 'combination' in df_user.columns:\n", - " # Si 'n_character_title' est disponible comme caractéristique\n", - " X = df_user['combination'].values.reshape(-1, 1)\n", - " \n", - " elif 'avg_rating' in df_user.columns:\n", - " # Si 'n_character_title' est disponible comme caractéristique\n", - " X = df_user['avg_rating'].values.reshape(-1, 1)\n", - "\n", - " elif 'tags' in df_user.columns:\n", - " # Si une autre caractéristique est disponible (remplace 'other_feature' par le nom de ta caractéristique)\n", - " X = df_user['tags'].values.reshape(-1, 1)\n", - "\n", - " elif 'n_character_tags' in df_user.columns:\n", - " # Si une autre caractéristique est disponible (remplace 'other_feature' par le nom de ta caractéristique)\n", - " X = df_user['n_character_tags'].values.reshape(-1, 1)\n", - "\n", - " else:\n", - " # Si aucune caractéristique appropriée n'est disponible\n", - " continue # Ou gère le cas d'erreur/exception ici\n", - " \n", - " y = df_user['user_ratings'].values\n", - " gb_regressor = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)\n", - " gb_regressor.fit(X, y)\n", - " self.user_profile[u] = gb_regressor\n", + " X = df_user.drop(columns=['item_id', 'user_ratings'])\n", + " y = df_user['user_ratings']\n", "\n", + " regressor = regressor_models[self.regressor_method]\n", + " regressor.fit(X, y)\n", "\n", - " elif self.regressor_method == 'random_forest':\n", - " for u in self.user_profile:\n", + " self.user_profile[u] = regressor\n", "\n", - " user_ratings = [rating for _, rating in trainset.ur[u]]\n", - " item_ids = [iid for iid, _ in trainset.ur[u]]\n", - "\n", - " df_user = pd.DataFrame({'item_id': item_ids, 'user_ratings': user_ratings})\n", - "\n", - " df_user[\"item_id\"] = df_user[\"item_id\"].map(trainset.to_raw_iid)\n", - "\n", - " df_user = df_user.merge(self.content_features, left_on = \"item_id\", right_index = True, how = 'left')\n", - "\n", - " if 'n_character_title' in df_user.columns:\n", - " # Si 'n_character_title' est disponible comme caractéristique\n", - " X = df_user['n_character_title'].values.reshape(-1, 1)\n", - "\n", - " elif 'avg_relevance' in df_user.columns:\n", - " # Si 'n_character_title' est disponible comme caractéristique\n", - " X = df_user['avg_relevance'].values.reshape(-1, 1)\n", - " \n", - " elif 'movie_year' in df_user.columns:\n", - " # Si 'n_character_title' est disponible comme caractéristique\n", - " X = df_user['movie_year'].values.reshape(-1, 1)\n", - " \n", - " elif 'genres' in df_user.columns:\n", - " # Si 'n_character_title' est disponible comme caractéristique\n", - " X = df_user['genres'].values.reshape(-1, 1)\n", - " \n", - " elif 'combination' in df_user.columns:\n", - " # Si 'n_character_title' est disponible comme caractéristique\n", - " X = df_user['combination'].values.reshape(-1, 1)\n", - " \n", - " elif 'avg_rating' in df_user.columns:\n", - " # Si 'n_character_title' est disponible comme caractéristique\n", - " X = df_user['avg_rating'].values.reshape(-1, 1)\n", - "\n", - " elif 'tags' in df_user.columns:\n", - " # Si une autre caractéristique est disponible (remplace 'other_feature' par le nom de ta caractéristique)\n", - " X = df_user['tags'].values.reshape(-1, 1)\n", - "\n", - " elif 'n_character_tags' in df_user.columns:\n", - " # Si une autre caractéristique est disponible (remplace 'other_feature' par le nom de ta caractéristique)\n", - " X = df_user['n_character_tags'].values.reshape(-1, 1)\n", - "\n", - " else:\n", - " # Si aucune caractéristique appropriée n'est disponible\n", - " continue # Ou gère le cas d'erreur/exception ici\n", - "\n", - " y = df_user['user_ratings'].values\n", - " rf_regressor = RandomForestRegressor(n_estimators=100)\n", - " rf_regressor.fit(X, y)\n", - " self.user_profile[u] = rf_regressor\n", - "\n", - " else : \n", - " pass\n", - "\n", - " # (implement here the regressor fitting) \n", - " \n", " def estimate(self, u, i):\n", " \"\"\"Scoring component used for item filtering\"\"\"\n", - " # First, handle cases for unknown users and items\n", " if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):\n", - " raise PredictionImpossible('User and/or item is unkown.')\n", - "\n", + " raise PredictionImpossible('User and/or item is unknown.')\n", "\n", " if self.regressor_method == 'random_score':\n", - " rd.seed()\n", - " score = rd.uniform(0.5,5)\n", + " return rd.uniform(0.5, 5)\n", "\n", " elif self.regressor_method == 'random_sample':\n", - " rd.seed()\n", - " score = rd.choice(self.user_profile[u])\n", - " \n", - " elif self.regressor_method == 'linear_regression':\n", - "\n", - " raw_item_id = self.trainset.to_raw_iid(i)\n", - "\n", - " item_features = self.content_features.loc[raw_item_id:raw_item_id, :].values\n", - "\n", - " linear_regressor = self.user_profile[u]\n", - "\n", - " score= linear_regressor.predict(item_features)[0]\n", - " \n", - " elif self.regressor_method == 'svr_regression':\n", + " return rd.choice(self.user_profile[u])\n", "\n", + " else:\n", " raw_item_id = self.trainset.to_raw_iid(i)\n", + " item_features = self.content_features.loc[raw_item_id, :].values.reshape(1, -1)\n", + " regressor = self.user_profile[u]\n", + " item_features_df = pd.DataFrame(item_features, columns=self.content_features.columns)\n", + " return regressor.predict(item_features_df)[0]\n", "\n", - " item_features = self.content_features.loc[raw_item_id:raw_item_id, :].values\n", - "\n", - " svr_regressor = self.user_profile[u]\n", - " score = svr_regressor.predict(item_features)[0]\n", - " \n", - " elif self.regressor_method == 'gradient_boosting':\n", - "\n", - " raw_item_id = self.trainset.to_raw_iid(i)\n", - "\n", - " item_features = self.content_features.loc[raw_item_id:raw_item_id, :].values\n", - "\n", - " gradient_boosting = self.user_profile[u]\n", - " score = gradient_boosting.predict(item_features)[0]\n", - " \n", - " elif self.regressor_method == 'random_forest':\n", - "\n", - " raw_item_id = self.trainset.to_raw_iid(i)\n", - "\n", - " item_features = self.content_features.loc[raw_item_id:raw_item_id, :].values\n", - "\n", - " randomforest = self.user_profile[u]\n", - " score = randomforest.predict(item_features)[0]\n", - " \n", - " else : \n", - " score = None\n", - "\n", - " # (implement here the regressor prediction)\n", - "\n", - " return score\n", - "\n", - " def explain(self, u) : \n", - " if u in self.user_profile_explain :\n", + " def explain(self, u):\n", + " if u in self.user_profile_explain:\n", " return self.user_profile_explain[u]\n", - " else :\n", + " else:\n", " return None\n", "\n", "\n", - "cb = ContentBased(\"title_length\", \"random_sample\")\n", - "sp_ratings = load_ratings(surprise_format=True)\n", - "train_set = sp_ratings.build_full_trainset()\n", - "print(cb.fit(train_set))\n", + "#Example usage:\n", + "cb = ContentBased([\"title_length\", \"movie_year\",\"genre\",\"avg_rating\"], \"ridge_regression\")\n", + "surprise_data = load_ratings(surprise_format=True)\n", + "trainset = surprise_data.build_full_trainset()\n", + "testset = trainset.build_anti_testset()\n", + "cb.fit(trainset)\n", "\n", - "print(cb.explain(0))\n", "\n", - "print(cb.explain(1))\n", + "#print(\"RMSE: \", cb.rmse(testset))\n", "\n", - "print(cb.explain(2))\n", "\n", - "print(cb.explain(3))\n", + "#Example explanations for users:\n", + "print(cb.explain(11))\n", "\n", - "print(cb.explain(4))\n" - ] - }, - { - "cell_type": "code", - "execution_count": 759, - "id": "baab88b7", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Matrice TF-IDF des genres :\n" - ] - }, - { - "data": { - "text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - " .dataframe tbody tr th {\n", - " vertical-align: top;\n", - " }\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" class=\"dataframe\">\n", - " <thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>action</th>\n", - " <th>adventure</th>\n", - " <th>animation</th>\n", - " <th>children</th>\n", - " <th>comedy</th>\n", - " <th>drama</th>\n", - " <th>fantasy</th>\n", - " <th>fi</th>\n", - " <th>horror</th>\n", - " <th>imax</th>\n", - " <th>musical</th>\n", - " <th>mystery</th>\n", - " <th>romance</th>\n", - " <th>sci</th>\n", - " <th>war</th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th>0</th>\n", - " <td>0.000000</td>\n", - " <td>0.658454</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.752621</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " </tr>\n", - " <tr>\n", - " <th>1</th>\n", - " <td>0.000000</td>\n", - " <td>0.658454</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.752621</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " </tr>\n", - " <tr>\n", - " <th>2</th>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.572658</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.819795</td>\n", - " </tr>\n", - " <tr>\n", - " <th>3</th>\n", - " <td>0.694164</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.412209</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.590102</td>\n", - " </tr>\n", - " <tr>\n", - " <th>4</th>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.465343</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.582818</td>\n", - " <td>0.000000</td>\n", - " <td>0.666168</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " </tr>\n", - " <tr>\n", - " <th>5</th>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.572658</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.819795</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " </tr>\n", - " <tr>\n", - " <th>6</th>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.647689</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.761905</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " </tr>\n", - " <tr>\n", - " <th>7</th>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.606043</td>\n", - " <td>0.515192</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.606043</td>\n", - " <td>0.000000</td>\n", - " </tr>\n", - " <tr>\n", - " <th>8</th>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.563507</td>\n", - " <td>0.000000</td>\n", - " <td>0.662879</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.493002</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " </tr>\n", - " <tr>\n", - " <th>9</th>\n", - " <td>0.000000</td>\n", - " <td>0.363703</td>\n", - " <td>0.415716</td>\n", - " <td>0.489026</td>\n", - " <td>0.000000</td>\n", - " <td>0.290394</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.489026</td>\n", - " <td>0.363703</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " <td>0.000000</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "</div>" - ], - "text/plain": [ - " action adventure animation children comedy drama fantasy \\\n", - "0 0.000000 0.658454 0.000000 0.000000 0.000000 0.000000 0.752621 \n", - "1 0.000000 0.658454 0.000000 0.000000 0.000000 0.000000 0.752621 \n", - "2 0.000000 0.000000 0.000000 0.000000 0.000000 0.572658 0.000000 \n", - "3 0.694164 0.000000 0.000000 0.000000 0.000000 0.412209 0.000000 \n", - "4 0.000000 0.000000 0.000000 0.000000 0.000000 0.465343 0.000000 \n", - "5 0.000000 0.000000 0.000000 0.000000 0.000000 0.572658 0.000000 \n", - "6 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", - "7 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", - "8 0.000000 0.000000 0.563507 0.000000 0.662879 0.000000 0.000000 \n", - "9 0.000000 0.363703 0.415716 0.489026 0.000000 0.290394 0.000000 \n", - "\n", - " fi horror imax musical mystery romance sci \\\n", - "0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", - "1 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", - "2 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", - "3 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", - "4 0.000000 0.000000 0.000000 0.582818 0.000000 0.666168 0.000000 \n", - "5 0.000000 0.000000 0.000000 0.000000 0.000000 0.819795 0.000000 \n", - "6 0.000000 0.647689 0.000000 0.000000 0.761905 0.000000 0.000000 \n", - "7 0.606043 0.515192 0.000000 0.000000 0.000000 0.000000 0.606043 \n", - "8 0.000000 0.000000 0.000000 0.493002 0.000000 0.000000 0.000000 \n", - "9 0.000000 0.000000 0.489026 0.363703 0.000000 0.000000 0.000000 \n", - "\n", - " war \n", - "0 0.000000 \n", - "1 0.000000 \n", - "2 0.819795 \n", - "3 0.590102 \n", - "4 0.000000 \n", - "5 0.000000 \n", - "6 0.000000 \n", - "7 0.000000 \n", - "8 0.000000 \n", - "9 0.000000 " - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from pprint import pprint\n", + "print(cb.explain(13))\n", "\n", - "# Créer une instance de TfidfVectorizer pour les genres\n", - "tfidf_vectorizer = TfidfVectorizer()\n", + "print(cb.explain(17))\n", "\n", - "# Fit et transform pour calculer la matrice TF-IDF des genres\n", - "tfidf_matrix = tfidf_vectorizer.fit_transform(df_items['genres'])\n", + "print(cb.explain(23))\n", "\n", - "# Obtenir les noms des genres (features)\n", - "genre_names = tfidf_vectorizer.get_feature_names_out()\n", + "print(cb.explain(27))\n", "\n", - "# Créer un DataFrame à partir de la matrice TF-IDF des genres\n", - "df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=genre_names)\n", - "\n", - "print(\"Matrice TF-IDF des genres :\")\n", - "display(df_tfidf)" + "print(cb.explain(73))\n" ] }, { @@ -896,7 +307,7 @@ }, { "cell_type": "code", - "execution_count": 760, + "execution_count": null, "id": "69d12f7d", "metadata": {}, "outputs": [], @@ -913,72 +324,7 @@ " prediction = content_algo.predict(anti_test_set_first[0], anti_test_set_first[1])\n", " print(prediction)\n", "\n", - "\n", - "\n", - "# print(\"title_length :\")\n", - "# test_contentbased_class(feature_method = \"title_length\" , regressor_method = \"random_score\")\n", - "# test_contentbased_class(feature_method = \"title_length\" , regressor_method = \"random_sample\")\n", - "# test_contentbased_class(feature_method = \"title_length\" , regressor_method = \"linear_regression\")\n", - "# test_contentbased_class(feature_method= \"title_length\", regressor_method= \"svr_regression\")\n", - "# test_contentbased_class(feature_method= \"title_length\", regressor_method= \"gradient_boosting\")\n", - "# test_contentbased_class(feature_method= \"title_length\", regressor_method= \"random_forest\")\n", - "# print(\"\\n\")\n", - "# print(\"movie_year : \")\n", - "# test_contentbased_class(feature_method= \"movie_year\", regressor_method= \"random_score\")\n", - "# test_contentbased_class(feature_method= \"movie_year\", regressor_method= \"random_sample\")\n", - "# test_contentbased_class(feature_method= \"movie_year\", regressor_method= \"linear_regression\")\n", - "# test_contentbased_class(feature_method= \"movie_year\", regressor_method= \"svr_regression\")\n", - "# test_contentbased_class(feature_method= \"movie_year\", regressor_method= \"gradient_boosting\")\n", - "# test_contentbased_class(feature_method= \"movie_year\", regressor_method= \"random_forest\")\n", - "# print(\"\\n\")\n", - "# print(\"relevance : \") \n", - "# test_contentbased_class(feature_method= \"relevance\", regressor_method= \"random_score\")\n", - "# test_contentbased_class(feature_method= \"relevance\", regressor_method= \"random_sample\")\n", - "# test_contentbased_class(feature_method= \"relevance\", regressor_method= \"linear_regression\")\n", - "# test_contentbased_class(feature_method= \"relevance\", regressor_method= \"svr_regression\")\n", - "# test_contentbased_class(feature_method= \"relevance\", regressor_method= \"gradient_boosting\")\n", - "# test_contentbased_class(feature_method= \"relevance\", regressor_method= \"random_forest\")\n", - "# print(\"\\n\")\n", - "# print(\"genres : \") \n", - "# test_contentbased_class(feature_method= \"genres\", regressor_method= \"random_score\")\n", - "# test_contentbased_class(feature_method= \"genres\", regressor_method= \"random_sample\")\n", - "# test_contentbased_class(feature_method= \"genres\", regressor_method= \"linear_regression\")\n", - "# test_contentbased_class(feature_method= \"genres\", regressor_method= \"svr_regression\")\n", - "# test_contentbased_class(feature_method= \"genres\", regressor_method= \"gradient_boosting\")\n", - "# test_contentbased_class(feature_method= \"genres\", regressor_method= \"random_forest\")\n", - "# print(\"\\n\")\n", - "# print(\"rating : \")\n", - "# test_contentbased_class(feature_method= \"rating\", regressor_method=\"random_score\")\n", - "# test_contentbased_class(feature_method= \"rating\", regressor_method=\"random_sample\")\n", - "# # test_contentbased_class(feature_method= \"rating\", regressor_method=\"linear_regression\")\n", - "# #test_contentbased_class(feature_method=\"rating\", regressor_method=\"svr_regression\")\n", - "# #test_contentbased_class(feature_method=\"rating\", regressor_method=\"gradient_boosting\")\n", - "# #test_contentbased_class(feature_method=\"rating\", regressor_method=\"random_forest\")\n", - "# print(\"\\n\")\n", - "# print(\"tags : \")\n", - "# test_contentbased_class(feature_method=\"tags\", regressor_method=\"random_score\")\n", - "# test_contentbased_class(feature_method=\"tags\", regressor_method=\"random_sample\")\n", - "# #test_contentbased_class(feature_method=\"tags\", regressor_method=\"linear_regression\")\n", - "# # test_contentbased_class(feature_method=\"tags\", regressor_method=\"svr_regression\")\n", - "# # test_contentbased_class(feature_method=\"tags\", regressor_method=\"gradient_boosting\")\n", - "# # test_contentbased_class(feature_method=\"tags\", regressor_method=\"random_forest\")\n", - "# print(\"\\n\")\n", - "# print(\"tags_length : \")\n", - "# test_contentbased_class(feature_method=\"tags_length\", regressor_method=\"random_score\")\n", - "# test_contentbased_class(feature_method=\"tags_length\", regressor_method=\"random_sample\")\n", - "# test_contentbased_class(feature_method=\"tags_length\", regressor_method=\"linear_regression\")\n", - "# test_contentbased_class(feature_method=\"tags_length\", regressor_method=\"svr_regression\")\n", - "# test_contentbased_class(feature_method=\"tags_length\", regressor_method=\"gradient_boosting\")\n", - "# test_contentbased_class(feature_method=\"tags_length\", regressor_method=\"random_forest\")\n", - "\n", - "# print(\"\\n\")\n", - "# print(\"combination : \")\n", - "# test_contentbased_class(feature_method=\"combination\", regressor_method=\"random_score\")\n", - "# test_contentbased_class(feature_method=\"combination\", regressor_method=\"random_sample\")\n", - "# test_contentbased_class(feature_method=\"combination\", regressor_method=\"linear_regression\")\n", - "# test_contentbased_class(feature_method=\"combination\", regressor_method=\"svr_regression\")\n", - "# test_contentbased_class(feature_method=\"combination\", regressor_method=\"gradient_boosting\")\n", - "# test_contentbased_class(feature_method=\"combination\", regressor_method=\"random_forest\")\n" + "test_contentbased_class([\"title_length\", \"movie_year\",\"genre\",\"avg_rating\"], \"ridge_regression\")" ] } ], @@ -998,7 +344,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.2" + "version": "3.12.0" } }, "nbformat": 4, -- GitLab