diff --git a/constants.py b/constants.py index 8c11f22c0178095ff190a9f182258ff415d61f4f..e6125570df096d8edd74e9f50c99041779d73b03 100644 --- a/constants.py +++ b/constants.py @@ -16,6 +16,10 @@ class Constant: LABEL_COL = 'title' # Column name for item labels GENRES_COL = 'genres' # Column name for item genres + TAGS_FILENAME = "tags.csv" + TAG = 'tag' + + # Evidence EVIDENCE_PATH = DATA_PATH / 'evidence' # Path to evidence data # - ratings diff --git a/content_based.ipynb b/content_based.ipynb index f62bfd8653c5ac072f750fa1eb0f5bafb4f25338..979171c4bc9883f8e428d75a6e30521bb4a32ee6 100644 --- a/content_based.ipynb +++ b/content_based.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 41, "id": "277473a3", "metadata": {}, "outputs": [ @@ -50,7 +50,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 42, "id": "e8378976", "metadata": {}, "outputs": [ @@ -84,24 +84,24 @@ " </thead>\n", " <tbody>\n", " <tr>\n", - " <th>3</th>\n", - " <td>23</td>\n", + " <th>4993</th>\n", + " <td>57</td>\n", " </tr>\n", " <tr>\n", - " <th>15</th>\n", - " <td>23</td>\n", + " <th>5952</th>\n", + " <td>45</td>\n", " </tr>\n", " <tr>\n", - " <th>34</th>\n", - " <td>11</td>\n", + " <th>527</th>\n", + " <td>23</td>\n", " </tr>\n", " <tr>\n", - " <th>59</th>\n", - " <td>44</td>\n", + " <th>2028</th>\n", + " <td>26</td>\n", " </tr>\n", " <tr>\n", - " <th>64</th>\n", - " <td>20</td>\n", + " <th>4308</th>\n", + " <td>19</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", @@ -110,11 +110,25 @@ "text/plain": [ " n_character_title\n", "movieId \n", - "3 23\n", - "15 23\n", - "34 11\n", - "59 44\n", - "64 20" + "4993 57\n", + "5952 45\n", + "527 23\n", + "2028 26\n", + "4308 19" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "0 long\n", + "1 boring\n", + "2 long\n", + "3 romance\n", + "4 stupidity\n", + "Name: tag, dtype: object" ] }, "metadata": {}, @@ -129,6 +143,10 @@ "df_features = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('n_character_title')\n", "display(df_features.head())\n", "\n", + "df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME)\n", + "df_features = df_tag[C.TAG]\n", + "display(df_features.head())\n", + "\n", "# (explore here other features)\n" ] }, @@ -143,7 +161,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 43, "id": "16b0a602", "metadata": {}, "outputs": [], @@ -161,6 +179,28 @@ " df_features = None\n", " elif features_method == \"title_length\": # a naive method that creates only 1 feature based on title length\n", " df_features = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('n_character_title')\n", + "\n", + " elif features_method == \"movie_year\" :\n", + " df_features = df_items['movie_year'] = df_items['title'].str.extract(r'\\((\\d{4})\\)', expand=False)\n", + "\n", + " elif features_method == \"genres\" :\n", + " genres_list = df_items['genres'].str.split('|').explode().unique()\n", + " for genre in genres_list:\n", + " df_features = df_items['genres'].str.contains(genre).astype(int)\n", + "\n", + " elif features_method == \"rating\" :\n", + " df_features = df_ratings.groupby('movieId')['rating'].transform('mean').to_frame('avg_rating')\n", + "\n", + " elif features_method == \"tags\" :\n", + " df_features = df_tag['tag'].apply(lambda x: len(x.split(',')))\n", + "\n", + " elif features_method == \"tags_length\" :\n", + " df_features = df_tag['tag'].apply(lambda x: sum(len(tag) for tag in x.split(',')))\n", + "\n", + " elif features_method == \"timestamp\" :\n", + " df_features = df_ratings['timestamp_sin'] = np.sin(2 * np.pi * df_ratings['timestamp'] / 86400)\n", + " df_features = df_ratings['timestamp_cos'] = np.cos(2 * np.pi * df_ratings['timestamp'] / 86400)\n", + "\n", " else: # (implement other feature creations here)\n", " raise NotImplementedError(f'Feature method {features_method} not yet implemented')\n", " return df_features\n", @@ -249,7 +289,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 44, "id": "69d12f7d", "metadata": {}, "outputs": [ @@ -257,8 +297,40 @@ "name": "stdout", "output_type": "stream", "text": [ - "user: 15 item: 942 r_ui = None est = 3.79 {'was_impossible': False}\n", - "user: 15 item: 942 r_ui = None est = 4.00 {'was_impossible': False}\n" + "user: 11 item: 1214 r_ui = None est = 0.86 {'was_impossible': False}\n", + "user: 11 item: 1214 r_ui = None est = 1.00 {'was_impossible': False}\n", + "user: 11 item: 1214 r_ui = None est = 4.42 {'was_impossible': False}\n", + "user: 11 item: 1214 r_ui = None est = 3.00 {'was_impossible': False}\n", + "user: 11 item: 1214 r_ui = None est = 4.53 {'was_impossible': False}\n", + "user: 11 item: 1214 r_ui = None est = 3.00 {'was_impossible': False}\n", + "user: 11 item: 1214 r_ui = None est = 0.72 {'was_impossible': False}\n", + "user: 11 item: 1214 r_ui = None est = 4.00 {'was_impossible': False}\n", + "user: 11 item: 1214 r_ui = None est = 3.33 {'was_impossible': False}\n", + "user: 11 item: 1214 r_ui = None est = 3.00 {'was_impossible': False}\n" + ] + }, + { + "ename": "KeyError", + "evalue": "'timestamp'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/core/indexes/base.py:3791\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3790\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 3791\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcasted_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3792\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n", + "File \u001b[0;32mindex.pyx:152\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mindex.pyx:181\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7080\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7088\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", + "\u001b[0;31mKeyError\u001b[0m: 'timestamp'", + "\nThe above exception was the direct cause of the following exception:\n", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[44], line 33\u001b[0m\n\u001b[1;32m 30\u001b[0m test_contentbased_class(feature_method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtags_length\u001b[39m\u001b[38;5;124m\"\u001b[39m, regressor_method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrandom_score\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 31\u001b[0m test_contentbased_class(feature_method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtags_length\u001b[39m\u001b[38;5;124m\"\u001b[39m, regressor_method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrandom_sample\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 33\u001b[0m \u001b[43mtest_contentbased_class\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfeature_method\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mtimestamp\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mregressor_method\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mrandom_score\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 34\u001b[0m test_contentbased_class(feature_method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtimestamp\u001b[39m\u001b[38;5;124m\"\u001b[39m, regressor_method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrandom_sample\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "Cell \u001b[0;32mIn[44], line 7\u001b[0m, in \u001b[0;36mtest_contentbased_class\u001b[0;34m(feature_method, regressor_method)\u001b[0m\n\u001b[1;32m 5\u001b[0m sp_ratings \u001b[38;5;241m=\u001b[39m load_ratings(surprise_format\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 6\u001b[0m train_set \u001b[38;5;241m=\u001b[39m sp_ratings\u001b[38;5;241m.\u001b[39mbuild_full_trainset()\n\u001b[0;32m----> 7\u001b[0m content_algo \u001b[38;5;241m=\u001b[39m \u001b[43mContentBased\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfeature_method\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mregressor_method\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 8\u001b[0m content_algo\u001b[38;5;241m.\u001b[39mfit(train_set)\n\u001b[1;32m 9\u001b[0m anti_test_set_first \u001b[38;5;241m=\u001b[39m train_set\u001b[38;5;241m.\u001b[39mbuild_anti_testset()[\u001b[38;5;241m0\u001b[39m]\n", + "Cell \u001b[0;32mIn[43], line 5\u001b[0m, in \u001b[0;36mContentBased.__init__\u001b[0;34m(self, features_method, regressor_method)\u001b[0m\n\u001b[1;32m 3\u001b[0m AlgoBase\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m)\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mregressor_method \u001b[38;5;241m=\u001b[39m regressor_method\n\u001b[0;32m----> 5\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontent_features \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcreate_content_features\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfeatures_method\u001b[49m\u001b[43m)\u001b[49m\n", + "Cell \u001b[0;32mIn[43], line 33\u001b[0m, in \u001b[0;36mContentBased.create_content_features\u001b[0;34m(self, features_method)\u001b[0m\n\u001b[1;32m 30\u001b[0m df_features \u001b[38;5;241m=\u001b[39m df_tag[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtag\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mapply(\u001b[38;5;28;01mlambda\u001b[39;00m x: \u001b[38;5;28msum\u001b[39m(\u001b[38;5;28mlen\u001b[39m(tag) \u001b[38;5;28;01mfor\u001b[39;00m tag \u001b[38;5;129;01min\u001b[39;00m x\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m,\u001b[39m\u001b[38;5;124m'\u001b[39m)))\n\u001b[1;32m 32\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m features_method \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtimestamp\u001b[39m\u001b[38;5;124m\"\u001b[39m :\n\u001b[0;32m---> 33\u001b[0m df_features \u001b[38;5;241m=\u001b[39m df_items[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtimestamp_sin\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39msin(\u001b[38;5;241m2\u001b[39m \u001b[38;5;241m*\u001b[39m np\u001b[38;5;241m.\u001b[39mpi \u001b[38;5;241m*\u001b[39m \u001b[43mdf_items\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtimestamp\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m \u001b[38;5;241m/\u001b[39m \u001b[38;5;241m86400\u001b[39m)\n\u001b[1;32m 34\u001b[0m df_features \u001b[38;5;241m=\u001b[39m df_items[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtimestamp_cos\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mcos(\u001b[38;5;241m2\u001b[39m \u001b[38;5;241m*\u001b[39m np\u001b[38;5;241m.\u001b[39mpi \u001b[38;5;241m*\u001b[39m df_items[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtimestamp\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m/\u001b[39m \u001b[38;5;241m86400\u001b[39m)\n\u001b[1;32m 36\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m: \u001b[38;5;66;03m# (implement other feature creations here)\u001b[39;00m\n", + "File \u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/core/frame.py:3893\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3891\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mnlevels \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 3892\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_getitem_multilevel(key)\n\u001b[0;32m-> 3893\u001b[0m indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3894\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_integer(indexer):\n\u001b[1;32m 3895\u001b[0m indexer \u001b[38;5;241m=\u001b[39m [indexer]\n", + "File \u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/core/indexes/base.py:3798\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3793\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(casted_key, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m (\n\u001b[1;32m 3794\u001b[0m \u001b[38;5;28misinstance\u001b[39m(casted_key, abc\u001b[38;5;241m.\u001b[39mIterable)\n\u001b[1;32m 3795\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28many\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(x, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m casted_key)\n\u001b[1;32m 3796\u001b[0m ):\n\u001b[1;32m 3797\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m InvalidIndexError(key)\n\u001b[0;32m-> 3798\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n\u001b[1;32m 3799\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m 3800\u001b[0m \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[1;32m 3801\u001b[0m \u001b[38;5;66;03m# InvalidIndexError. Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[1;32m 3802\u001b[0m \u001b[38;5;66;03m# the TypeError.\u001b[39;00m\n\u001b[1;32m 3803\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_indexing_error(key)\n", + "\u001b[0;31mKeyError\u001b[0m: 'timestamp'" ] } ], @@ -278,7 +350,25 @@ "# (call here the test functions with different regressor methods)\n", "\n", "test_contentbased_class(feature_method = \"title_length\" , regressor_method = \"random_score\")\n", - "test_contentbased_class(feature_method = \"title_length\" , regressor_method = \"random_sample\")" + "test_contentbased_class(feature_method = \"title_length\" , regressor_method = \"random_sample\")\n", + "\n", + "test_contentbased_class(feature_method=\"movie_year\", regressor_method=\"random_score\")\n", + "test_contentbased_class(feature_method=\"movie_year\", regressor_method=\"random_sample\")\n", + "\n", + "test_contentbased_class(feature_method=\"genres\", regressor_method=\"random_score\")\n", + "test_contentbased_class(feature_method=\"genres\", regressor_method=\"random_sample\")\n", + "\n", + "test_contentbased_class(feature_method=\"rating\", regressor_method=\"random_score\")\n", + "test_contentbased_class(feature_method=\"rating\", regressor_method=\"random_sample\")\n", + "\n", + "test_contentbased_class(feature_method=\"tags\", regressor_method=\"random_score\")\n", + "test_contentbased_class(feature_method=\"tags\", regressor_method=\"random_sample\")\n", + "\n", + "test_contentbased_class(feature_method=\"tags_length\", regressor_method=\"random_score\")\n", + "test_contentbased_class(feature_method=\"tags_length\", regressor_method=\"random_sample\")\n", + "\n", + "test_contentbased_class(feature_method=\"timestamp\", regressor_method=\"random_score\")\n", + "test_contentbased_class(feature_method=\"timestamp\", regressor_method=\"random_sample\")" ] } ],