diff --git a/content_based.ipynb b/content_based.ipynb index 979171c4bc9883f8e428d75a6e30521bb4a32ee6..7c12361204ffd09164375fb170cd551108919f56 100644 --- a/content_based.ipynb +++ b/content_based.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 86, "id": "277473a3", "metadata": {}, "outputs": [ @@ -37,7 +37,8 @@ "from loaders import load_items\n", "from constants import Constant as C\n", "\n", - "from sklearn.linear_model import LinearRegression" + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.feature_extraction.text import TfidfVectorizer" ] }, { @@ -50,7 +51,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 82, "id": "e8378976", "metadata": {}, "outputs": [ @@ -161,7 +162,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 83, "id": "16b0a602", "metadata": {}, "outputs": [], @@ -279,6 +280,303 @@ " return score" ] }, + { + "cell_type": "code", + "execution_count": 92, + "id": "baab88b7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Matrice TF-IDF des genres :\n" + ] + }, + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>action</th>\n", + " <th>adventure</th>\n", + " <th>animation</th>\n", + " <th>children</th>\n", + " <th>comedy</th>\n", + " <th>drama</th>\n", + " <th>fantasy</th>\n", + " <th>fi</th>\n", + " <th>horror</th>\n", + " <th>imax</th>\n", + " <th>musical</th>\n", + " <th>mystery</th>\n", + " <th>romance</th>\n", + " <th>sci</th>\n", + " <th>war</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>0.000000</td>\n", + " <td>0.658454</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.752621</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>0.000000</td>\n", + " <td>0.658454</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.752621</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.572658</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.819795</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>0.694164</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.412209</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.590102</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.465343</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.582818</td>\n", + " <td>0.000000</td>\n", + " <td>0.666168</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.572658</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.819795</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.647689</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.761905</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.606043</td>\n", + " <td>0.515192</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.606043</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.563507</td>\n", + " <td>0.000000</td>\n", + " <td>0.662879</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.493002</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>0.000000</td>\n", + " <td>0.363703</td>\n", + " <td>0.415716</td>\n", + " <td>0.489026</td>\n", + " <td>0.000000</td>\n", + " <td>0.290394</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.489026</td>\n", + " <td>0.363703</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " action adventure animation children comedy drama fantasy \\\n", + "0 0.000000 0.658454 0.000000 0.000000 0.000000 0.000000 0.752621 \n", + "1 0.000000 0.658454 0.000000 0.000000 0.000000 0.000000 0.752621 \n", + "2 0.000000 0.000000 0.000000 0.000000 0.000000 0.572658 0.000000 \n", + "3 0.694164 0.000000 0.000000 0.000000 0.000000 0.412209 0.000000 \n", + "4 0.000000 0.000000 0.000000 0.000000 0.000000 0.465343 0.000000 \n", + "5 0.000000 0.000000 0.000000 0.000000 0.000000 0.572658 0.000000 \n", + "6 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", + "7 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", + "8 0.000000 0.000000 0.563507 0.000000 0.662879 0.000000 0.000000 \n", + "9 0.000000 0.363703 0.415716 0.489026 0.000000 0.290394 0.000000 \n", + "\n", + " fi horror imax musical mystery romance sci \\\n", + "0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", + "1 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", + "2 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", + "3 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", + "4 0.000000 0.000000 0.000000 0.582818 0.000000 0.666168 0.000000 \n", + "5 0.000000 0.000000 0.000000 0.000000 0.000000 0.819795 0.000000 \n", + "6 0.000000 0.647689 0.000000 0.000000 0.761905 0.000000 0.000000 \n", + "7 0.606043 0.515192 0.000000 0.000000 0.000000 0.000000 0.606043 \n", + "8 0.000000 0.000000 0.000000 0.493002 0.000000 0.000000 0.000000 \n", + "9 0.000000 0.000000 0.489026 0.363703 0.000000 0.000000 0.000000 \n", + "\n", + " war \n", + "0 0.000000 \n", + "1 0.000000 \n", + "2 0.819795 \n", + "3 0.590102 \n", + "4 0.000000 \n", + "5 0.000000 \n", + "6 0.000000 \n", + "7 0.000000 \n", + "8 0.000000 \n", + "9 0.000000 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from pprint import pprint\n", + "\n", + "# Créer une instance de TfidfVectorizer pour les genres\n", + "tfidf_vectorizer = TfidfVectorizer()\n", + "\n", + "# Fit et transform pour calculer la matrice TF-IDF des genres\n", + "tfidf_matrix = tfidf_vectorizer.fit_transform(df_items['genres'])\n", + "\n", + "# Obtenir les noms des genres (features)\n", + "genre_names = tfidf_vectorizer.get_feature_names_out()\n", + "\n", + "# Créer un DataFrame à partir de la matrice TF-IDF des genres\n", + "df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=genre_names)\n", + "\n", + "print(\"Matrice TF-IDF des genres :\")\n", + "display(df_tfidf)" + ] + }, { "cell_type": "markdown", "id": "ffd75b7e", @@ -289,7 +587,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 84, "id": "69d12f7d", "metadata": {}, "outputs": [ @@ -297,40 +595,40 @@ "name": "stdout", "output_type": "stream", "text": [ - "user: 11 item: 1214 r_ui = None est = 0.86 {'was_impossible': False}\n", - "user: 11 item: 1214 r_ui = None est = 1.00 {'was_impossible': False}\n", - "user: 11 item: 1214 r_ui = None est = 4.42 {'was_impossible': False}\n", - "user: 11 item: 1214 r_ui = None est = 3.00 {'was_impossible': False}\n", - "user: 11 item: 1214 r_ui = None est = 4.53 {'was_impossible': False}\n", + "title_length :\n", + "user: 11 item: 1214 r_ui = None est = 1.19 {'was_impossible': False}\n", + "user: 11 item: 1214 r_ui = None est = 4.50 {'was_impossible': False}\n", + "user: 11 item: 1214 r_ui = None est = 1.09 {'was_impossible': False}\n", + "\n", + "\n", + "movie_year : \n", + "user: 11 item: 1214 r_ui = None est = 1.24 {'was_impossible': False}\n", + "user: 11 item: 1214 r_ui = None est = 4.50 {'was_impossible': False}\n", + "\n", + "\n", + "genres : \n", + "user: 11 item: 1214 r_ui = None est = 4.94 {'was_impossible': False}\n", + "user: 11 item: 1214 r_ui = None est = 4.00 {'was_impossible': False}\n", + "\n", + "\n", + "rating : \n", + "user: 11 item: 1214 r_ui = None est = 1.76 {'was_impossible': False}\n", "user: 11 item: 1214 r_ui = None est = 3.00 {'was_impossible': False}\n", - "user: 11 item: 1214 r_ui = None est = 0.72 {'was_impossible': False}\n", + "\n", + "\n", + "tags : \n", + "user: 11 item: 1214 r_ui = None est = 3.67 {'was_impossible': False}\n", "user: 11 item: 1214 r_ui = None est = 4.00 {'was_impossible': False}\n", - "user: 11 item: 1214 r_ui = None est = 3.33 {'was_impossible': False}\n", - "user: 11 item: 1214 r_ui = None est = 3.00 {'was_impossible': False}\n" - ] - }, - { - "ename": "KeyError", - "evalue": "'timestamp'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "File \u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/core/indexes/base.py:3791\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3790\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 3791\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcasted_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3792\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n", - "File \u001b[0;32mindex.pyx:152\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", - "File \u001b[0;32mindex.pyx:181\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", - "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7080\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", - "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7088\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", - "\u001b[0;31mKeyError\u001b[0m: 'timestamp'", - "\nThe above exception was the direct cause of the following exception:\n", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[44], line 33\u001b[0m\n\u001b[1;32m 30\u001b[0m test_contentbased_class(feature_method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtags_length\u001b[39m\u001b[38;5;124m\"\u001b[39m, regressor_method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrandom_score\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 31\u001b[0m test_contentbased_class(feature_method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtags_length\u001b[39m\u001b[38;5;124m\"\u001b[39m, regressor_method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrandom_sample\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 33\u001b[0m \u001b[43mtest_contentbased_class\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfeature_method\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mtimestamp\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mregressor_method\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mrandom_score\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 34\u001b[0m test_contentbased_class(feature_method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtimestamp\u001b[39m\u001b[38;5;124m\"\u001b[39m, regressor_method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrandom_sample\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", - "Cell \u001b[0;32mIn[44], line 7\u001b[0m, in \u001b[0;36mtest_contentbased_class\u001b[0;34m(feature_method, regressor_method)\u001b[0m\n\u001b[1;32m 5\u001b[0m sp_ratings \u001b[38;5;241m=\u001b[39m load_ratings(surprise_format\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 6\u001b[0m train_set \u001b[38;5;241m=\u001b[39m sp_ratings\u001b[38;5;241m.\u001b[39mbuild_full_trainset()\n\u001b[0;32m----> 7\u001b[0m content_algo \u001b[38;5;241m=\u001b[39m \u001b[43mContentBased\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfeature_method\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mregressor_method\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 8\u001b[0m content_algo\u001b[38;5;241m.\u001b[39mfit(train_set)\n\u001b[1;32m 9\u001b[0m anti_test_set_first \u001b[38;5;241m=\u001b[39m train_set\u001b[38;5;241m.\u001b[39mbuild_anti_testset()[\u001b[38;5;241m0\u001b[39m]\n", - "Cell \u001b[0;32mIn[43], line 5\u001b[0m, in \u001b[0;36mContentBased.__init__\u001b[0;34m(self, features_method, regressor_method)\u001b[0m\n\u001b[1;32m 3\u001b[0m AlgoBase\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m)\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mregressor_method \u001b[38;5;241m=\u001b[39m regressor_method\n\u001b[0;32m----> 5\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontent_features \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcreate_content_features\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfeatures_method\u001b[49m\u001b[43m)\u001b[49m\n", - "Cell \u001b[0;32mIn[43], line 33\u001b[0m, in \u001b[0;36mContentBased.create_content_features\u001b[0;34m(self, features_method)\u001b[0m\n\u001b[1;32m 30\u001b[0m df_features \u001b[38;5;241m=\u001b[39m df_tag[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtag\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mapply(\u001b[38;5;28;01mlambda\u001b[39;00m x: \u001b[38;5;28msum\u001b[39m(\u001b[38;5;28mlen\u001b[39m(tag) \u001b[38;5;28;01mfor\u001b[39;00m tag \u001b[38;5;129;01min\u001b[39;00m x\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m,\u001b[39m\u001b[38;5;124m'\u001b[39m)))\n\u001b[1;32m 32\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m features_method \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtimestamp\u001b[39m\u001b[38;5;124m\"\u001b[39m :\n\u001b[0;32m---> 33\u001b[0m df_features \u001b[38;5;241m=\u001b[39m df_items[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtimestamp_sin\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39msin(\u001b[38;5;241m2\u001b[39m \u001b[38;5;241m*\u001b[39m np\u001b[38;5;241m.\u001b[39mpi \u001b[38;5;241m*\u001b[39m \u001b[43mdf_items\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtimestamp\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m \u001b[38;5;241m/\u001b[39m \u001b[38;5;241m86400\u001b[39m)\n\u001b[1;32m 34\u001b[0m df_features \u001b[38;5;241m=\u001b[39m df_items[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtimestamp_cos\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mcos(\u001b[38;5;241m2\u001b[39m \u001b[38;5;241m*\u001b[39m np\u001b[38;5;241m.\u001b[39mpi \u001b[38;5;241m*\u001b[39m df_items[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtimestamp\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m/\u001b[39m \u001b[38;5;241m86400\u001b[39m)\n\u001b[1;32m 36\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m: \u001b[38;5;66;03m# (implement other feature creations here)\u001b[39;00m\n", - "File \u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/core/frame.py:3893\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3891\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mnlevels \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 3892\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_getitem_multilevel(key)\n\u001b[0;32m-> 3893\u001b[0m indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3894\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_integer(indexer):\n\u001b[1;32m 3895\u001b[0m indexer \u001b[38;5;241m=\u001b[39m [indexer]\n", - "File \u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/core/indexes/base.py:3798\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3793\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(casted_key, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m (\n\u001b[1;32m 3794\u001b[0m \u001b[38;5;28misinstance\u001b[39m(casted_key, abc\u001b[38;5;241m.\u001b[39mIterable)\n\u001b[1;32m 3795\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28many\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(x, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m casted_key)\n\u001b[1;32m 3796\u001b[0m ):\n\u001b[1;32m 3797\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m InvalidIndexError(key)\n\u001b[0;32m-> 3798\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n\u001b[1;32m 3799\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m 3800\u001b[0m \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[1;32m 3801\u001b[0m \u001b[38;5;66;03m# InvalidIndexError. Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[1;32m 3802\u001b[0m \u001b[38;5;66;03m# the TypeError.\u001b[39;00m\n\u001b[1;32m 3803\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_indexing_error(key)\n", - "\u001b[0;31mKeyError\u001b[0m: 'timestamp'" + "\n", + "\n", + "tags_length : \n", + "user: 11 item: 1214 r_ui = None est = 1.73 {'was_impossible': False}\n", + "user: 11 item: 1214 r_ui = None est = 1.00 {'was_impossible': False}\n", + "\n", + "\n", + "timestamp : \n", + "user: 11 item: 1214 r_ui = None est = 3.65 {'was_impossible': False}\n", + "user: 11 item: 1214 r_ui = None est = 5.00 {'was_impossible': False}\n" ] } ], @@ -348,27 +646,40 @@ " print(prediction)\n", "\n", "# (call here the test functions with different regressor methods)\n", - "\n", + "print(\"title_length :\")\n", "test_contentbased_class(feature_method = \"title_length\" , regressor_method = \"random_score\")\n", "test_contentbased_class(feature_method = \"title_length\" , regressor_method = \"random_sample\")\n", - "\n", + "test_contentbased_class(feature_method = \"title_length\" , regressor_method = \"linear_regression\")\n", + "print(\"\\n\")\n", + "print(\"movie_year : \")\n", "test_contentbased_class(feature_method=\"movie_year\", regressor_method=\"random_score\")\n", "test_contentbased_class(feature_method=\"movie_year\", regressor_method=\"random_sample\")\n", - "\n", + "#test_contentbased_class(feature_method=\"movie_year\", regressor_method=\"linear_regression\")\n", + "print(\"\\n\")\n", + "print(\"genres : \")\n", "test_contentbased_class(feature_method=\"genres\", regressor_method=\"random_score\")\n", "test_contentbased_class(feature_method=\"genres\", regressor_method=\"random_sample\")\n", - "\n", + "#test_contentbased_class(feature_method=\"genres\", regressor_method=\"linear_regression\")\n", + "print(\"\\n\")\n", + "print(\"rating : \")\n", "test_contentbased_class(feature_method=\"rating\", regressor_method=\"random_score\")\n", "test_contentbased_class(feature_method=\"rating\", regressor_method=\"random_sample\")\n", - "\n", + "#test_contentbased_class(feature_method=\"rating\", regressor_method=\"linear_regression\")\n", + "print(\"\\n\")\n", + "print(\"tags : \")\n", "test_contentbased_class(feature_method=\"tags\", regressor_method=\"random_score\")\n", "test_contentbased_class(feature_method=\"tags\", regressor_method=\"random_sample\")\n", - "\n", + "#test_contentbased_class(feature_method=\"tags\", regressor_method=\"linear_regression\")\n", + "print(\"\\n\")\n", + "print(\"tags_length : \")\n", "test_contentbased_class(feature_method=\"tags_length\", regressor_method=\"random_score\")\n", "test_contentbased_class(feature_method=\"tags_length\", regressor_method=\"random_sample\")\n", - "\n", + "#test_contentbased_class(feature_method=\"tags_length\", regressor_method=\"linear_regression\")\n", + "print(\"\\n\")\n", + "print(\"timestamp : \")\n", "test_contentbased_class(feature_method=\"timestamp\", regressor_method=\"random_score\")\n", - "test_contentbased_class(feature_method=\"timestamp\", regressor_method=\"random_sample\")" + "test_contentbased_class(feature_method=\"timestamp\", regressor_method=\"random_sample\")\n", + "#test_contentbased_class(feature_method=\"timestamp\", regressor_method=\"linear_regression\")" ] } ], diff --git a/user_based.ipynb b/user_based.ipynb index d75d83b3db17809d0ebb23375bddcec2364abea0..ba819f78b0bdcaf77269b65b76bad1ab9c4ce7d5 100644 --- a/user_based.ipynb +++ b/user_based.ipynb @@ -11,7 +11,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 6, "id": "00d1b249", "metadata": {}, "outputs": [ @@ -56,20 +56,10 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 7, "id": "aafd1712", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Computing the msd similarity matrix...\n", - "Done computing similarity matrix.\n", - "user: 11 item: 364 r_ui = None est = 2.49 {'actual_k': 2, 'was_impossible': False}\n" - ] - } - ], + "outputs": [], "source": [ "# Create Surprise Dataset from the pandas DataFrame and Reader\n", "surprise_data = load_ratings(surprise_format=True)\n", @@ -91,10 +81,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "ce078b43", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Computing the msd similarity matrix...\n", + "Done computing similarity matrix.\n", + "user: 11 item: 364 r_ui = None est = 2.49 {'actual_k': 2, 'was_impossible': False}\n" + ] + } + ], "source": [ "#User-based prediction for the user 11 and the item 364\n", "\n", @@ -106,20 +106,20 @@ "\n", "\n", "# Build an algorithm, and train it.\n", - "algo = KNNWithMeans(sim_options=sim_options, k=3, min_k=2)\n", - "algo.fit(trainset)\n", - "algo.test(testset)\n", + "knn_model = KNNWithMeans(sim_options=sim_options, k=3, min_k=2)\n", + "knn_model.fit(trainset)\n", + "knn_model.test(testset)\n", "\n", "\n", "uid = 11 # raw user id (as in the ratings file). They are **strings**!\n", "iid = 364 \n", "\n", - "pred = algo.predict(uid, iid, verbose=True)" + "pred = knn_model.predict(uid, iid, verbose=True)" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 9, "id": "ffe89c56", "metadata": {}, "outputs": [ @@ -280,7 +280,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 10, "id": "cc806424", "metadata": {}, "outputs": [ @@ -431,7 +431,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 11, "id": "d03ed9eb", "metadata": {}, "outputs": [ @@ -574,7 +574,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 12, "id": "be53ae27", "metadata": {}, "outputs": [ @@ -631,7 +631,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 13, "id": "c20d8e19", "metadata": {}, "outputs": [ @@ -643,10 +643,10 @@ "Done computing similarity matrix.\n", "Computing the cosine similarity matrix...\n", "Done computing similarity matrix.\n", - "RMSE: 1.0812\n", - "RMSE: 1.0910\n", - "RMSE with MSD similarity: 1.0811758629789194\n", - "RMSE with Jaccard similarity: 1.0910225374454734\n" + "RMSE: 1.5426\n", + "RMSE: 1.5566\n", + "RMSE with MSD similarity: 1.5425652195869395\n", + "RMSE with Jaccard similarity: 1.5566203393757845\n" ] } ],