diff --git a/content_based copy.ipynb b/content_based copy.ipynb
deleted file mode 100644
index fffea99ab7e327cbe56f9b267cd486ef0a2fd8c1..0000000000000000000000000000000000000000
--- a/content_based copy.ipynb
+++ /dev/null
@@ -1,427 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "82d5ca82",
- "metadata": {},
- "source": [
- "# Packages"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "id": "277473a3",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "The autoreload extension is already loaded. To reload it, use:\n",
- " %reload_ext autoreload\n"
- ]
- }
- ],
- "source": [
- "%load_ext autoreload\n",
- "%autoreload 2\n",
- "\n",
- "\n",
- "# third parties imports\n",
- "import pandas as pd\n",
- "import numpy as np\n",
- "import random as rd\n",
- "from surprise import AlgoBase, SVD\n",
- "from surprise import PredictionImpossible\n",
- "\n",
- "# import local\n",
- "from sklearn.feature_extraction.text import TfidfVectorizer\n",
- "from loaders import load_items, load_ratings\n",
- "from constants import Constant as C\n",
- "from sklearn.linear_model import LinearRegression\n",
- "from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor\n",
- "from sklearn.svm import SVR\n",
- "\n",
- "from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet\n",
- "from sklearn.svm import SVR\n",
- "from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, AdaBoostRegressor\n",
- "from sklearn.tree import DecisionTreeRegressor\n",
- "from sklearn.neighbors import KNeighborsRegressor\n",
- "from xgboost import XGBRegressor\n",
- "from lightgbm import LGBMRegressor\n",
- "\n"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "a42c16bf",
- "metadata": {},
- "source": [
- "# Explore and select content features"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "id": "e8378976",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>n_character_title</th>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>movieId</th>\n",
- " <th></th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>1</th>\n",
- " <td>16</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <td>14</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>3</th>\n",
- " <td>23</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <td>24</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>5</th>\n",
- " <td>34</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " n_character_title\n",
- "movieId \n",
- "1 16\n",
- "2 14\n",
- "3 23\n",
- "4 24\n",
- "5 34"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/plain": [
- "0 sandra 'boring' bullock\n",
- "1 dentist\n",
- "2 Cambodia\n",
- "3 Russian\n",
- "4 forgettable\n",
- "Name: tag, dtype: object"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "# All the dataframes\n",
- "df_items = load_items()\n",
- "df_ratings = load_ratings()\n",
- "df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME)\n",
- "#df_genome_score = pd.read_csv(\"data/hackathon/content/genome-scores.csv\")\n",
- "# df_genome_tag = pd.read_csv(\"data/hackathon/content/genome-tags.csv\")\n",
- "\n",
- "\n",
- "# Example 1 : create title_length features\n",
- "df_features = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('n_character_title')\n",
- "display(df_features.head())\n",
- "\n",
- "df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME)\n",
- "df_features = df_tag[C.TAG]\n",
- "display(df_features.head())\n",
- "\n",
- "# (explore here other features)\n"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "a2c9a2b6",
- "metadata": {},
- "source": [
- "# Build a content-based model\n",
- "When ready, move the following class in the *models.py* script"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "id": "16b0a602",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "{'title_length': 0.1497645139703848, 'movie_year': 0.16218667420100635, '(no genres listed)': 0.0, 'action': 0.09449072815753193, 'adventure': 0.08778978776313201, 'animation': 0.0, 'children': 0.038431411145366176, 'comedy': 0.07268129109348041, 'crime': 0.09469516433772891, 'documentary': 0.0611428358670058, 'drama': 0.10494783392380302, 'fantasy': 0.025806451608591505, 'film-noir': 0.025806451609512046, 'horror': 0.018342712153336858, 'imax': 0.06947533670577526, 'musical': 0.0, 'mystery': 0.06234903350217154, 'romance': 0.036771716124540825, 'sci-fi': 0.059571001735546115, 'thriller': 0.0993122803165238, 'war': 0.04002978709072218, 'western': 0.04547648227079719, 'avg_rating': 0.16263357553020436}\n",
- "{'title_length': 0.12975573389578626, 'movie_year': 0.13738555574364605, '(no genres listed)': 0.0, 'action': 0.0640388318396414, 'adventure': 0.0827515664964472, 'animation': 0.05686854568650957, 'children': 0.06799492283569505, 'comedy': 0.07354182680364503, 'crime': 0.05543740962624167, 'documentary': 0.0, 'drama': 0.09170589087803577, 'fantasy': 0.061481521263689595, 'film-noir': 0.0, 'horror': 0.015113350123518238, 'imax': 0.04592205020685974, 'musical': 0.03201459126079391, 'mystery': 0.03412706135338736, 'romance': 0.05989121250223656, 'sci-fi': 0.04370793816378273, 'thriller': 0.045800659191095036, 'war': 0.04907194751877139, 'western': 0.027287416762806844, 'avg_rating': 0.13740560847192132}\n",
- "{'title_length': 0.04702378569892371, 'movie_year': 0.052440003628289225, '(no genres listed)': 0.0, 'action': 0.020439581335728367, 'adventure': 0.015593308332521032, 'animation': 0.004256286923052558, 'children': 0.003520723090188317, 'comedy': 0.018972762464944913, 'crime': 0.028340544273099223, 'documentary': 0.005823989517206729, 'drama': 0.037415345194166824, 'fantasy': 0.013643903080149476, 'film-noir': 0.015390183296279798, 'horror': 0.01926898253629829, 'imax': 0.0014716703456143566, 'musical': 0.0061519348279224124, 'mystery': 0.02847033164163413, 'romance': 0.019827342468818163, 'sci-fi': 0.022573488552024915, 'thriller': 0.03522231545147593, 'war': 0.010339617301415098, 'western': 0.005663885036293055, 'avg_rating': 0.05327750989412312}\n",
- "{'title_length': 0.033402138126294736, 'movie_year': 0.03710065977291947, '(no genres listed)': 0.0, 'action': 0.014528522669579273, 'adventure': 0.013963913494241694, 'animation': 0.005764814103226412, 'children': 0.006513197483932152, 'comedy': 0.017763201411495646, 'crime': 0.016002513666599556, 'documentary': 0.004292962983778595, 'drama': 0.027458210593047847, 'fantasy': 0.009302633945770895, 'film-noir': 0.006823368830454359, 'horror': 0.007391689869010394, 'imax': 0.004855154663168369, 'musical': 0.0058909467772061425, 'mystery': 0.012191560732760487, 'romance': 0.01723631022081761, 'sci-fi': 0.010817269433255231, 'thriller': 0.01658593988724716, 'war': 0.010193212979882352, 'western': 0.0052038255339472966, 'avg_rating': 0.03742403427834079}\n",
- "{'title_length': 0.20154225634108316, 'movie_year': 0.20848962267389695, '(no genres listed)': 0.0, 'action': 0.04545454544645529, 'adventure': 0.04545454544730129, 'animation': 0.0, 'children': 0.0, 'comedy': 0.07177284969293253, 'crime': 0.1145252645738102, 'documentary': 0.0, 'drama': 0.16778172557550536, 'fantasy': 0.0, 'film-noir': 0.0, 'horror': 0.06315936177961773, 'imax': 0.0, 'musical': 0.0, 'mystery': 0.08510520557533159, 'romance': 0.09754755529442835, 'sci-fi': 0.045454545449454146, 'thriller': 0.12542163704872258, 'war': 0.08035304331050673, 'western': 0.0, 'avg_rating': 0.21152969571139305}\n",
- "{'title_length': 0.021927486954368552, 'movie_year': 0.02488786702116846, '(no genres listed)': 0.0007363092498113207, 'action': 0.013836432470735639, 'adventure': 0.011610617815573265, 'animation': 0.007520799115717832, 'children': 0.006287966766754299, 'comedy': 0.012951125615087338, 'crime': 0.011084119744598393, 'documentary': 0.0018287715645832062, 'drama': 0.015221252640276463, 'fantasy': 0.008631010164284143, 'film-noir': 0.0024629052522566544, 'horror': 0.008816299251739122, 'imax': 0.005347204099216887, 'musical': 0.0038827346462235236, 'mystery': 0.0068652812039576095, 'romance': 0.008086664541950757, 'sci-fi': 0.010304269379559203, 'thriller': 0.013200133984104478, 'war': 0.005127335699821772, 'western': 0.0036215200349232765, 'avg_rating': 0.025470698706944836}\n"
- ]
- }
- ],
- "source": [
- "\n",
- "# ContetnBased\n",
- "class ContentBased(AlgoBase):\n",
- " def __init__(self, features_method, regressor_method):\n",
- " AlgoBase.__init__(self)\n",
- " self.regressor_method = regressor_method\n",
- " self.features_methods = features_method\n",
- " self.content_features = self.create_content_features(features_method)\n",
- " self.user_profile = {}\n",
- " self.user_profile_explain = {}\n",
- "\n",
- " def create_content_features(self, features_methods):\n",
- " \"\"\"Content Analyzer\"\"\"\n",
- " df_items = load_items()\n",
- " df_ratings = load_ratings()\n",
- " df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME)\n",
- " df_genome_score = pd.read_csv(\"data/hackathon/content/genome-scores.csv\")\n",
- " df_genome_tag = pd.read_csv(\"data/hackathon/content/genome-tags.csv\")\n",
- "\n",
- " df_features = pd.DataFrame(index=df_items.index)\n",
- "\n",
- " for method in features_methods:\n",
- " if method == \"title_length\":\n",
- " df_title_length = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('title_length')\n",
- " df_features = pd.concat([df_features, df_title_length], axis=1)\n",
- " \n",
- " elif method == \"movie_year\":\n",
- " df_movie_year = df_items['title'].str.extract(r'\\((\\d{4})\\)', expand=False).to_frame('movie_year')\n",
- " df_features = pd.concat([df_features, df_movie_year.astype(float).fillna(0)], axis=1)\n",
- " \n",
- " elif method == \"genre\":\n",
- " tfidf_vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split('|'), token_pattern=None)\n",
- " tfidf_matrix = tfidf_vectorizer.fit_transform(df_items['genres'])\n",
- " df_tfidf_genres = pd.DataFrame(tfidf_matrix.toarray(), index=df_items.index, columns=tfidf_vectorizer.get_feature_names_out())\n",
- " df_features = pd.concat([df_features, df_tfidf_genres], axis=1)\n",
- "\n",
- " elif method == \"avg_rating\":\n",
- " df_avg_rating = df_ratings.groupby('movieId')['rating'].mean().to_frame('avg_rating')\n",
- " df_features = df_features.join(df_avg_rating, on='movieId')\n",
- "\n",
- " else:\n",
- " raise NotImplementedError(f'Feature method {method} not yet implemented')\n",
- "\n",
- " # Handle missing values in df_features\n",
- " df_features.fillna(0, inplace=True)\n",
- "\n",
- " return df_features\n",
- "\n",
- " def fit(self, trainset):\n",
- " \"\"\"Profile Learner\"\"\"\n",
- " AlgoBase.fit(self, trainset)\n",
- "\n",
- " # Preallocate user profiles\n",
- " self.user_profile = {u: None for u in trainset.all_users()}\n",
- " self.user_profile_explain = {}\n",
- "\n",
- " epsilon = 1e-10 # Small value to prevent division by zero\n",
- "\n",
- " for u in trainset.all_users():\n",
- " raw_user_id = trainset.to_raw_uid(u)\n",
- " self.user_profile_explain[raw_user_id] = {}\n",
- "\n",
- " user_ratings = np.array([rating for (_, rating) in trainset.ur[u]])\n",
- " item_ids = [iid for (iid, _) in trainset.ur[u]]\n",
- " raw_item_ids = [trainset.to_raw_iid(iid) for iid in item_ids]\n",
- "\n",
- " feature_values = self.content_features.loc[raw_item_ids].values\n",
- " norms = np.linalg.norm(feature_values, axis=0) + epsilon\n",
- " weighted_features = feature_values / norms\n",
- " feature_importance = weighted_features.T @ user_ratings\n",
- " feature_importance /= np.sum(user_ratings)\n",
- "\n",
- " self.user_profile_explain[raw_user_id] = dict(zip(self.content_features.columns, feature_importance))\n",
- "\n",
- " if self.regressor_method == 'random_score':\n",
- " for u in self.user_profile:\n",
- " self.user_profile[u] = rd.uniform(0.5, 5)\n",
- "\n",
- " elif self.regressor_method == 'random_sample':\n",
- " for u in self.user_profile:\n",
- " self.user_profile[u] = [rating for (_, rating) in trainset.ur[u]]\n",
- "\n",
- " else:\n",
- " regressor_models = {\n",
- " 'linear_regression': LinearRegression(fit_intercept=False),\n",
- " 'svr_regression': SVR(kernel='rbf', C=10, epsilon=0.2),\n",
- " 'gradient_boosting': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3),\n",
- " 'random_forest': RandomForestRegressor(n_estimators=100),\n",
- " 'lasso_regression': Lasso(alpha=0.1),\n",
- " 'ridge_regression': Ridge(alpha=1.0),\n",
- " 'elastic_net': ElasticNet(alpha=1.0, l1_ratio=0.5),\n",
- " 'knn_regression': KNeighborsRegressor(n_neighbors=1),\n",
- " 'decision_tree': DecisionTreeRegressor(max_depth=5),\n",
- " 'adaboost': AdaBoostRegressor(n_estimators=50),\n",
- " 'xgboost': XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3),\n",
- " 'lightgbm': LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)\n",
- " }\n",
- "\n",
- " if self.regressor_method not in regressor_models:\n",
- " raise NotImplementedError(f'Regressor method {self.regressor_method} not yet implemented')\n",
- "\n",
- " for u in self.user_profile:\n",
- " user_ratings = [rating for (_, rating) in trainset.ur[u]]\n",
- " item_ids = [iid for (iid, _) in trainset.ur[u]]\n",
- " raw_item_ids = [trainset.to_raw_iid(iid) for iid in item_ids]\n",
- "\n",
- " df_user = pd.DataFrame({'item_id': raw_item_ids, 'user_ratings': user_ratings})\n",
- " df_user = df_user.merge(self.content_features, left_on=\"item_id\", right_index=True, how='left')\n",
- "\n",
- " X = df_user.drop(columns=['item_id', 'user_ratings'])\n",
- " y = df_user['user_ratings']\n",
- "\n",
- " regressor = regressor_models[self.regressor_method]\n",
- " regressor.fit(X, y)\n",
- "\n",
- " self.user_profile[u] = regressor\n",
- "\n",
- " def estimate(self, u, i):\n",
- " \"\"\"Scoring component used for item filtering\"\"\"\n",
- " if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):\n",
- " raise PredictionImpossible('User and/or item is unknown.')\n",
- "\n",
- " if self.regressor_method == 'random_score':\n",
- " return rd.uniform(0.5, 5)\n",
- "\n",
- " elif self.regressor_method == 'random_sample':\n",
- " return rd.choice(self.user_profile[u])\n",
- "\n",
- " else:\n",
- " raw_item_id = self.trainset.to_raw_iid(i)\n",
- " item_features = self.content_features.loc[raw_item_id, :].values.reshape(1, -1)\n",
- " regressor = self.user_profile[u]\n",
- " item_features_df = pd.DataFrame(item_features, columns=self.content_features.columns)\n",
- " return regressor.predict(item_features_df)[0]\n",
- "\n",
- " def explain(self, u):\n",
- " if u in self.user_profile_explain:\n",
- " return self.user_profile_explain[u]\n",
- " else:\n",
- " return None\n",
- "\n",
- "\n",
- "#Example usage:\n",
- "cb = ContentBased([\"title_length\", \"movie_year\",\"genre\",\"avg_rating\"], \"ridge_regression\")\n",
- "surprise_data = load_ratings(surprise_format=True)\n",
- "trainset = surprise_data.build_full_trainset()\n",
- "testset = trainset.build_anti_testset()\n",
- "cb.fit(trainset)\n",
- "\n",
- "\n",
- "#print(\"RMSE: \", cb.rmse(testset))\n",
- "\n",
- "\n",
- "#Example explanations for users:\n",
- "print(cb.explain(11))\n",
- "\n",
- "print(cb.explain(13))\n",
- "\n",
- "print(cb.explain(17))\n",
- "\n",
- "print(cb.explain(23))\n",
- "\n",
- "print(cb.explain(27))\n",
- "\n",
- "print(cb.explain(73))\n"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "ffd75b7e",
- "metadata": {},
- "source": [
- "The following script test the ContentBased class"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "id": "69d12f7d",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "user: 1 item: 10 r_ui = None est = 0.72 {'was_impossible': False}\n"
- ]
- }
- ],
- "source": [
- "def test_contentbased_class(feature_method, regressor_method):\n",
- " \"\"\"Test the ContentBased class.\n",
- " Tries to make a prediction on the first (user,item ) tuple of the anti_test_set\n",
- " \"\"\"\n",
- " sp_ratings = load_ratings(surprise_format=True)\n",
- " train_set = sp_ratings.build_full_trainset()\n",
- " content_algo = ContentBased(feature_method, regressor_method)\n",
- " content_algo.fit(train_set)\n",
- " anti_test_set_first = train_set.build_anti_testset()[0]\n",
- " prediction = content_algo.predict(anti_test_set_first[0], anti_test_set_first[1])\n",
- " print(prediction)\n",
- "\n",
- "test_contentbased_class([\"title_length\", \"movie_year\",\"genre\",\"avg_rating\"], \"ridge_regression\")"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.12.2"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/recommender.py b/recommender.py
index e9295588fdca77379242fa409fe3d00781dd0818..b4c16f5a814393c6228366c2d91c08de8b3a816f 100644
--- a/recommender.py
+++ b/recommender.py
@@ -9,13 +9,15 @@ import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
-from scipy.stats import pearsonr
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
-from surprise.similarities import pearson
+
+from surprise.similarities import Similarity
+from surprise.prediction_algorithms.knns import KNNWithMeans
+from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import KNeighborsRegressor
@@ -153,6 +155,24 @@ class UserBased(AlgoBase):
similarity_matrix[j, i] = similarity
self.sim = similarity_matrix
+
+    def compute_cosine(self):
+        """
+        Fill self.sim with the user-user cosine similarity, computed on co-rated items only.
+        """
+        n_users = self.trainset.n_users
+        similarity_matrix = np.zeros((n_users, n_users))
+        rated = ~np.isnan(self.ratings_matrix)  # True where a rating exists (NaN marks a missing one)
+        for i in range(n_users):
+            for j in range(i + 1, n_users):
+                common = rated[i] & rated[j]
+                if np.sum(common) >= self.min_k:
+                    # sklearn's cosine_similarity raises on NaN input, so only the shared items are compared.
+                    a, b = self.ratings_matrix[i][common], self.ratings_matrix[j][common]
+                    denom = np.linalg.norm(a) * np.linalg.norm(b)  # 0 when a user has only zero/no shared ratings
+                    similarity_matrix[i, j] = similarity_matrix[j, i] = (a @ b) / denom if denom else 0.0
+
+        self.sim = similarity_matrix
def compute_mean_ratings(self):
"""
@@ -438,6 +458,16 @@ class RecommenderSystem_KNN :
average_distance = 0.0
return average_distance
+
+    def train_knn_model(self):
+        """
+        Compute a cosine similarity matrix and store it on self.model.sim (no fitting or RMSE evaluation happens here).
+        """
+        # NOTE(review): _raw2inner_id_items looks like a private raw-id -> inner-id mapping rather than a ratings matrix; confirm cosine_similarity accepts it.
+        sim_matrix = cosine_similarity(self.trainset._raw2inner_id_items)
+        self.model.sim = sim_matrix
+
+
def evaluate_knn_rmse_for_different_k(self):
"""
@@ -608,6 +638,49 @@ class OtherUserBased:
inter_user_diversity_score = np.mean(similarities)
return inter_user_diversity_score
+
+###########################################################################################################################
+####################################################### CUSTOM METRICS ####################################################
+###########################################################################################################################
+
+
+class CustomKNNWithMeans(KNNWithMeans):
+    """KNNWithMeans forced to user-based mode with sim_options['name'] = 'custom' (similarity still a placeholder)."""
+    def __init__(self, k=40, min_k=1, sim_options=None, **kwargs):
+        sim_options = {**(sim_options or {}), 'user_based': True, 'name': 'custom'}  # copy: never mutate the caller's dict
+        super().__init__(k=k, min_k=min_k, sim_options=sim_options, **kwargs)
+
+    def fit(self, trainset):
+        # Placeholder: the custom similarity computation still has to be implemented here.
+        # NOTE(review): confirm `Similarity` really exists in surprise.similarities and that super().fit() keeps self.sim.
+        self.sim = Similarity()  # Replace Similarity with your own similarity computation
+        super().fit(trainset)
+
+
+
+class CustomUserBased(UserBased):
+    """UserBased variant whose similarity is the Jaccard index of the two users' rated-item sets."""
+    def __init__(self, k=20, min_k=20, sim_options=None, **kwargs):
+        sim_options = {**(sim_options or {}), 'name': 'custom'}  # copy: never mutate the caller's dict or a shared default
+        super().__init__(k=k, min_k=min_k, sim_options=sim_options, **kwargs)
+
+    def compute_similarity_matrix(self):
+        """
+        Store in self.sim a symmetric matrix of |items rated by both| / |items rated by either|.
+        Pairs with fewer than self.min_k co-rated items keep 0; the diagonal is 1.
+        """
+        n_users = self.trainset.n_users
+        similarity_matrix = np.eye(n_users)
+        rated = ~np.isnan(self.ratings_matrix)  # True where a rating exists (NaN marks a missing one)
+
+        for i in range(n_users):
+            for j in range(i + 1, n_users):
+                intersection = np.sum(rated[i] & rated[j])  # also the support checked against min_k
+                union = np.sum(rated[i] | rated[j])
+                if intersection >= self.min_k and union > 0:  # union is 0 only if neither user rated anything
+                    similarity_matrix[i, j] = similarity_matrix[j, i] = intersection / union
+
+        self.sim = similarity_matrix
###########################################################################################################################
####################################################### COMPARISON MODEL ##################################################
###########################################################################################################################
@@ -651,6 +724,7 @@ def compare_models():
#compare_models()
+
def compare_similarity_measures(trainset,testset):
"""
Compare the similarity measures MAE and RMSE with Jaccard and MSD for KNN and UserBased models.
@@ -683,15 +757,6 @@ def compare_similarity_measures(trainset,testset):
results['KNN_Pearson_RMSE'] = rmse_pearson
results['KNN_Pearson_MAE'] = mae_pearson
- # Train and evaluate KNN model with Jaccard similarity
- sim_options_jaccard = {'name': '','user_based': True}
- user_based_jaccard = KNNWithMeans(sim_options=sim_options_jaccard)
- user_based_jaccard.fit(trainset)
- predictions_jaccard = user_based_jaccard.test(testset)
- rmse_jaccard = accuracy.rmse(predictions_jaccard)
- mae_jaccard = accuracy.mae(predictions_jaccard)
- results['KNN_Jaccard_RMSE'] = rmse_jaccard
- results['KNN_Jaccard_MAE'] = mae_jaccard
# Train and evaluate UserBased model with MSD similarity
user_based_msd = UserBased(sim_options={'name': 'msd','user_based': True})
@@ -702,15 +767,6 @@ def compare_similarity_measures(trainset,testset):
results['UserBased_MSD_RMSE'] = rmse_user_based_msd
results['UserBased_MSD_MAE'] = mae_user_based_msd
- # Train and evaluate UserBased model with Jaccard similarity
- user_based_jaccard = UserBased(sim_options={'name': 'jaccard','user_based': True})
- user_based_jaccard.fit(trainset)
- predictions_user_based_jaccard = user_based_jaccard.test(testset)
- rmse_user_based_jaccard = accuracy.rmse(predictions_user_based_jaccard)
- mae_user_based_jaccard = accuracy.mae(predictions_user_based_jaccard)
- results['UserBased_Jaccard_RMSE'] = rmse_user_based_jaccard
- results['UserBased_Jaccard_MAE'] = mae_user_based_jaccard
-
# Train and evaluate UserBased model with Pearson correlation similarity
user_based_pearson = UserBased(sim_options={'name': 'pearson'})
user_based_pearson.fit(trainset)
@@ -720,6 +776,7 @@ def compare_similarity_measures(trainset,testset):
results['UserBased_Pearson_RMSE'] = rmse_user_based_pearson
results['UserBased_Pearson_MAE'] = mae_user_based_pearson
+
# Train and evaluate OtherUserBased models
for user_name, user_id in [('Adrien', -1), ('Audrey', -2), ('Nathanael', -3), ('Charles', -4)]:
other_user_based = OtherUserBased(user_name, user_id)