Commit c9d719a3 authored by Adrien Payen

update content based

parent 8cdb6fca
%% Cell type:markdown id:82d5ca82 tags:
# Packages
%% Cell type:code id:277473a3 tags:
``` python
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import random as rd
from surprise import AlgoBase
from surprise.prediction_algorithms.predictions import PredictionImpossible
from loaders import load_ratings
from loaders import load_items
from constants import Constant as C
from sklearn.base import clone  # used below so each user gets an independent regressor instance
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import SVR
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
```
%% Output
The autoreload extension is already loaded. To reload it, use:
%reload_ext autoreload
%% Cell type:markdown id:a42c16bf tags:
# Explore and select content features
%% Cell type:code id:e8378976 tags:
``` python
# All the dataframes
df_items = load_items()
df_ratings = load_ratings()
df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME)
# df_genome_score = pd.read_csv("data/hackathon/content/genome-scores.csv")
# df_genome_tag = pd.read_csv("data/hackathon/content/genome-tags.csv")

# Example 1: create a title_length feature
df_features = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('n_character_title')
display(df_features.head())

df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME)
df_features = df_tag[C.TAG]
display(df_features.head())

# (explore other features here; see the tag-count sketch after the output below)
```
%% Output
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
Cell In[16], line 2
1 # All the dataframes
----> 2 df_items = load_items()
3 df_ratings = load_ratings()
4 df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME)
File ~/Desktop/Université/Recommender Systems/recomsys/loaders.py:34, in load_items()
28 def load_items():
29 """Loads items data.
30
31 Returns:
32 DataFrame: Items data.
33 """
---> 34 df_items = pd.read_csv(C.CONTENT_PATH / C.ITEMS_FILENAME) # ce qui se trouve dans le movie csv
35 df_items = df_items.set_index(C.ITEM_ID_COL) # movie id
36 return df_items
File ~/.pyenv/versions/3.12.0/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1026, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)
1013 kwds_defaults = _refine_defaults_read(
1014 dialect,
1015 delimiter,
(...)
1022 dtype_backend=dtype_backend,
1023 )
1024 kwds.update(kwds_defaults)
-> 1026 return _read(filepath_or_buffer, kwds)
File ~/.pyenv/versions/3.12.0/lib/python3.12/site-packages/pandas/io/parsers/readers.py:620, in _read(filepath_or_buffer, kwds)
617 _validate_names(kwds.get("names", None))
619 # Create the parser.
--> 620 parser = TextFileReader(filepath_or_buffer, **kwds)
622 if chunksize or iterator:
623 return parser
File ~/.pyenv/versions/3.12.0/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1620, in TextFileReader.__init__(self, f, engine, **kwds)
1617 self.options["has_index_names"] = kwds["has_index_names"]
1619 self.handles: IOHandles | None = None
-> 1620 self._engine = self._make_engine(f, self.engine)
File ~/.pyenv/versions/3.12.0/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1880, in TextFileReader._make_engine(self, f, engine)
1878 if "b" not in mode:
1879 mode += "b"
-> 1880 self.handles = get_handle(
1881 f,
1882 mode,
1883 encoding=self.options.get("encoding", None),
1884 compression=self.options.get("compression", None),
1885 memory_map=self.options.get("memory_map", False),
1886 is_text=is_text,
1887 errors=self.options.get("encoding_errors", "strict"),
1888 storage_options=self.options.get("storage_options", None),
1889 )
1890 assert self.handles is not None
1891 f = self.handles.handle
File ~/.pyenv/versions/3.12.0/lib/python3.12/site-packages/pandas/io/common.py:873, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
868 elif isinstance(handle, str):
869 # Check whether the filename is to be opened in binary mode.
870 # Binary mode does not support 'encoding' and 'newline'.
871 if ioargs.encoding and "b" not in ioargs.mode:
872 # Encoding
--> 873 handle = open(
874 handle,
875 ioargs.mode,
876 encoding=ioargs.encoding,
877 errors=errors,
878 newline="",
879 )
880 else:
881 # Binary mode
882 handle = open(handle, ioargs.mode)
FileNotFoundError: [Errno 2] No such file or directory: 'data/test/content/movies.csv'
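%% Cell type:markdown tags:
The cell above leaves room for exploring other candidate features. As a minimal sketch (an addition, assuming a MovieLens-style `movieId` column in `df_tag` and the `df_items` index loaded above), one further feature could be the number of tags attached to each movie, aligned with the item index so it can be concatenated with the other feature columns:
%% Cell type:code tags:
``` python
# Hypothetical exploration sketch (not part of the original notebook):
# count how many tags each movie received and align the result with df_items.
df_tag_count = (
    df_tag
    .groupby('movieId')[C.TAG]
    .count()
    .rename('n_tags')
)
df_features = df_tag_count.reindex(df_items.index, fill_value=0).to_frame()
display(df_features.head())
```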
%% Cell type:markdown id:a2c9a2b6 tags:
# Build a content-based model
When ready, move the following class into the *models.py* script.
%% Cell type:code id:16b0a602 tags:
``` python
# ContentBased
class ContentBased(AlgoBase):
    def __init__(self, features_method, regressor_method):
        AlgoBase.__init__(self)
        self.regressor_method = regressor_method
        self.features_methods = features_method
        self.content_features = self.create_content_features(features_method)
        self.user_profile = {}
        self.user_profile_explain = {}

    def create_content_features(self, features_methods):
        """Content Analyzer"""
        df_items = load_items()
        df_ratings = load_ratings()
        df_tag = pd.read_csv(C.CONTENT_PATH/C.TAGS_FILENAME)
        # Note: the genome files are loaded here but not used by the feature methods below.
        df_genome_score = pd.read_csv("data/hackathon/content/genome-scores.csv")
        df_genome_tag = pd.read_csv("data/hackathon/content/genome-tags.csv")

        df_features = pd.DataFrame(index=df_items.index)
        for method in features_methods:
            if method == "title_length":
                df_title_length = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('title_length')
                df_features = pd.concat([df_features, df_title_length], axis=1)
            elif method == "movie_year":
                df_movie_year = df_items['title'].str.extract(r'\((\d{4})\)', expand=False).to_frame('movie_year')
                df_features = pd.concat([df_features, df_movie_year.astype(float).fillna(0)], axis=1)
            elif method == "genre":
                tfidf_vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split('|'), token_pattern=None)
                tfidf_matrix = tfidf_vectorizer.fit_transform(df_items['genres'])
                df_tfidf_genres = pd.DataFrame(tfidf_matrix.toarray(), index=df_items.index, columns=tfidf_vectorizer.get_feature_names_out())
                df_features = pd.concat([df_features, df_tfidf_genres], axis=1)
            elif method == "avg_rating":
                df_avg_rating = df_ratings.groupby('movieId')['rating'].mean().to_frame('avg_rating')
                df_features = df_features.join(df_avg_rating, on='movieId')
            else:
                raise NotImplementedError(f'Feature method {method} not yet implemented')

        # Handle missing values in df_features
        df_features.fillna(0, inplace=True)
        return df_features

    def fit(self, trainset):
        """Profile Learner"""
        AlgoBase.fit(self, trainset)

        # Preallocate user profiles
        self.user_profile = {u: None for u in trainset.all_users()}
        self.user_profile_explain = {}

        epsilon = 1e-10  # Small value to prevent division by zero

        for u in trainset.all_users():
            raw_user_id = trainset.to_raw_uid(u)
            self.user_profile_explain[raw_user_id] = {}

            user_ratings = np.array([rating for (_, rating) in trainset.ur[u]])
            item_ids = [iid for (iid, _) in trainset.ur[u]]
            raw_item_ids = [trainset.to_raw_iid(iid) for iid in item_ids]

            feature_values = self.content_features.loc[raw_item_ids].values
            norms = np.linalg.norm(feature_values, axis=0) + epsilon
            weighted_features = feature_values / norms
            feature_importance = weighted_features.T @ user_ratings
            feature_importance /= np.sum(user_ratings)

            self.user_profile_explain[raw_user_id] = dict(zip(self.content_features.columns, feature_importance))

        if self.regressor_method == 'random_score':
            for u in self.user_profile:
                self.user_profile[u] = rd.uniform(0.5, 5)
        elif self.regressor_method == 'random_sample':
            for u in self.user_profile:
                self.user_profile[u] = [rating for (_, rating) in trainset.ur[u]]
        else:
            regressor_models = {
                'linear_regression': LinearRegression(fit_intercept=False),
                'svr_regression': SVR(kernel='rbf', C=10, epsilon=0.2),
                'gradient_boosting': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3),
                'random_forest': RandomForestRegressor(n_estimators=100),
                'lasso_regression': Lasso(alpha=0.1),
                'ridge_regression': Ridge(alpha=1.0),
                'elastic_net': ElasticNet(alpha=1.0, l1_ratio=0.5),
                'knn_regression': KNeighborsRegressor(n_neighbors=1),
                'decision_tree': DecisionTreeRegressor(max_depth=5),
                'adaboost': AdaBoostRegressor(n_estimators=50),
                'xgboost': XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3),
                'lightgbm': LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)
            }

            if self.regressor_method not in regressor_models:
                raise NotImplementedError(f'Regressor method {self.regressor_method} not yet implemented')

            for u in self.user_profile:
                user_ratings = [rating for (_, rating) in trainset.ur[u]]
                item_ids = [iid for (iid, _) in trainset.ur[u]]
                raw_item_ids = [trainset.to_raw_iid(iid) for iid in item_ids]

                df_user = pd.DataFrame({'item_id': raw_item_ids, 'user_ratings': user_ratings})
                df_user = df_user.merge(self.content_features, left_on="item_id", right_index=True, how='left')

                X = df_user.drop(columns=['item_id', 'user_ratings'])
                y = df_user['user_ratings']

                # Clone so that each user gets an independently fitted regressor
                # (reusing the same instance would leave every profile pointing to the last fit).
                regressor = clone(regressor_models[self.regressor_method])
                regressor.fit(X, y)

                self.user_profile[u] = regressor

    def estimate(self, u, i):
        """Scoring component used for item filtering"""
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unknown.')

        if self.regressor_method == 'random_score':
            return rd.uniform(0.5, 5)
        elif self.regressor_method == 'random_sample':
            return rd.choice(self.user_profile[u])
        else:
            raw_item_id = self.trainset.to_raw_iid(i)
            item_features = self.content_features.loc[raw_item_id, :].values.reshape(1, -1)
            regressor = self.user_profile[u]
            item_features_df = pd.DataFrame(item_features, columns=self.content_features.columns)
            return regressor.predict(item_features_df)[0]

    def explain(self, u):
        """Return the feature-importance profile of a raw user id, or None if unknown."""
        if u in self.user_profile_explain:
            return self.user_profile_explain[u]
        else:
            return None


# Example usage:
cb = ContentBased(["title_length", "movie_year", "genre", "avg_rating"], "ridge_regression")
surprise_data = load_ratings(surprise_format=True)
trainset = surprise_data.build_full_trainset()
testset = trainset.build_anti_testset()
cb.fit(trainset)

# print("RMSE: ", cb.rmse(testset))

# Example explanations for users:
print(cb.explain(11))
print(cb.explain(13))
print(cb.explain(17))
print(cb.explain(23))
print(cb.explain(27))
print(cb.explain(73))
```
%% Output
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[3], line 147
    145 trainset = surprise_data.build_full_trainset()
    146 testset = trainset.build_anti_testset()
--> 147 cb.fit(trainset)
    150 #print("RMSE: ", cb.rmse(testset))
    151
    152
    153 #Example explanations for users:
    154 print(cb.explain(11))
Cell In[3], line 88, in ContentBased.fit(self, trainset)
     80     self.user_profile[u] = [rating for (_, rating) in trainset.ur[u]]
     82 else:
     83     regressor_models = {
     84         'linear_regression': LinearRegression(fit_intercept=False),
     85         'svr_regression': SVR(kernel='rbf', C=10, epsilon=0.2),
     86         'gradient_boosting': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3),
     87         'random_forest': RandomForestRegressor(n_estimators=100),
---> 88         'lasso_regression': Lasso(alpha=0.1),
     89         'ridge_regression': Ridge(alpha=1.0),
     90         'elastic_net': ElasticNet(alpha=1.0, l1_ratio=0.5),
     91         'knn_regression': KNeighborsRegressor(n_neighbors=1),
     92         'decision_tree': DecisionTreeRegressor(max_depth=5),
     93         'adaboost': AdaBoostRegressor(n_estimators=50),
     94         'xgboost': XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3),
     95         'lightgbm': LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)
     96     }
     98 if self.regressor_method not in regressor_models:
     99     raise NotImplementedError(f'Regressor method {self.regressor_method} not yet implemented')
NameError: name 'Lasso' is not defined
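%% Cell type:markdown tags:
The commented-out `cb.rmse(testset)` call above would not run as written: Surprise's `AlgoBase` does not provide an `rmse` method, and the anti-test-set carries no true ratings to score against. A minimal evaluation sketch using Surprise's own utilities (an addition, reusing the `ContentBased` class and `load_ratings` from above) could look like this:
%% Cell type:code tags:
``` python
# Evaluation sketch (not part of the original notebook):
# hold out a rating test set, fit the model, and compute RMSE with surprise.accuracy.
from surprise import accuracy
from surprise.model_selection import train_test_split

sp_ratings = load_ratings(surprise_format=True)
eval_trainset, eval_testset = train_test_split(sp_ratings, test_size=0.25)

cb_eval = ContentBased(["title_length", "movie_year", "genre", "avg_rating"], "ridge_regression")
cb_eval.fit(eval_trainset)
predictions = cb_eval.test(eval_testset)
accuracy.rmse(predictions)
```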
%% Cell type:markdown id:ffd75b7e tags:
The following script tests the ContentBased class.
%% Cell type:code id:69d12f7d tags:
``` python
def test_contentbased_class(feature_method, regressor_method):
    """Test the ContentBased class.

    Tries to make a prediction on the first (user, item) tuple of the anti_test_set.
    """
    sp_ratings = load_ratings(surprise_format=True)
    train_set = sp_ratings.build_full_trainset()
    content_algo = ContentBased(feature_method, regressor_method)
    content_algo.fit(train_set)
    anti_test_set_first = train_set.build_anti_testset()[0]
    prediction = content_algo.predict(anti_test_set_first[0], anti_test_set_first[1])
    print(prediction)


test_contentbased_class(["title_length", "movie_year", "genre", "avg_rating"], "ridge_regression")
```
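%% Cell type:markdown tags:
The same class can also be compared across several `regressor_method` choices with Surprise's cross-validation helper. A short sketch (an addition, assuming the regressor names registered in `fit` above):
%% Cell type:code tags:
``` python
# Comparison sketch (not part of the original notebook):
# cross-validate a few regressor choices and report their mean RMSE.
from surprise.model_selection import cross_validate

features = ["title_length", "movie_year", "genre", "avg_rating"]
sp_ratings = load_ratings(surprise_format=True)

for regressor in ["linear_regression", "ridge_regression", "random_forest"]:
    algo = ContentBased(features, regressor)
    results = cross_validate(algo, sp_ratings, measures=["RMSE", "MAE"], cv=3, verbose=False)
    print(regressor, results["test_rmse"].mean())
```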