diff --git a/Home.py b/Home.py
index 955aa963cf4a189ba01cd5c8a2053fcee1d4f84a..97916b1c4f10406fc3d88dae9c47083b0da5ceb8 100644
--- a/Home.py
+++ b/Home.py
@@ -200,7 +200,7 @@ def display_content_based_recommendations(user_name, user_id=-1, n=15):
     cols_html = ""
 
     # Get top N recommendations using content-based filtering
-    top_n_recommendations = test_contentbased_class(["title_length", "movie_year", "genre", "avg_rating"], "ridge_regression", user_id=-1, n=15)
+    top_n_recommendations = test_contentbased_class(["title_length", "movie_year", "genre", "avg_rating"], "random_forest", user_id=-1, n=15)
 
     if top_n_recommendations:
         st.subheader(f"Discover Great Content")  # Display section title
diff --git a/README.md b/README.md
index d7ed9045b3ced812dac51da83d8e928afd3046a9..5a08275eae80bb46ed17b27e15d5e73359c6a7dc 100644
--- a/README.md
+++ b/README.md
@@ -21,11 +21,10 @@
 pip install streamlit
 pip install requests
 ```
-
 ## Project Structure
 The project is organized into the following key components:
-### Configuration and Constants
+### Configuration
 
 1. ***configs.py***
 
    - Defines an `EvalConfig` class for storing configurations for evaluating multiple recommendation models.
@@ -38,7 +37,6 @@ The project is organized into the following key components:
 2. ***constants.py***
 
    - This code defines a Constant class that stores paths to datasets and column names for content and evidence data. Paths to content, evidence, and evaluation directories are defined based on the data directory path. File names and column names for article and rating data are specified, along with the rating scale.
-### Data Loaders
 3. ***loaders.py***
 
    - Loads rating and item data from specified CSV files in the Constant class of the constants module.
@@ -50,7 +48,6 @@ The project is organized into the following key components:
    - The data is loaded into pandas DataFrames, with an option available to load rating data in the format expected by the Surprise library if needed.
 
-### Recommender Models
 4. ***models.py***
 
    - Defines several basic recommendation algorithms for the Surprise library.
 
@@ -58,13 +55,18 @@
    - Recommendation algorithms are defined as classes inheriting from Surprise's `AlgoBase` class, each implementing an `estimate` method to predict user ratings for items.
 
-### Analytics and Evaluation
-5. ***analytics_ui.ipynb***
+
+### Backend folder
+This folder contains all the data, Jupyter notebooks, and Python scripts used to improve the user experience.
+
+#### Analytics
+***analytics_small.ipynb***
 
    - Performs data analysis to understand the datasets and their properties.
 
    - Analyzes the number of ratings, unique users, unique items, and distribution of ratings.
 
-6. ***evaluator.ipynb***
+#### Evaluation
+***evaluator.ipynb***
 
    - Evaluates different recommendation models using various cross-validation techniques.
 
@@ -79,17 +81,47 @@
    - Exports the evaluation report to a CSV file.
 
-7. ***analytics_tiny.ipynb***
-
-   - Analyzes a smaller version of the dataset for debugging purposes.
-
-   - Similar analyses to `analytics_ui.ipynb`, but on a smaller scale to speed up computation time.
-
-8. ***analytics_test.ipynb***
-
-   - Analyzes a test dataset to understand algorithm behaviors during development.
-
-   - Similar analyses to `analytics_ui.ipynb`, but on a smaller test dataset to better understand how algorithms work.
+#### Content Based
+***hackathon_make_predictions.ipynb***
+
+Defines a function `make_hackathon_prediction` that takes `feature_method` and `regressor_method` as input.
+
+Inside this function:
+
+- Loads the training data and converts it into the format expected by Surprise.
+- Trains a content-based model (`ContentBased`) on the training set using the specified feature and regressor methods.
+- Makes predictions on the test set by loading the test data from a CSV file and converting it into records.
+
+Converts the predictions into a DataFrame and saves them as a CSV file.
+
+The notebook then calls this function with specific parameters and prints the generated predictions.
+
+***content_based.ipynb***
+
+1. ***Feature Extraction Methods***
+
+The system supports the following feature extraction methods:
+
+- `genre`: Extracts the genres of the movies using TF-IDF vectorization.
+- `movie_year`: Extracts the release year of the movies.
+- `avg_rating`: Computes the average rating for each movie.
+- `title_length`: Computes the length of the movie title.
+
+2. ***Regression Models***
+
+The system supports the following regression models for predicting user ratings:
+
+- `linear_regression`
+- `random_forest`
+- `lasso_regression`
+- `gradient_boosting`
+- `ridge_regression`
+- `svr_regression`
+- `elastic_net`
+- `knn_regression`
+- `decision_tree`
+- `adaboost`
+- `xgboost`
+- `lightgbm`
 
 ### Datasets
diff --git a/recommender.py b/recommender.py
index 1b7ff0c417fade494a8183df459277746da2c9c6..5c447df5ba326ab875ca047d8d2ec52185e52874 100644
--- a/recommender.py
+++ b/recommender.py
@@ -620,14 +620,14 @@ def compare_similarity_measures(trainset,testset):
     results['KNN_MSD_MAE'] = mae_msd
 
-    # Train and evaluate KNN model with Pearson correlation similarity
-    sim_options_pearson = {'name': 'pearson', 'user_based': True}
-    knn_pearson = KNNWithMeans(sim_options=sim_options_pearson)
-    knn_pearson.fit(trainset)
-    predictions_pearson = knn_pearson.test(testset)
-    rmse_pearson = accuracy.rmse(predictions_pearson)
-    mae_pearson = accuracy.mae(predictions_pearson)
-    results['KNN_Pearson_RMSE'] = rmse_pearson
-    results['KNN_Pearson_MAE'] = mae_pearson
+    # Train and evaluate KNN model with cosine similarity
+    sim_options_cosine = {'name': 'cosine', 'user_based': True}
+    knn_cosine = KNNWithMeans(sim_options=sim_options_cosine)
+    knn_cosine.fit(trainset)
+    predictions_cosine = knn_cosine.test(testset)
+    rmse_cosine = accuracy.rmse(predictions_cosine)
+    mae_cosine = accuracy.mae(predictions_cosine)
+    results['KNN_cosine_RMSE'] = rmse_cosine
+    results['KNN_cosine_MAE'] = mae_cosine
 
     # Train and evaluate UserBased model with MSD similarity
@@ -640,13 +640,13 @@ def compare_similarity_measures(trainset,testset):
     results['UserBased_MSD_MAE'] = mae_user_based_msd
 
-    # Train and evaluate UserBased model with Pearson correlation similarity
-    user_based_pearson = UserBased(sim_options={'name': 'pearson'})
-    user_based_pearson.fit(trainset)
-    predictions_user_based_pearson = user_based_pearson.test(testset)
-    rmse_user_based_pearson = accuracy.rmse(predictions_user_based_pearson)
-    mae_user_based_pearson = accuracy.mae(predictions_user_based_pearson)
-    results['UserBased_Pearson_RMSE'] = rmse_user_based_pearson
-    results['UserBased_Pearson_MAE'] = mae_user_based_pearson
+    # Train and evaluate UserBased model with cosine similarity
+    user_based_cosine = UserBased(sim_options={'name': 'cosine'})
+    user_based_cosine.fit(trainset)
+    predictions_user_based_cosine = user_based_cosine.test(testset)
+    rmse_user_based_cosine = accuracy.rmse(predictions_user_based_cosine)
+    mae_user_based_cosine = accuracy.mae(predictions_user_based_cosine)
+    results['UserBased_cosine_RMSE'] = rmse_user_based_cosine
+    results['UserBased_cosine_MAE'] = mae_user_based_cosine
 
     # Train and evaluate OtherUserBased models
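The four feature-extraction methods listed in the README additions map naturally onto pandas/scikit-learn operations. Below is a minimal sketch of how such features could be computed, not the repository's actual code: the item/rating schema (`title`, `genres`, `movieId`, `rating` columns) and the pipe-separated genre format are assumptions for illustration.

```python
# Hypothetical sketch of the four feature extractors named in the README.
# The column names (title, genres, movieId, rating) are assumed for illustration.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

def build_features(items: pd.DataFrame, ratings: pd.DataFrame) -> pd.DataFrame:
    features = pd.DataFrame(index=items.index)

    # title_length: number of characters in the movie title
    features["title_length"] = items["title"].str.len()

    # movie_year: release year, assumed to appear as a trailing "(YYYY)" in the title
    features["movie_year"] = items["title"].str.extract(r"\((\d{4})\)", expand=False).astype(float)

    # avg_rating: mean observed rating per movie, joined back onto the item table
    avg_rating = ratings.groupby("movieId")["rating"].mean()
    features["avg_rating"] = items["movieId"].map(avg_rating)

    # genre: TF-IDF over a pipe-separated genre string, one column per genre token
    tfidf = TfidfVectorizer(token_pattern=r"[^|]+")
    genre_matrix = tfidf.fit_transform(items["genres"].fillna(""))
    genre_cols = pd.DataFrame(genre_matrix.toarray(), index=items.index,
                              columns=tfidf.get_feature_names_out())

    return pd.concat([features, genre_cols], axis=1)
```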
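The `regressor_method` strings listed for `content_based.ipynb` suggest a dispatch table from names to estimators. A plausible sketch of that pattern follows; the scikit-learn classes are real, but the exact mapping is assumed rather than taken from the repository, and `xgboost` / `lightgbm` would additionally require their own packages.

```python
# Hypothetical dispatch from the README's regressor_method strings to estimators.
# Illustrates the pattern only; the repository's actual mapping may differ.
from sklearn.ensemble import (AdaBoostRegressor, GradientBoostingRegressor,
                              RandomForestRegressor)
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

REGRESSORS = {
    "linear_regression": LinearRegression,
    "ridge_regression": Ridge,
    "lasso_regression": Lasso,
    "elastic_net": ElasticNet,
    "random_forest": RandomForestRegressor,
    "gradient_boosting": GradientBoostingRegressor,
    "adaboost": AdaBoostRegressor,
    "knn_regression": KNeighborsRegressor,
    "svr_regression": SVR,
    "decision_tree": DecisionTreeRegressor,
    # "xgboost" and "lightgbm" would map to xgboost.XGBRegressor and
    # lightgbm.LGBMRegressor, which live in their own packages.
}

def make_regressor(regressor_method: str):
    """Instantiate the estimator registered under regressor_method."""
    return REGRESSORS[regressor_method]()
```

Under this reading, switching `Home.py` from `ridge_regression` to `random_forest` amounts to selecting a different entry in the table.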
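The `hackathon_make_predictions.ipynb` workflow described in the README can be summarized as a short script. In this sketch the file paths, column names, rating scale, and the `ContentBased` import location are placeholders, and the `ContentBased(feature_method, regressor_method)` constructor is inferred from the `test_contentbased_class` call in `Home.py` rather than confirmed.

```python
# Hypothetical sketch of the make_hackathon_prediction workflow; paths, column
# names, the rating scale, and the ContentBased import are all assumptions.
import pandas as pd
from surprise import Dataset, Reader

from models import ContentBased  # assumed location of the ContentBased class

def make_hackathon_prediction(feature_method, regressor_method):
    # Load the training ratings and convert them into Surprise's format
    train_df = pd.read_csv("data/hackathon/train_ratings.csv")  # assumed path
    reader = Reader(rating_scale=(0.5, 5.0))  # assumed scale
    trainset = Dataset.load_from_df(
        train_df[["userId", "movieId", "rating"]], reader
    ).build_full_trainset()

    # Train the content-based model with the requested features and regressor
    algo = ContentBased(feature_method, regressor_method)
    algo.fit(trainset)

    # Predict a rating for every (user, item) record in the test file
    test_df = pd.read_csv("data/hackathon/test_ratings.csv")  # assumed path
    test_df["prediction"] = [
        algo.predict(row.userId, row.movieId).est for row in test_df.itertuples()
    ]

    # Save the predictions as a CSV file and return them
    test_df.to_csv("hackathon_predictions.csv", index=False)
    return test_df

predictions = make_hackathon_prediction(
    ["title_length", "movie_year", "genre", "avg_rating"], "random_forest"
)
print(predictions.head())
```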
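The `compare_similarity_measures(trainset, testset)` function changed in `recommender.py` expects Surprise train/test objects. A minimal driver might look like the following, assuming ratings live in a CSV with `userId`, `movieId`, and `rating` columns; the path and rating scale are placeholders.

```python
# Minimal driver for compare_similarity_measures; the CSV path, column names,
# and rating scale are placeholders rather than the repository's actual values.
import pandas as pd
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split

from recommender import compare_similarity_measures

ratings = pd.read_csv("data/ratings.csv")  # assumed path
reader = Reader(rating_scale=(0.5, 5.0))   # assumed scale
data = Dataset.load_from_df(ratings[["userId", "movieId", "rating"]], reader)

# 80/20 split; the function fits each model on trainset and reports
# RMSE/MAE on testset for each similarity variant
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

results = compare_similarity_measures(trainset, testset)
for name, value in results.items():
    print(f"{name}: {value:.4f}")
```

Note that after this change the function compares MSD against cosine similarity for both `KNNWithMeans` and `UserBased`; the Pearson variants are no longer evaluated.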