Merge pull request #42 from nidhaloff/dev

implemented hyperparameter search
nidhaloff · Oct 10, 2020 · 954bc21 · 954bc21
2 parents 586c96e + f9e8a7b
commit 954bc21
Show file tree

Hide file tree

Showing 8 changed files with 132 additions and 4 deletions.
diff --git a/HISTORY.rst b/HISTORY.rst
@@ -2,6 +2,15 @@
 History
 =======
 
+
+
+0.2.8 (2020-10-09)
+-------------------
+
+* implemented hyperparameter search
+* added examples
+
+
 0.2.7 (2020-10-05)
 -------------------
 

diff --git a/README.rst b/README.rst
@@ -76,6 +76,7 @@ Features
 - Supports different data preprocessing methods
 - Provides flexibility and data control while writing configurations
 - Supports cross validation
+- Supports hyperparameter search, both grid search and random search (version >= 0.2.8)
 - Supports yaml and json format
 - Supports different sklearn metrics for regression, classification and clustering
 - Supports multi-output/multi-target regression and classification
@@ -90,6 +91,7 @@ a **single line of code**
 All you need is a **yaml** (or **json**) file, where you need to describe what you are trying to do. That's it!
 
 Igel supports all sklearn's machine learning functionality, whether regression, classification or clustering.
+Precisely, you can use **63** different machine learning models in igel.
 
 Installation
 -------------
@@ -434,6 +436,16 @@ Here is an overview of all supported configurations (for now):
             cv: # [int] -> number of kfold (default 5)
             n_jobs:   # [signed int] -> The number of CPUs to use to do the computation (default None)
             verbose: # [int] -> The verbosity level. (default 0)
+        hyperparameter_search:
+            method: grid_search   # method you want to use: grid_search and random_search are supported
+            parameter_grid:     # put your parameters grid here that you want to use, an example is provided below
+                param1: [val1, val2]
+                param2: [val1, val2]
+            arguments:  # additional arguments you want to provide for the hyperparameter search
+                cv: 5   # number of folds
+                refit: true   # whether to refit the model after the search
+                return_train_score: false   # whether to return the train score
+                verbose: 0      # verbosity level
 
     # target you want to predict
     target:  # list of strings: basically put here the column(s), you want to predict that exist in your csv dataset
@@ -709,6 +721,8 @@ is conducted on the target column to show you more the capabilities of igel.
 Furthermore, the multioutput-example contains a **multioutput regression** example.
 Finally, the cv-example contains an example using the Ridge classifier using cross validation.
 
+You can also find cross validation and hyperparameter search examples in the folder.
+
 I suggest you play around with the examples and igel cli. However,
 you can also directly execute the fit.py, evaluate.py and predict.py if you want to.
 

diff --git a/examples/hyperparams-search/fit.py b/examples/hyperparams-search/fit.py
@@ -0,0 +1,24 @@
+"""
+The goal of igel is to use ML without writing code. Therefore, the right and simplest way to use igel is from terminal.
+You can run `igel fit -dp path_to_dataset -yml path_to_yaml_file`.
+
+Alternatively, you can write code if you want. This example below demonstrates how to use igel if you want to write code.
+However, I suggest you try and use the igel CLI. Type igel -h in your terminal to know more.
+
+===============================================================================================================
+
+This example fits a machine learning model on the indian-diabetes dataset while running a hyperparameter search.
+
+- the model and the search settings are read from the igel.yaml file located next to this script
+- igel.yaml configures a RandomForest classifier tuned with random_search over max_depth, n_estimators and max_features
+- set hyperparameter_search.method to grid_search in igel.yaml if you want to use a grid search instead
+
+"""
+
+from igel import Igel
+
+# same information the CLI receives through `igel fit -dp ... -yml ...`
+mock_fit_params = {'data_path': '../data/indian-diabetes/train-indians-diabetes.csv',
+                   'yaml_path': './igel.yaml',
+                   'cmd': 'fit'}
+
+# paths are relative, so run this script from within the examples/hyperparams-search folder
+Igel(**mock_fit_params)
+
diff --git a/examples/hyperparams-search/igel.yaml b/examples/hyperparams-search/igel.yaml
@@ -0,0 +1,32 @@
+# dataset operations
+dataset:
+    type: csv
+    split:  # split options
+        test_size: 0.2  # 0.2 means 20% for the test data, so 80% are automatically for training
+        shuffle: True   # whether to shuffle the data before/while splitting
+
+    preprocess:
+        scale:  # scaling options
+            method: standard    # standardization will scale values to have a 0 mean and 1 standard deviation  | you can also try minmax
+            target: inputs  # scale inputs. | other possible values: [outputs, all] # if you choose all then all values in the dataset will be scaled
+
+
+# model definition
+model:
+    type: classification
+    algorithm: RandomForest
+    hyperparameter_search:
+        method: random_search   # search strategy: grid_search and random_search are supported
+        parameter_grid:     # candidate values to try for each hyperparameter of the chosen algorithm
+            max_depth: [6, 10]
+            n_estimators: [100, 300]
+            max_features: [auto, sqrt]
+        arguments:  # additional arguments passed on to the hyperparameter search
+            cv: 5   # number of folds
+            refit: true   # whether to refit the model after the search
+            return_train_score: false   # whether to return the train score
+            verbose: 0   # verbosity level
+
+# target you want to predict
+target:
+    - sick
diff --git a/igel/__init__.py b/igel/__init__.py
@@ -4,4 +4,4 @@
 
 __author__ = "Nidhal Baccouri"
 __email__ = 'nidhalbacc@gmail.com'
-__version__ = '0.2.7'
+__version__ = '0.2.8'
diff --git a/igel/hyperparams.py b/igel/hyperparams.py
@@ -0,0 +1,25 @@
+from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
+
+
+def hyperparameter_search(model,
+                          method,
+                          params,
+                          x_train,
+                          y_train,
+                          **kwargs):
+    """
+    Run a hyperparameter search for the given model and return the best result.
+
+    @param model: estimator whose hyperparameters should be tuned
+    @param method: search strategy, either 'grid_search' (GridSearchCV) or 'random_search' (RandomizedSearchCV)
+    @param params: parameter grid / distributions handed to the search class as its second argument
+    @param x_train: training inputs passed to the fit method of the search
+    @param y_train: training targets passed to the fit method of the search
+    @param kwargs: additional keyword arguments forwarded unchanged to the search class (e.g. cv, refit, verbose)
+    @return: tuple (best_estimator, best_score, best_params) -> mind the order when unpacking
+    @raise ValueError: if method is neither 'grid_search' nor 'random_search'
+
+    Note: sklearn only exposes best_estimator_ when the search refits the model (refit not set to False).
+    """
+    # validate first, so an unsupported method fails before any search object is built
+    if method not in ('grid_search', 'random_search'):
+        raise ValueError(f"hyperparameter method must be grid_search or random_search, got {method!r}")
+
+    search_cls = GridSearchCV if method == 'grid_search' else RandomizedSearchCV
+    search = search_cls(model, params, **kwargs)
+
+    search.fit(x_train, y_train)
+    return search.best_estimator_, search.best_score_, search.best_params_
diff --git a/igel/igel.py b/igel/igel.py
@@ -14,13 +14,15 @@
     from igel.data import models_dict, metrics_dict
     from igel.preprocessing import update_dataset_props
     from igel.preprocessing import handle_missing_values, encode, normalize
+    from igel.hyperparams import hyperparameter_search
 except ImportError:
     from utils import read_yaml, create_yaml, extract_params, _reshape, read_json
     from data import evaluate_model
     from configs import configs
     from data import models_dict, metrics_dict
     from preprocessing import update_dataset_props
     from preprocessing import handle_missing_values, encode, normalize
+    from hyperparams import hyperparameter_search
 
 from sklearn.model_selection import train_test_split, cross_validate
 from sklearn.multioutput import MultiOutputClassifier, MultiOutputRegressor
@@ -315,6 +317,8 @@ def fit(self, **kwargs):
         cv_results = None
         eval_results = None
         cv_params = None
+        hp_search_results = {}
+
         if self.model_type == 'clustering':
             x_train = self._prepare_clustering_data()
         else:
@@ -338,7 +342,26 @@ def fit(self, **kwargs):
                 logger.info("performing cross validation ...")
                 cv_results = cross_validate(estimator=self.model,
                                             X=x_train,
-                                            y=y_train, **cv_params)
+                                            y=y_train,
+                                            **cv_params)
+            # optional hyperparameter search, configured under model -> hyperparameter_search in the yaml file
+            hyperparams_props = self.model_props.get('hyperparameter_search', None)
+            if hyperparams_props:
+                method = hyperparams_props.get('method', None)
+                grid_params = hyperparams_props.get('parameter_grid', None)
+                # 'arguments' is optional: fall back to an empty dict, since ** cannot expand None
+                hp_args = hyperparams_props.get('arguments', None) or {}
+                logger.info(f"Performing hyperparameter search using -> {method}")
+                logger.info(f"Grid parameters entered by the user: {grid_params}")
+                logger.info(f"Additional hyperparameter arguments: {hp_args}")
+                # hyperparameter_search returns (best_estimator, best_score, best_params) in exactly this order
+                best_estimator, best_score, best_params = hyperparameter_search(model=self.model,
+                                                                                method=method,
+                                                                                params=grid_params,
+                                                                                x_train=x_train,
+                                                                                y_train=y_train,
+                                                                                **hp_args)
+                hp_search_results['best_params'] = best_params
+                hp_search_results['best_score'] = best_score
+                # continue with the best estimator found by the search
+                self.model = best_estimator
+
+
             self.model.fit(x_train, y_train)
 
         else:   # if the model type is clustering
@@ -380,7 +403,8 @@ def fit(self, **kwargs):
             "results_path": str(self.results_path),
             "model_path": str(self.default_model_path),
             "target": None if self.model_type == 'clustering' else self.target,
-            "results_on_test_data": eval_results
+            "results_on_test_data": eval_results,
+            "hyperparameter_search_results": hp_search_results
 
         }
         if self.model_type == 'clustering':

diff --git a/setup.cfg b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.1.8
+current_version = 0.2.8
 commit = True
 tag = True