diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 208fab0f..9b9da1d7 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -24,11 +24,8 @@ jobs: python -m pip install black flake8 - name: Code Style (Black/Flake8) run: | - # Black code style black --check --diff pytorch_widedeep tests examples setup.py - # Stop the build if there are Python syntax errors or undefined names flake8 . --count --select=E901,E999,F821,F822,F823 --ignore=E266 --show-source --statistics - # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --ignore=E203,E266,E501,E721,E722,F401,F403,F405,F811,W503,C901 --statistics test: @@ -47,41 +44,16 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install pytest-cov codecov faker + python -m pip install pytest pytest-cov codecov faker if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - - name: Test with pytest + - name: Test with pytest and generate coverage run: | - pytest --doctest-modules pytorch_widedeep --cov-report xml --cov-report term --disable-pytest-warnings --cov=pytorch_widedeep tests/ - - name: Upload coverage - uses: actions/upload-artifact@v4 - with: - name: coverage${{ matrix.python-version }} - path: .coverage - - finish: - needs: test - runs-on: ubuntu-latest - if: ${{ github.event_name == 'push' || !github.event.pull_request.draft }} - steps: - - uses: actions/checkout@v4 - - name: Set up Python 3.10 - uses: actions/setup-python@v5 - with: - python-version: "3.10" - - name: Install dependencies - run: | - python -m pip install --upgrade pip - python -m pip install coverage - - name: Download all artifacts - # Downloads coverage1, coverage2, etc. - uses: actions/download-artifact@v4 - - name: Convert coverage - run: | - coverage combine coverage*/.coverage* - coverage report --fail-under=90 - coverage xml - - name: upload coverage to Codecov + pytest --doctest-modules pytorch_widedeep --cov=pytorch_widedeep --cov-report=xml --cov-report=term --disable-pytest-warnings tests + - name: Upload coverage to Codecov uses: codecov/codecov-action@v4 with: token: ${{ secrets.CODECOV_TOKEN }} - fail_ci_if_error: true + file: ./coverage.xml + flags: unittests + name: codecov-${{ matrix.python-version }} + fail_ci_if_error: true \ No newline at end of file diff --git a/.isort.cfg b/.isort.cfg index d81d7de1..023dbdb4 100644 --- a/.isort.cfg +++ b/.isort.cfg @@ -1,4 +1,4 @@ [settings] +profile=black multi_line_output=3 -include_trailing_comma=True length_sort=1 \ No newline at end of file diff --git a/README.md b/README.md index ddf11304..2db68794 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,7 @@ The content of this document is organized as follows: - [Introduction](#introduction) - [Architectures](#architectures) - [The ``deeptabular`` component](#the-deeptabular-component) + - [The ``rec`` module](#the-rec-module) - [Text and Images](#text-and-images) - [Installation](#installation) - [Developer Install](#developer-install) @@ -795,6 +796,28 @@ encoder-decoder method and constrastive-denoising method. Please, see the documentation and the examples for details on this functionality, and all other options in the library. +### The ``rec`` module + +This module was introduced as an extension to the existing components in the +library, addressing questions and issues related to recommendation systems. 
+While still under active development, it currently includes a select number +of powerful recommendation models. + +It's worth noting that this library already supported the implementation of +various recommendation algorithms using existing components. For example, +models like Wide and Deep, Two-Tower, or Neural Collaborative Filtering could +be constructed using the library's core functionalities. + +The recommendation algorithms in the `rec` module are: + +1. [DeepFM: A Factorization-Machine based Neural Network for CTR Prediction](https://arxiv.org/abs/1703.04247) +2. (Deep) Field Aware Factorization Machine (FFM): a Deep Learning version of the algorithm presented in [Field-aware Factorization Machines in a Real-world Online Advertising System](https://arxiv.org/abs/1701.04099) +3. [xDeepFM: Combining Explicit and Implicit Feature Interactions for Recommender Systems](https://arxiv.org/pdf/1803.05170) +4. [Deep Interest Network for Click-Through Rate Prediction](https://arxiv.org/abs/1706.06978) + +These can all be used as the `deeptabular` component in the `WideDeep` model. +See the examples for more details. + ### Text and Images For the text component, `deeptext`, the library offers the following models: diff --git a/VERSION b/VERSION index 266146b8..9edc58bb 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.6.3 +1.6.4 diff --git a/examples/scripts/adult_census.py b/examples/scripts/adult_census.py index e76e38cd..17173c41 100644 --- a/examples/scripts/adult_census.py +++ b/examples/scripts/adult_census.py @@ -3,19 +3,10 @@ import pandas as pd from pytorch_widedeep import Trainer -from pytorch_widedeep.models import ( # noqa: F401 - Wide, - TabMlp, - WideDeep, - TabResnet, -) +from pytorch_widedeep.models import Wide, TabMlp, WideDeep, TabResnet # noqa: F401 from pytorch_widedeep.metrics import Accuracy, Precision from pytorch_widedeep.datasets import load_adult -from pytorch_widedeep.callbacks import ( - LRHistory, - EarlyStopping, - ModelCheckpoint, -) +from pytorch_widedeep.callbacks import LRHistory, EarlyStopping, ModelCheckpoint from pytorch_widedeep.initializers import XavierNormal, KaimingNormal from pytorch_widedeep.preprocessing import TabPreprocessor, WidePreprocessor diff --git a/examples/scripts/adult_census_attention_mlp.py b/examples/scripts/adult_census_attention_mlp.py index 2fb37b3f..85562f63 100644 --- a/examples/scripts/adult_census_attention_mlp.py +++ b/examples/scripts/adult_census_attention_mlp.py @@ -3,11 +3,7 @@ import pandas as pd from pytorch_widedeep import Trainer -from pytorch_widedeep.models import ( - WideDeep, - SelfAttentionMLP, - ContextAttentionMLP, -) +from pytorch_widedeep.models import WideDeep, SelfAttentionMLP, ContextAttentionMLP from pytorch_widedeep.metrics import Accuracy from pytorch_widedeep.datasets import load_adult from pytorch_widedeep.preprocessing import TabPreprocessor diff --git a/examples/scripts/adult_census_cont_den_full_example.py b/examples/scripts/adult_census_cont_den_full_example.py index d282cdbf..a661d011 100644 --- a/examples/scripts/adult_census_cont_den_full_example.py +++ b/examples/scripts/adult_census_cont_den_full_example.py @@ -8,9 +8,7 @@ from pytorch_widedeep.metrics import Accuracy from pytorch_widedeep.datasets import load_adult from pytorch_widedeep.preprocessing import TabPreprocessor -from pytorch_widedeep.self_supervised_training import ( - ContrastiveDenoisingTrainer, -) +from pytorch_widedeep.self_supervised_training import ContrastiveDenoisingTrainer use_cuda = torch.cuda.is_available() diff --git 
a/examples/scripts/adult_census_cont_den_run_all_models.py b/examples/scripts/adult_census_cont_den_run_all_models.py index ecbaa7cc..b69174bb 100644 --- a/examples/scripts/adult_census_cont_den_run_all_models.py +++ b/examples/scripts/adult_census_cont_den_run_all_models.py @@ -14,9 +14,7 @@ ) from pytorch_widedeep.datasets import load_adult from pytorch_widedeep.preprocessing import TabPreprocessor -from pytorch_widedeep.self_supervised_training import ( - ContrastiveDenoisingTrainer, -) +from pytorch_widedeep.self_supervised_training import ContrastiveDenoisingTrainer use_cuda = torch.cuda.is_available() diff --git a/examples/scripts/adult_census_enc_dec_run_all_models.py b/examples/scripts/adult_census_enc_dec_run_all_models.py index 5a5bf36e..2c77cffc 100644 --- a/examples/scripts/adult_census_enc_dec_run_all_models.py +++ b/examples/scripts/adult_census_enc_dec_run_all_models.py @@ -5,11 +5,7 @@ from pytorch_widedeep.models import TabMlp as TabMlpEncoder from pytorch_widedeep.models import TabNet as TabNetEncoder from pytorch_widedeep.models import TabResnet as TabResnetEncoder -from pytorch_widedeep.models import ( - TabMlpDecoder, - TabNetDecoder, - TabResnetDecoder, -) +from pytorch_widedeep.models import TabMlpDecoder, TabNetDecoder, TabResnetDecoder from pytorch_widedeep.datasets import load_adult from pytorch_widedeep.preprocessing import TabPreprocessor from pytorch_widedeep.self_supervised_training import EncoderDecoderTrainer diff --git a/examples/scripts/adult_census_tabnet.py b/examples/scripts/adult_census_tabnet.py index c523e54b..e3f2cfe7 100644 --- a/examples/scripts/adult_census_tabnet.py +++ b/examples/scripts/adult_census_tabnet.py @@ -6,11 +6,7 @@ from pytorch_widedeep.models import TabNet, WideDeep from pytorch_widedeep.metrics import Accuracy, Precision from pytorch_widedeep.datasets import load_adult -from pytorch_widedeep.callbacks import ( - LRHistory, - EarlyStopping, - ModelCheckpoint, -) +from pytorch_widedeep.callbacks import LRHistory, EarlyStopping, ModelCheckpoint from pytorch_widedeep.preprocessing import TabPreprocessor use_cuda = torch.cuda.is_available() diff --git a/examples/scripts/adult_census_transformers.py b/examples/scripts/adult_census_transformers.py index 08e2fbe8..1718f0d0 100644 --- a/examples/scripts/adult_census_transformers.py +++ b/examples/scripts/adult_census_transformers.py @@ -14,11 +14,7 @@ ) from pytorch_widedeep.metrics import Accuracy from pytorch_widedeep.datasets import load_adult -from pytorch_widedeep.callbacks import ( - LRHistory, - EarlyStopping, - ModelCheckpoint, -) +from pytorch_widedeep.callbacks import LRHistory, EarlyStopping, ModelCheckpoint from pytorch_widedeep.initializers import XavierNormal, KaimingNormal from pytorch_widedeep.preprocessing import TabPreprocessor, WidePreprocessor diff --git a/examples/scripts/movielens_din.py b/examples/scripts/movielens_din.py new file mode 100644 index 00000000..551c5dec --- /dev/null +++ b/examples/scripts/movielens_din.py @@ -0,0 +1,257 @@ +# DIN is a "special" model and the data needs a very particular preparation +# process. Therefore, the library does not have a dedicated preprocessor for +# this specific model. 
Below is a detailed explanation, and if someone
+# wants to use this algorithm I would suggest wrapping it all in an object
+# prior to passing the data to the model
+
+import re
+from functools import partial
+
+import numpy as np
+import pandas as pd
+from sklearn.preprocessing import LabelEncoder
+
+from pytorch_widedeep import Trainer
+from pytorch_widedeep.models import WideDeep
+from pytorch_widedeep.metrics import Accuracy
+from pytorch_widedeep.datasets import load_movielens100k
+from pytorch_widedeep.preprocessing import TabPreprocessor
+from pytorch_widedeep.models.rec.din import DeepInterestNetwork
+
+
+def clean_genre_list(genre_list):
+    return "_".join(
+        sorted([re.sub(r"[^a-z0-9]", "", genre.lower()) for genre in genre_list])
+    )
+
+
+def label_encode_column(df, column_name):
+    le = LabelEncoder()
+    df[column_name] = le.fit_transform(df[column_name])
+    return df, le
+
+
+def create_sequences(group, seq_len=5):
+    movies = group["movie_id"].tolist()
+    genres = group["genre_list"].tolist()
+    ratings = group["rating"].tolist()
+
+    sequences = []
+    for i in range(len(movies) - seq_len):
+        user_movies_sequence = movies[i : i + seq_len]
+        genres_sequence = genres[i : i + seq_len]
+        ratings_sequence = ratings[i : i + seq_len]
+        target_item = movies[i + seq_len]
+        target_item_rating = ratings[i + seq_len]
+
+        sequences.append(
+            {
+                "user_id": group.name,
+                "user_movies_sequence": user_movies_sequence,
+                "genres_sequence": genres_sequence,
+                "ratings_sequence": ratings_sequence,
+                "target_item": target_item,
+                "target_item_rating": target_item_rating,
+            }
+        )
+
+    seq_df = pd.DataFrame(sequences)
+    non_seq_cols = group.drop_duplicates(["user_id"]).drop(
+        ["movie_id", "genre_list", "rating", "timestamp"], axis=1
+    )
+
+    return pd.merge(seq_df, non_seq_cols, on="user_id")
+
+
+def preprocess_movie_data(df, seq_len=5):
+    df_sorted = df.sort_values(["user_id", "timestamp"])
+
+    partial_create_sequences = partial(create_sequences, seq_len=seq_len)
+
+    result_df = (
+        df_sorted.groupby("user_id")
+        .apply(partial_create_sequences)
+        .reset_index(drop=True)
+    )
+
+    return result_df
+
+
+if __name__ == "__main__":
+
+    data, users, items = load_movielens100k(as_frame=True)
+
+    list_of_genres = [
+        "unknown",
+        "Action",
+        "Adventure",
+        "Animation",
+        "Children's",
+        "Comedy",
+        "Crime",
+        "Documentary",
+        "Drama",
+        "Fantasy",
+        "Film-Noir",
+        "Horror",
+        "Musical",
+        "Mystery",
+        "Romance",
+        "Sci-Fi",
+        "Thriller",
+        "War",
+        "Western",
+    ]
+
+    # assertion only to help mypy with the types returned by load_movielens100k
+    assert (
+        isinstance(items, pd.DataFrame)
+        and isinstance(data, pd.DataFrame)
+        and isinstance(users, pd.DataFrame)
+    )
+    items["genre_list"] = items[list_of_genres].apply(
+        lambda x: [genre for genre in list_of_genres if x[genre] == 1], axis=1
+    )
+
+    items["genre_list"] = items["genre_list"].apply(clean_genre_list)
+
+    df = pd.merge(data, items[["movie_id", "genre_list"]], on="movie_id")
+    df = pd.merge(
+        df,
+        users[["user_id", "age", "gender", "occupation"]],
+        on="user_id",
+    )
+
+    df["rating"] = df["rating"].apply(lambda x: 1 if x >= 4 else 0)
+
+    # Up until here, everything is quite standard. Now, some columns will be
+    # treated as sequences and, therefore, they need to be tokenized
+    # and "numericalised"/label encoded
+    df, user_le = label_encode_column(df, "user_id")
+    df, item_le = label_encode_column(df, "movie_id")
+    df, genre_le = label_encode_column(df, "genre_list")
+
+    # Internally, all models for tabular data in this library use padding
+    # idx = 0 for unseen values, while sklearn's LabelEncoder starts at 0.
+    # Therefore we need to add 1 to the encoded values to leave 0 for
+    # unknown/unseen values
+    df["movie_id"] = df["movie_id"] + 1
+    df["genre_list"] = df["genre_list"] + 1
+
+    # The explanation as to why we do this with the ratings will come later
+    df["rating"] = df["rating"] + 1
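+
+    # As a made-up illustration of the shift: if LabelEncoder mapped the
+    # movies to {"Heat": 0, "Alien": 1, ...}, after adding 1 they become
+    # {"Heat": 1, "Alien": 2, ...}, and index 0 is left free to act as the
+    # padding/unseen-value index in the embedding layers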
+
+    # we build sequences of 5 movies. Our goal will be predicting whether the
+    # next movie will be reviewed positively or negatively
+    df = df.sort_values(by=["timestamp"]).reset_index(drop=True)
+    seq_df = preprocess_movie_data(df, seq_len=5)
+    # target back to 0/1
+    seq_df["target_item_rating"] = seq_df["target_item_rating"] - 1
+
+    X_target_item = np.array(seq_df.target_item.tolist()).reshape(-1, 1)
+
+    # in reality, all users here have more than 5 reviews, so we have complete
+    # sequences, but there is a chance that this does not happen in a given
+    # dataset, so one would have to pad with the padding idx (0 in this case)
+    seq_len = 5
+    X_user_behaviour = np.array(
+        [
+            lst + [0] * (seq_len - len(lst))
+            for lst in seq_df.user_movies_sequence.tolist()
+        ]
+    )
+    X_ratings = np.array(
+        [lst + [0] * (seq_len - len(lst)) for lst in seq_df.ratings_sequence.tolist()]
+    )
+    X_genres = np.array(
+        [lst + [0] * (seq_len - len(lst)) for lst in seq_df.genres_sequence.tolist()]
+    )
+
+    # At this point we have the target item as an array of shape (N obs, 1),
+    # and all columns that are going to be treated as sequences stored in
+    # arrays of shape (N obs, seq_len). The rest of the columns are going to
+    # be treated as ANY other "standard" tabular dataset
+    other_cols = ["user_id", "age", "gender", "occupation"]
+    df_other_feat = seq_df[other_cols]
+    tab_preprocessor = TabPreprocessor(cat_embed_cols=other_cols)
+    X_other_feats = tab_preprocessor.fit_transform(df_other_feat)
+
+    X_all = np.concatenate(
+        [X_other_feats, X_target_item, X_user_behaviour, X_ratings, X_genres], axis=1
+    )
+
+    # Now, all the model components in this library take one (and only one)
+    # tensor as input. Therefore, if we want to treat some data differently,
+    # we need to slice the tensor internally. For this to happen we need to
+    # "tell" the algorithm which column is which. DIN has data of two
+    # different natures: sequences and standard tabular (i.e. everything
+    # else). For the sequences, let's simply name the columns with what they
+    # are plus an index (but you can define them with whichever string you
+    # want)
+    user_behaviour_cols = [f"item_{i+1}" for i in range(5)]
+    genres_cols = [f"genre_{i+1}" for i in range(5)]
+    ratings_cols = [f"rating_{i+1}" for i in range(5)]
+
+    # Then all columns in the dataset are, in order of appearance in X_all:
+    all_cols = (
+        [el[0] for el in tab_preprocessor.cat_embed_input]  # tabular cols
+        + ["target_item"]  # target item
+        + user_behaviour_cols  # user behaviour seq cols
+        + ratings_cols  # ratings seq cols
+        + genres_cols  # genres seq cols
+    )
+    column_idx = {k: i for i, k in enumerate(all_cols)}
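+
+    # For illustration, column_idx now looks roughly like this (the exact
+    # order of the tabular columns comes from the TabPreprocessor):
+    # {"user_id": 0, "age": 1, "gender": 2, "occupation": 3,
+    #  "target_item": 4, "item_1": 5, ..., "rating_1": 10, ...,
+    #  "genre_1": 15, ..., "genre_5": 19}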
+
+    # Now we need to define the so-called "configs". For the sequence
+    # columns these will consist of:
+    # - the column names
+    # - the maximum value in the column (to define the number of embeddings)
+    # - the embedding dim
+    user_behavior_confiq = (
+        user_behaviour_cols,
+        X_user_behaviour.max(),
+        32,
+    )
+
+    # Again, the explanation for this will come when we instantiate the model
+    # (also, please, read the docs)
+    rating_seq_config = (ratings_cols, 2)
+
+    # all the other sequence columns that are not user behaviour, or an
+    # action related to the items that define the user behaviour, will be
+    # referred to as "other sequence columns" and will be passed as elements
+    # of a list
+    other_seq_cols_confiq = [(genres_cols, X_genres.max(), 16)]
+
+    # And finally, the config for the remaining, tabular columns
+    other_cols_config = tab_preprocessor.cat_embed_input
+
+    # Now, one of the params of the DeepInterestNetwork is action_seq_config.
+    # This 'action' can be, for example, a rating or purchased/not-purchased.
+    # The way this 'action' will be used is the following: the action will
+    # **always** be learned as a 1d embedding and will be combined with the
+    # user behaviour. For example, imagine that the action is
+    # purchased/not-purchased. Then, per item in the user behaviour sequence,
+    # there will be a binary action to learn, 0/1. Such an action will be
+    # represented by a float number (so 3 floats will be learned: one for
+    # purchased, one for not-purchased and one for padding) that will
+    # multiply the corresponding item embedding in the user behaviour
+    # sequence.
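+
+    # A made-up numeric illustration: if the learned action floats were
+    # [w_pad, w_neg, w_pos] = [0.0, 0.7, 1.3] and a user's ratings sequence
+    # were [2, 1, 2, 2, 1] (1 = negative, 2 = positive, 0 = padding), the
+    # five item embeddings in that user's behaviour sequence would be scaled
+    # by 1.3, 0.7, 1.3, 1.3 and 0.7 respectively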
+    din = DeepInterestNetwork(
+        column_idx=column_idx,
+        target_item_col="target_item",
+        user_behavior_confiq=user_behavior_confiq,
+        action_seq_config=rating_seq_config,
+        other_seq_cols_confiq=other_seq_cols_confiq,
+        cat_embed_input=other_cols_config,
+        mlp_hidden_dims=[128, 64],
+    )
+
+    # And from here on, everything is standard
+    model = WideDeep(deeptabular=din)
+
+    trainer = Trainer(model=model, objective="binary", metrics=[Accuracy()])
+
+    # in the real world you would have to split the data into train, val and test
+    trainer.fit(
+        X_tab=X_all,
+        target=seq_df.target_item_rating.values,
+        n_epochs=5,
+        batch_size=512,
+    )
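+
+    # Once fitted, inference follows the standard Trainer API; a minimal
+    # sketch (here simply re-using the training matrix for illustration):
+    # preds = trainer.predict(X_tab=X_all)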
diff --git a/examples/scripts/movielens_fm.py b/examples/scripts/movielens_fm.py
new file mode 100644
index 00000000..77db3423
--- /dev/null
+++ b/examples/scripts/movielens_fm.py
@@ -0,0 +1,139 @@
+import re
+from typing import List, Tuple
+
+import pandas as pd
+
+from pytorch_widedeep import Trainer
+from pytorch_widedeep.models import Wide, WideDeep
+from pytorch_widedeep.metrics import Accuracy
+from pytorch_widedeep.datasets import load_movielens100k
+from pytorch_widedeep.models.rec import (
+    DeepFactorizationMachine,
+    ExtremeDeepFactorizationMachine,
+    DeepFieldAwareFactorizationMachine,
+)
+from pytorch_widedeep.preprocessing import TabPreprocessor, WidePreprocessor
+
+if __name__ == "__main__":
+
+    data, users, items = load_movielens100k(as_frame=True)
+
+    list_of_genres = [
+        "unknown",
+        "Action",
+        "Adventure",
+        "Animation",
+        "Children's",
+        "Comedy",
+        "Crime",
+        "Documentary",
+        "Drama",
+        "Fantasy",
+        "Film-Noir",
+        "Horror",
+        "Musical",
+        "Mystery",
+        "Romance",
+        "Sci-Fi",
+        "Thriller",
+        "War",
+        "Western",
+    ]
+
+    # assertion only to avoid mypy warnings
+    assert (
+        isinstance(items, pd.DataFrame)
+        and isinstance(data, pd.DataFrame)
+        and isinstance(users, pd.DataFrame)
+    )
+    items["genre_list"] = items[list_of_genres].apply(
+        lambda x: [genre for genre in list_of_genres if x[genre] == 1], axis=1
+    )
+
+    # for each element in genre_list: lower-case every genre, remove
+    # non-alphanumeric characters, sort and join with an underscore
+    def clean_genre_list(genre_list):
+        return "_".join(
+            sorted([re.sub(r"[^a-z0-9]", "", genre.lower()) for genre in genre_list])
+        )
+
+    items["genre_list"] = items["genre_list"].apply(clean_genre_list)
+
+    df = pd.merge(data, users[["user_id", "age", "gender", "occupation"]], on="user_id")
+    df = pd.merge(df, items[["movie_id", "genre_list"]], on="movie_id")
+
+    # binarize the ratings
+    df["rating"] = df["rating"].apply(lambda x: 1 if x >= 4 else 0)
+
+    # sort by timestamp, group by user, and keep the penultimate interaction
+    # per user for validation and the last one for test
+    df = df.sort_values(by=["timestamp"])
+    train_df = df.groupby("user_id").apply(lambda x: x.iloc[:-2]).reset_index(drop=True)
+    val_df = df.groupby("user_id").apply(lambda x: x.iloc[-2]).reset_index(drop=True)
+    test_df = df.groupby("user_id").apply(lambda x: x.iloc[-1]).reset_index(drop=True)
+    assert len(df) == len(train_df) + len(val_df) + len(test_df)
+
+    cat_cols = [
+        "user_id",
+        "movie_id",
+        "age",
+        "gender",
+        "occupation",
+        "genre_list",
+    ]
+
+    tab_preprocessor = TabPreprocessor(cat_embed_cols=cat_cols, for_mf=True)
+    X_tab_tr = tab_preprocessor.fit_transform(train_df)
+    X_tab_val = tab_preprocessor.transform(val_df)
+    X_tab_te = tab_preprocessor.transform(test_df)
+
+    wide_preprocessor = WidePreprocessor(wide_cols=cat_cols)
+    X_wide_tr = wide_preprocessor.fit_transform(train_df)
+    X_wide_val = wide_preprocessor.transform(val_df)
+    X_wide_te = wide_preprocessor.transform(test_df)
+
+    cat_embed_input: List[Tuple[str, int]] = tab_preprocessor.cat_embed_input
+    dfm = DeepFactorizationMachine(
+        column_idx=tab_preprocessor.column_idx,
+        num_factors=8,
+        cat_embed_input=cat_embed_input,
+        mlp_hidden_dims=[64, 32],
+    )
+
+    dffm = DeepFieldAwareFactorizationMachine(
+        column_idx=tab_preprocessor.column_idx,
+        num_factors=8,
+        cat_embed_input=cat_embed_input,
+        mlp_hidden_dims=[64, 32],
+    )
+
+    xdfm = ExtremeDeepFactorizationMachine(
+        column_idx=tab_preprocessor.column_idx,
+        input_dim=16,
+        cat_embed_input=cat_embed_input,
+        cin_layer_dims=[32, 16],
+        mlp_hidden_dims=[64, 32],
+    )
+
+    # the wide component is fed X_wide, so its input dim must be based on the
+    # wide-preprocessed data
+    wide = Wide(input_dim=X_wide_tr.max(), pred_dim=1)
+
+    for fm_model in [dfm, dffm, xdfm]:
+
+        model = WideDeep(wide=wide, deeptabular=fm_model)
+
+        trainer = Trainer(model, objective="binary", metrics=[Accuracy()])
+
+        X_train = {
+            "X_wide": X_wide_tr,
+            "X_tab": X_tab_tr,
+            "target": train_df["rating"].values,
+        }
+        X_val = {
+            "X_wide": X_wide_val,
+            "X_tab": X_tab_val,
+            "target": val_df["rating"].values,
+        }
+        X_test = {"X_wide": X_wide_te, "X_tab": X_tab_te}
+
+        trainer.fit(X_train=X_train, X_val=X_val, n_epochs=2)
+        trainer.predict(X_test=X_test)
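+
+        # a quick sanity check one could add here (sketch): compare the test
+        # predictions against the held-out ratings
+        # preds = trainer.predict(X_test=X_test)
+        # print((preds == test_df["rating"].values).mean())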
diff --git a/examples/scripts/readme_snippets.py b/examples/scripts/readme_snippets.py
index 9e230e8c..bf430923 100644
--- a/examples/scripts/readme_snippets.py
+++ b/examples/scripts/readme_snippets.py
@@ -12,14 +12,7 @@
 from faker import Faker
 
 from pytorch_widedeep import Trainer
-from pytorch_widedeep.models import (
-    Wide,
-    TabMlp,
-    Vision,
-    BasicRNN,
-    WideDeep,
-    ModelFuser,
-)
+from pytorch_widedeep.models import Wide, TabMlp, Vision, BasicRNN, WideDeep, ModelFuser
 from pytorch_widedeep.preprocessing import (
     TabPreprocessor,
     TextPreprocessor,
@@ -27,9 +20,7 @@
     ImagePreprocessor,
 )
 from pytorch_widedeep.losses_multitarget import MultiTargetClassificationLoss
-from pytorch_widedeep.models._base_wd_model_component import (
-    BaseWDModelComponent,
-)
+from pytorch_widedeep.models._base_wd_model_component import BaseWDModelComponent
 
 
 def create_and_save_random_image(image_number, size=(32, 32)):
diff --git a/mkdocs/mkdocs.yml b/mkdocs/mkdocs.yml
index 3f8347cf..7cdd8ee8 100644
--- a/mkdocs/mkdocs.yml
+++ b/mkdocs/mkdocs.yml
@@ -28,6 +28,7 @@ nav:
   - Preprocessing: pytorch-widedeep/preprocessing.md
   - Load From Folder: pytorch-widedeep/load_from_folder.md
   - Model Components: pytorch-widedeep/model_components.md
+  - The Rec Module: pytorch-widedeep/the_rec_module.md
   - Bayesian models: pytorch-widedeep/bayesian_models.md
   - Losses: pytorch-widedeep/losses.md
   - Metrics: pytorch-widedeep/metrics.md
[... diffs of the regenerated HTML under mkdocs/site (404.html, the docs index page, etc.) omitted: they only update hashed asset links and re-render the documentation changes in this PR ...]
[hunks from the documentation sources follow; their file headers were garbled in extraction. The docs gain a two-tower example and the same ``rec`` module section as the README:]

+**7. A two-tower model**
+
+This is a popular model in the context of recommendation systems. Let's say we
+have a tabular dataset formed by triples (user features, item features,
+target). We can create a two-tower model where the user and item features are
+passed through two separate models and then "fused" via a dot product.
+
+```python
+import numpy as np
+import pandas as pd
+
+from pytorch_widedeep import Trainer
+from pytorch_widedeep.preprocessing import TabPreprocessor
+from pytorch_widedeep.models import TabMlp, WideDeep, ModelFuser
+
+# Let's create the interaction dataset
+# user_features dataframe
+np.random.seed(42)
+user_ids = np.arange(1, 101)
+ages = np.random.randint(18, 60, size=100)
+genders = np.random.choice(["male", "female"], size=100)
+locations = np.random.choice(["city_a", "city_b", "city_c", "city_d"], size=100)
+user_features = pd.DataFrame(
+    {"id": user_ids, "age": ages, "gender": genders, "location": locations}
+)
+
+# item_features dataframe
+item_ids = np.arange(1, 101)
+prices = np.random.uniform(10, 500, size=100).round(2)
+colors = np.random.choice(["red", "blue", "green", "black"], size=100)
+categories = np.random.choice(["electronics", "clothing", "home", "toys"], size=100)
+
+item_features = pd.DataFrame(
+    {"id": item_ids, "price": prices, "color": colors, "category": categories}
+)
+
+# Interactions dataframe
+interaction_user_ids = np.random.choice(user_ids, size=1000)
+interaction_item_ids = np.random.choice(item_ids, size=1000)
+purchased = np.random.choice([0, 1], size=1000, p=[0.7, 0.3])
+interactions = pd.DataFrame(
+    {
+        "user_id": interaction_user_ids,
+        "item_id": interaction_item_ids,
+        "purchased": purchased,
+    }
+)
+user_item_purchased = interactions.merge(
+    user_features, left_on="user_id", right_on="id"
+).merge(item_features, left_on="item_id", right_on="id")
+
+# Users
+tab_preprocessor_user = TabPreprocessor(
+    cat_embed_cols=["gender", "location"],
+    continuous_cols=["age"],
+)
+X_user = tab_preprocessor_user.fit_transform(user_item_purchased)
+tab_mlp_user = TabMlp(
+    column_idx=tab_preprocessor_user.column_idx,
+    cat_embed_input=tab_preprocessor_user.cat_embed_input,
+    continuous_cols=["age"],
+    mlp_hidden_dims=[16, 8],
+    mlp_dropout=[0.2, 0.2],
+)
+
+# Items
+tab_preprocessor_item = TabPreprocessor(
+    cat_embed_cols=["color", "category"],
+    continuous_cols=["price"],
+)
+X_item = tab_preprocessor_item.fit_transform(user_item_purchased)
+tab_mlp_item = TabMlp(
+    column_idx=tab_preprocessor_item.column_idx,
+    cat_embed_input=tab_preprocessor_item.cat_embed_input,
+    continuous_cols=["price"],
+    mlp_hidden_dims=[16, 8],
+    mlp_dropout=[0.2, 0.2],
+)
+
+two_tower_model = ModelFuser([tab_mlp_user, tab_mlp_item], fusion_method="dot")
+
+model = WideDeep(deeptabular=two_tower_model)
+
+trainer = Trainer(model, objective="binary")
+
+trainer.fit(
+    X_tab=[X_user, X_item],
+    target=interactions.purchased.values,
+    n_epochs=1,
+    batch_size=32,
+)
+```
+
-**7. Tabular with a multi-target loss**
+**8. Tabular with a multi-target loss**
 
 This one is "a bonus" to illustrate the use of multi-target losses, more than
 actually a different architecture.
@@ -692,6 +791,28 @@
 encoder-decoder method and constrastive-denoising method. Please, see the
 documentation and the examples for details on this functionality, and all
 other options in the library.
 
+### The ``rec`` module
+
+This module was introduced as an extension to the existing components in the
+library, addressing questions and issues related to recommendation systems.
+While still under active development, it currently includes a select number
+of powerful recommendation models.
+
+It's worth noting that this library already supported the implementation of
+various recommendation algorithms using existing components. For example,
+models like Wide and Deep, Two-Tower, or Neural Collaborative Filtering could
+be constructed using the library's core functionalities.
+
+The recommendation algorithms in the `rec` module are:
+
+1. [DeepFM: A Factorization-Machine based Neural Network for CTR Prediction](https://arxiv.org/abs/1703.04247)
+2. (Deep) Field Aware Factorization Machine (FFM): a Deep Learning version of the algorithm presented in [Field-aware Factorization Machines in a Real-world Online Advertising System](https://arxiv.org/abs/1701.04099)
+3. [xDeepFM: Combining Explicit and Implicit Feature Interactions for Recommender Systems](https://arxiv.org/pdf/1803.05170)
+4. [Deep Interest Network for Click-Through Rate Prediction](https://arxiv.org/abs/1706.06978)
+
+These can all be used as the `deeptabular` component in the `WideDeep` model.
+See the examples for more details.
+
 ### Text and Images
 
 For the text component, `deeptext`, the library offers the following models:
diff --git a/mkdocs/site/installation.html b/mkdocs/site/installation.html
index 0cc41b2a..f321b41e 100644
[... asset-hash churn in the regenerated installation.html omitted ...]
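For a quick sense of how the new `rec` models plug in, here is a minimal sketch distilled from the `examples/scripts/movielens_fm.py` script added above (the toy dataframe and its column names are made up for illustration):

```python
import pandas as pd

from pytorch_widedeep import Trainer
from pytorch_widedeep.models import WideDeep
from pytorch_widedeep.models.rec import DeepFactorizationMachine
from pytorch_widedeep.preprocessing import TabPreprocessor

# toy interaction data: two categorical features and a binary target
df = pd.DataFrame(
    {
        "user_id": [1, 2, 3, 4] * 25,
        "item_id": [10, 20, 30, 40] * 25,
        "purchased": [0, 1, 0, 1] * 25,
    }
)

tab_preprocessor = TabPreprocessor(cat_embed_cols=["user_id", "item_id"], for_mf=True)
X_tab = tab_preprocessor.fit_transform(df)

dfm = DeepFactorizationMachine(
    column_idx=tab_preprocessor.column_idx,
    num_factors=8,
    cat_embed_input=tab_preprocessor.cat_embed_input,
    mlp_hidden_dims=[16, 8],
)

# any of the rec models can be used as the deeptabular component of WideDeep
model = WideDeep(deeptabular=dfm)
trainer = Trainer(model, objective="binary")
trainer.fit(X_tab=X_tab, target=df["purchased"].values, n_epochs=1, batch_size=32)
```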
model. This is a linear model where the
+
Defines a Wide
model. This is a linear model where the
non-linearlities are captured via crossed-columns
>>> import torch
>>> from pytorch_widedeep.bayesian_models import BayesianWide
>>> X = torch.empty(4, 4).random_(6)
->>> wide = BayesianWide(input_dim=X.unique().size(0), pred_dim=1)
+>>> wide = BayesianWide(input_dim=int(X.max().item()), pred_dim=1)
>>> out = wide(X)
pytorch_widedeep/bayesian_models/tabular/bayesian_linear/bayesian_wide.py