-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbaf.py
84 lines (65 loc) · 3.95 KB
/
baf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
class BAFDataset:
    """Helpers to load, split and one-hot encode the BAF dataset."""

    # File stems accepted by ``load_data``.
    VALID_SUBSETS = (
        'Base',
        'Variant I',
        'Variant II',
        'Variant III',
        'Variant IV',
        'Variant V',
    )

    def load_data(self, file_path):
        """
        Load one subset of the BAF dataset as a pandas DataFrame.

        The subset is taken from the file name: everything before the first
        "." in the last path component (e.g. "data/Variant I.csv" gives
        "Variant I"). The 'device_fraud_count' column is dropped.

        Args:
        - file_path (str or os.PathLike): Path to the CSV file. Its name must start with one of 'Base', 'Variant I', 'Variant II', 'Variant III', 'Variant IV' or 'Variant V'.

        Returns:
        - df (pd.DataFrame): The BAF dataset without the 'device_fraud_count' column.

        Raises:
        - ValueError: If the file name does not denote a known subset.
        - KeyError: If the CSV has no 'device_fraud_count' column.
        """
        # Local import keeps this block self-contained; PurePath understands
        # the platform's separators and os.PathLike objects.
        from pathlib import PurePath

        ds_name = PurePath(file_path).name
        subset = ds_name.split(".")[0]
        # Validate before touching the file system.
        if subset not in self.VALID_SUBSETS:
            raise ValueError("Invalid subset type. Choices are 'Base', 'Variant I', 'Variant II', 'Variant III', 'Variant IV', and 'Variant V'.")

        return pd.read_csv(file_path).drop(columns='device_fraud_count')

    def train_test_split(self, df, month=6):
        """
        Split the BAF dataset into train and test sets based on the specified month.

        Rows with ``df['month'] < month`` form the train set and all remaining
        rows form the test set. The 'month' and 'fraud_bool' columns are
        removed from the features; 'fraud_bool' is returned as the target.

        Args:
        - df (pd.DataFrame): The BAF dataset as a pandas DataFrame.
        - month (int): The first month of the test set. Defaults to 6, i.e. months 0-5 are in the train set, and 6-7 are in the test set, as proposed in the paper.

        Returns:
        - (X_train, y_train), (X_test, y_test): Feature DataFrames and target Series for the train and test sets.
        """
        train_mask = df['month'] < month
        # Materialise each partition once instead of re-filtering per output.
        train_df = df[train_mask]
        test_df = df[~train_mask]

        non_feature_cols = ['month', 'fraud_bool']
        X_train, y_train = train_df.drop(columns=non_feature_cols), train_df['fraud_bool']
        X_test, y_test = test_df.drop(columns=non_feature_cols), test_df['fraud_bool']
        return (X_train, y_train), (X_test, y_test)

    @staticmethod
    def _make_dense_encoder():
        """Build a dense-output OneHotEncoder that ignores unseen categories."""
        try:
            # scikit-learn >= 1.2 spelling; the old `sparse` keyword was
            # removed in 1.4.
            return OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        except TypeError:
            # Older scikit-learn releases only accept `sparse`.
            return OneHotEncoder(sparse=False, handle_unknown='ignore')

    def one_hot_encode_categorical(self, X_train, X_test):
        """
        One-hot encode the categorical features in the BAF dataset.

        Categorical features are the columns of ``X_train`` with dtype
        'object'. The encoder is fitted on the training set only and then
        applied to the test set, so no information from the test set leaks
        into the encoding; categories that appear only in the test set are
        encoded as all zeros. The encoded columns are appended after the
        remaining (numerical) columns and are named by their position
        ('0', '1', ...). The input frames are not modified.

        Args:
        - X_train (pd.DataFrame): The training set features.
        - X_test (pd.DataFrame): The test set features.

        Returns:
        - X_train, X_test (tuple of pd.DataFrame): The one-hot encoded train and test sets, with string column names.
        """
        # Identify the categorical features
        is_object = (X_train.dtypes == 'object')
        object_cols = list(is_object[is_object].index)

        if not object_cols:
            # Nothing to encode: fitting the encoder on zero columns would
            # fail, so only normalise the column names on copies.
            X_train, X_test = X_train.copy(), X_test.copy()
            X_train.columns = X_train.columns.astype(str)
            X_test.columns = X_test.columns.astype(str)
            return X_train, X_test

        ohe = self._make_dense_encoder()

        # Fit on train only; reuse the fitted encoder for test. The original
        # row index is kept so the concat below aligns row by row.
        ohe_cols_train = pd.DataFrame(
            ohe.fit_transform(X_train[object_cols]), index=X_train.index
        )
        ohe_cols_test = pd.DataFrame(
            ohe.transform(X_test[object_cols]), index=X_test.index
        )

        # Remove the original categorical features from the train and test sets
        num_X_train = X_train.drop(columns=object_cols)
        num_X_test = X_test.drop(columns=object_cols)

        # Concatenate the numerical features with the one-hot encoded categorical features
        X_train = pd.concat([num_X_train, ohe_cols_train], axis=1)
        X_test = pd.concat([num_X_test, ohe_cols_test], axis=1)

        # Ensure the column names are strings to meet newer sklearn requirements
        X_train.columns = X_train.columns.astype(str)
        X_test.columns = X_test.columns.astype(str)
        return X_train, X_test