Machine learning · Data engineering
useful processing
May 17, 2019 · 5 minute read

1. What is useful processing?

2. Examples

one-hot encoding

R

# data.table
dt_iris <- data.table::as.data.table(iris)
mltools::one_hot(dt_iris)
##      Sepal.Length Sepal.Width Petal.Length Petal.Width Species_setosa
##   1:          5.1         3.5          1.4         0.2              1
##   2:          4.9         3.0          1.4         0.2              1
##   3:          4.7         3.2          1.3         0.2              1
##   4:          4.6         3.1          1.5         0.2              1
##   5:          5.0         3.6          1.4         0.2              1
##  ---                                                                 
## 146:          6.7         3.0          5.2         2.3              0
## 147:          6.3         2.5          5.0         1.9              0
## 148:          6.5         3.0          5.2         2.0              0
## 149:          6.2         3.4          5.4         2.3              0
## 150:          5.9         3.0          5.1         1.8              0
##      Species_versicolor Species_virginica
##   1:                  0                 0
##   2:                  0                 0
##   3:                  0                 0
##   4:                  0                 0
##   5:                  0                 0
##  ---                                     
## 146:                  0                 1
## 147:                  0                 1
## 148:                  0                 1
## 149:                  0                 1
## 150:                  0                 1
# caret
library(caret)
dummy <- caret::dummyVars(" ~ .", data = iris)
head(predict(dummy, iris))
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species.setosa
## 1          5.1         3.5          1.4         0.2              1
## 2          4.9         3.0          1.4         0.2              1
## 3          4.7         3.2          1.3         0.2              1
## 4          4.6         3.1          1.5         0.2              1
## 5          5.0         3.6          1.4         0.2              1
## 6          5.4         3.9          1.7         0.4              1
##   Species.versicolor Species.virginica
## 1                  0                 0
## 2                  0                 0
## 3                  0                 0
## 4                  0                 0
## 5                  0                 0
## 6                  0                 0
# dplyr is not that clever
library(dplyr)
iris %>%
  mutate("Species_setosa" = ifelse(Species == "setosa", 1, 0)) %>%
  mutate("Species_virgninica" = ifelse(Species == "virgninica", 1, 0)) %>%
  mutate("Species_versicolor" = ifelse(Species == "versicolor", 1, 0)) %>%
  head()
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species Species_setosa
## 1          5.1         3.5          1.4         0.2  setosa              1
## 2          4.9         3.0          1.4         0.2  setosa              1
## 3          4.7         3.2          1.3         0.2  setosa              1
## 4          4.6         3.1          1.5         0.2  setosa              1
## 5          5.0         3.6          1.4         0.2  setosa              1
## 6          5.4         3.9          1.7         0.4  setosa              1
##   Species_virginica Species_versicolor
## 1                 0                  0
## 2                 0                  0
## 3                 0                  0
## 4                 0                  0
## 5                 0                  0
## 6                 0                  0

As you can see, mltools::one_hot() and caret::dummyVars() both recognised the factor column (Species) and expanded it into binary indicator variables, while with dplyr every indicator has to be written out by hand.

TODO: library(dummies); library(onehot)

Python

# https://machinelearningmastery.com/how-to-one-hot-encode-sequence-data-in-python/
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn import datasets

data = datasets.load_iris()
y = [data.target_names[i] for i in data.target]

# integer encode
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(y)
print(integer_encoded[:5])
## [0 0 0 0 0]
# binary encode
onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
## /usr/local/lib/python3.5/dist-packages/sklearn/preprocessing/_encoders.py:415: FutureWarning: The handling of integer data will change in version 0.22. Currently, the categories are determined based on the range [0, max(values)], while in the future they will be determined based on the unique values.
## If you want the future behaviour and silence this warning, you can specify "categories='auto'".
## In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
##   warnings.warn(msg, FutureWarning)
print(onehot_encoded[:5])
## [[1. 0. 0.]
##  [1. 0. 0.]
##  [1. 0. 0.]
##  [1. 0. 0.]
##  [1. 0. 0.]]
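
As the warning above hints, a recent OneHotEncoder can also take the string labels directly, skipping the LabelEncoder step. A minimal sketch, assuming scikit-learn >= 0.20:

# one-hot encode the string labels in one go (no LabelEncoder needed)
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn import datasets

data = datasets.load_iris()
labels = np.array([data.target_names[i] for i in data.target]).reshape(-1, 1)

encoder = OneHotEncoder(sparse=False, categories='auto')
print(encoder.fit_transform(labels)[:5])  # same [1. 0. 0.] rows as above
print(encoder.categories_)                # the discovered category levels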

scaling

In R it’s extremely simple.

base R

scale(1:5)
##            [,1]
## [1,] -1.2649111
## [2,] -0.6324555
## [3,]  0.0000000
## [4,]  0.6324555
## [5,]  1.2649111
## attr(,"scaled:center")
## [1] 3
## attr(,"scaled:scale")
## [1] 1.581139
mean(1:5)
## [1] 3
sd(1:5)
## [1] 1.581139
sc <- scale(iris[,1:4])
head(sc)
##      Sepal.Length Sepal.Width Petal.Length Petal.Width
## [1,]   -0.8976739  1.01560199    -1.335752   -1.311052
## [2,]   -1.1392005 -0.13153881    -1.335752   -1.311052
## [3,]   -1.3807271  0.32731751    -1.392399   -1.311052
## [4,]   -1.5014904  0.09788935    -1.279104   -1.311052
## [5,]   -1.0184372  1.24503015    -1.335752   -1.311052
## [6,]   -0.5353840  1.93331463    -1.165809   -1.048667
attributes(sc)
## $dim
## [1] 150   4
## 
## $dimnames
## $dimnames[[1]]
## NULL
## 
## $dimnames[[2]]
## [1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width" 
## 
## 
## $`scaled:center`
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##     5.843333     3.057333     3.758000     1.199333 
## 
## $`scaled:scale`
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##    0.8280661    0.4358663    1.7652982    0.7622377

Python

from sklearn.preprocessing import scale, StandardScaler
from sklearn import datasets
import numpy as np

# simple approach
sc = scale(np.arange(1, 6))
print(np.std(sc))
## 0.9999999999999999
print(np.mean(sc))
## 0.0

# and a way compatible with pandas
data = datasets.load_iris()
X, y = data.data, data.target

scaler = StandardScaler()
scaled_df = scaler.fit_transform(X)

print(scaled_df.mean(axis=0))
## [-1.69031455e-15 -1.84297022e-15 -1.69864123e-15 -1.40924309e-15]
print(scaled_df.std(axis=0))
## [1. 1. 1. 1.]
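
Just like scale() in R keeps the scaled:center and scaled:scale attributes, a fitted StandardScaler remembers the training mean and standard deviation, so the same transformation can be applied to new data and inverted. A small sketch (the new observation is made up):

# reuse the fitted scaler on data it has not seen
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn import datasets

X = datasets.load_iris().data
scaler = StandardScaler().fit(X)

print(scaler.mean_)   # per-column means, like attr(sc, "scaled:center") in R
print(scaler.scale_)  # per-column standard deviations, like "scaled:scale"

x_new = np.array([[5.0, 3.5, 1.5, 0.2]])  # hypothetical new observation
print(scaler.transform(x_new))            # scaled with the training statistics
print(scaler.inverse_transform(scaler.transform(x_new)))  # back to original units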

splitting your dataset into train and test subsets

The idea for the following solution comes from this post on Stack Overflow.

base R

train_test_split <- function(train_proportion = 0.75, dataset) {
    # number of observations that go into the training set
    smp_size <- floor(train_proportion * nrow(dataset))
    train_ind <- sample(seq_len(nrow(dataset)), size = smp_size)

    train <- dataset[train_ind, ]
    test <- dataset[-train_ind, ]
    return(list(train = train, test = test))
}
library(gsubfn)
## Warning: no DISPLAY variable so Tk is not available
list[train, test] <- train_test_split(0.8, iris)

R - caret

library(caret)
# ..., the random sampling is done within the
# levels of ‘y’ when ‘y’ is a factor in an attempt to balance the class
# distributions within the splits.
# I give the package name before the function name for clarity
trainIndex <- caret::createDataPartition(iris$Species, p=0.7, list = FALSE, 
                                         times = 1)
train <- iris[trainIndex,]
test <- iris[-trainIndex,]

Python - sklearn

from sklearn import datasets
from sklearn.model_selection import train_test_split

data = datasets.load_iris()

X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)
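
caret's createDataPartition balanced the class distributions within the splits; scikit-learn can do the same with the stratify argument. A short sketch (stratify is my addition, the split above does not use it):

# stratified split: class proportions are preserved in both subsets
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split

X, y = datasets.load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y)

print(np.bincount(y_train))  # roughly equal class counts in the training set
print(np.bincount(y_test))   # and in the test set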

Python - pandas

import pandas as pd
from sklearn import datasets

data = datasets.load_iris()
df = pd.DataFrame(data.data, columns=data.feature_names)

train = df.sample(frac=0.8, random_state=42)  # 80% of the rows, sampled at random
test = df.drop(train.index)                   # the remaining 20%

sklearn pipeline

from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn import datasets

data = datasets.load_iris()
X, y = data.data, data.target

# one way
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

svc = SVC()
svc.fit(X_scaled, y)
## SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
##     decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
##     kernel='rbf', max_iter=-1, probability=False, random_state=None,
##     shrinking=True, tol=0.001, verbose=False)

# or another - with pipeline
from sklearn.pipeline import Pipeline

svc = Pipeline([('scaler', StandardScaler()), ('SVM', SVC())])
svc.fit(X, y)
## Pipeline(memory=None,
##          steps=[('scaler',
##                  StandardScaler(copy=True, with_mean=True, with_std=True)),
##                 ('SVM',
##                  SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
##                      decision_function_shape='ovr', degree=3,
##                      gamma='auto_deprecated', kernel='rbf', max_iter=-1,
##                      probability=False, random_state=None, shrinking=True,
##                      tol=0.001, verbose=False))],
##          verbose=False)
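
Why bother with the pipeline at all? Mainly because during cross-validation the scaler is re-fitted on every training fold, so nothing from the held-out fold leaks into the scaling step. A quick sketch reusing svc, X and y from the block above (cv=5 is an arbitrary choice):

from sklearn.model_selection import cross_val_score

# each fold scales its own training part before fitting the SVM
print(cross_val_score(svc, X, y, cv=5))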

TODO: pd.get_dummies()
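
A quick sketch for the TODO: pandas' get_dummies() one-hot encodes object/categorical columns and leaves the numeric ones untouched (the Species column is built here only so there is something categorical to encode):

import pandas as pd
from sklearn import datasets

data = datasets.load_iris()
df = pd.DataFrame(data.data, columns=data.feature_names)
df["Species"] = pd.Categorical.from_codes(data.target, data.target_names)

# adds Species_setosa, Species_versicolor and Species_virginica columns
print(pd.get_dummies(df).head())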