Machine learning · Data engineering
useful processing
May 17, 2019 · 5 minute read

1. What is useful processing?

2. Examples

one-hot encoding

R

# data.table
dt_iris <- data.table::as.data.table(iris)
mltools::one_hot(dt_iris)
##      Sepal.Length Sepal.Width Petal.Length Petal.Width Species_setosa
##   1:          5.1         3.5          1.4         0.2              1
##   2:          4.9         3.0          1.4         0.2              1
##   3:          4.7         3.2          1.3         0.2              1
##   4:          4.6         3.1          1.5         0.2              1
##   5:          5.0         3.6          1.4         0.2              1
##  ---                                                                 
## 146:          6.7         3.0          5.2         2.3              0
## 147:          6.3         2.5          5.0         1.9              0
## 148:          6.5         3.0          5.2         2.0              0
## 149:          6.2         3.4          5.4         2.3              0
## 150:          5.9         3.0          5.1         1.8              0
##      Species_versicolor Species_virginica
##   1:                  0                 0
##   2:                  0                 0
##   3:                  0                 0
##   4:                  0                 0
##   5:                  0                 0
##  ---                                     
## 146:                  0                 1
## 147:                  0                 1
## 148:                  0                 1
## 149:                  0                 1
## 150:                  0                 1
# caret
library(caret)
dummy <- caret::dummyVars(" ~ .", data = iris)
head(predict(dummy, iris))
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species.setosa
## 1          5.1         3.5          1.4         0.2              1
## 2          4.9         3.0          1.4         0.2              1
## 3          4.7         3.2          1.3         0.2              1
## 4          4.6         3.1          1.5         0.2              1
## 5          5.0         3.6          1.4         0.2              1
## 6          5.4         3.9          1.7         0.4              1
##   Species.versicolor Species.virginica
## 1                  0                 0
## 2                  0                 0
## 3                  0                 0
## 4                  0                 0
## 5                  0                 0
## 6                  0                 0
# dplyr is not that clever
library(dplyr)
iris %>%
  mutate("Species_setosa" = ifelse(Species == "setosa", 1, 0)) %>%
  mutate("Species_virgninica" = ifelse(Species == "virgninica", 1, 0)) %>%
  mutate("Species_versicolor" = ifelse(Species == "versicolor", 1, 0)) %>%
  head()
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species Species_setosa
## 1          5.1         3.5          1.4         0.2  setosa              1
## 2          4.9         3.0          1.4         0.2  setosa              1
## 3          4.7         3.2          1.3         0.2  setosa              1
## 4          4.6         3.1          1.5         0.2  setosa              1
## 5          5.0         3.6          1.4         0.2  setosa              1
## 6          5.4         3.9          1.7         0.4  setosa              1
##   Species_virginica Species_versicolor
## 1                 0                  0
## 2                 0                  0
## 3                 0                  0
## 4                 0                  0
## 5                 0                  0
## 6                 0                  0

As you can see, mltools::one_hot() and caret::dummyVars() both recognised the factor column (Species) and expanded it into binary indicator variables, while with dplyr every indicator has to be written out by hand.

TODO: library(dummies); library(onehot)

Python

# https://machinelearningmastery.com/how-to-one-hot-encode-sequence-data-in-python/
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn import datasets

data = datasets.load_iris()
y = [data.target_names[i] for i in data.target]

# integer encode
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(y)
print(integer_encoded[:5])
## [0 0 0 0 0]
# binary encode
onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
## /usr/local/lib/python3.5/dist-packages/sklearn/preprocessing/_encoders.py:415: FutureWarning: The handling of integer data will change in version 0.22. Currently, the categories are determined based on the range [0, max(values)], while in the future they will be determined based on the unique values.
## If you want the future behaviour and silence this warning, you can specify "categories='auto'".
## In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
##   warnings.warn(msg, FutureWarning)
print(onehot_encoded[:5])
## [[1. 0. 0.]
##  [1. 0. 0.]
##  [1. 0. 0.]
##  [1. 0. 0.]
##  [1. 0. 0.]]
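
As the warning above hints, a recent OneHotEncoder can also take the string labels directly, skipping the LabelEncoder step. A minimal sketch, assuming scikit-learn >= 0.20:

# one-hot encode the string labels in one go (no LabelEncoder needed)
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn import datasets

data = datasets.load_iris()
labels = np.array([data.target_names[i] for i in data.target]).reshape(-1, 1)

encoder = OneHotEncoder(sparse=False, categories='auto')
print(encoder.fit_transform(labels)[:5])  # same [1. 0. 0.] rows as above
print(encoder.categories_)                # the discovered category levels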

scaling

In R it’s extremely simple.

base R

scale(1:5)
##            [,1]
## [1,] -1.2649111
## [2,] -0.6324555
## [3,]  0.0000000
## [4,]  0.6324555
## [5,]  1.2649111
## attr(,"scaled:center")
## [1] 3
## attr(,"scaled:scale")
## [1] 1.581139
mean(1:5)
## [1] 3
sd(1:5)
## [1] 1.581139
sc <- scale(iris[,1:4])
head(sc)
##      Sepal.Length Sepal.Width Petal.Length Petal.Width
## [1,]   -0.8976739  1.01560199    -1.335752   -1.311052
## [2,]   -1.1392005 -0.13153881    -1.335752   -1.311052
## [3,]   -1.3807271  0.32731751    -1.392399   -1.311052
## [4,]   -1.5014904  0.09788935    -1.279104   -1.311052
## [5,]   -1.0184372  1.24503015    -1.335752   -1.311052
## [6,]   -0.5353840  1.93331463    -1.165809   -1.048667
attributes(sc)
## $dim
## [1] 150   4
## 
## $dimnames
## $dimnames[[1]]
## NULL
## 
## $dimnames[[2]]
## [1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width" 
## 
## 
## $`scaled:center`
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##     5.843333     3.057333     3.758000     1.199333 
## 
## $`scaled:scale`
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##    0.8280661    0.4358663    1.7652982    0.7622377

Python

from sklearn.preprocessing import scale, StandardScaler
from sklearn import datasets
import numpy as np

# simple approach
sc = scale(np.arange(1, 6))
print(np.std(sc))
## 0.9999999999999999
print(np.mean(sc))
## 0.0

# and a way compatible with pandas
data = datasets.load_iris()
X, y = data.data, data.target

scaler = StandardScaler()
scaled_df = scaler.fit_transform(X)

print(scaled_df.mean(axis=0))
## [-1.69031455e-15 -1.84297022e-15 -1.69864123e-15 -1.40924309e-15]
print(scaled_df.std(axis=0))
## [1. 1. 1. 1.]
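
Just like scale() in R keeps the scaled:center and scaled:scale attributes, a fitted StandardScaler remembers the training mean and standard deviation, so the same transformation can be applied to new data and inverted. A small sketch (the new observation is made up):

# reuse the fitted scaler on data it has not seen
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn import datasets

X = datasets.load_iris().data
scaler = StandardScaler().fit(X)

print(scaler.mean_)   # per-column means, like attr(sc, "scaled:center") in R
print(scaler.scale_)  # per-column standard deviations, like "scaled:scale"

x_new = np.array([[5.0, 3.5, 1.5, 0.2]])  # hypothetical new observation
print(scaler.transform(x_new))            # scaled with the training statistics
print(scaler.inverse_transform(scaler.transform(x_new)))  # back to original units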

splitting your dataset into train and test subsets

The idea for the following solution comes from this post on Stack Overflow.

base R

train_test_split <- function(train_proportion = 0.75, dataset) {
    # number of observations that go into the training set
    smp_size <- floor(train_proportion * nrow(dataset))
    train_ind <- sample(seq_len(nrow(dataset)), size = smp_size)

    train <- dataset[train_ind, ]
    test <- dataset[-train_ind, ]
    return(list(train = train, test = test))
}
library(gsubfn)
## Warning: no DISPLAY variable so Tk is not available
list[train, test] <- train_test_split(0.8, iris)

R - caret

library(caret)
# ..., the random sampling is done within the
# levels of ‘y’ when ‘y’ is a factor in an attempt to balance the class
# distributions within the splits.
# I give the package name before the function name for clarity
trainIndex <- caret::createDataPartition(iris$Species, p=0.7, list = FALSE, 
                                         times = 1)
train <- iris[trainIndex,]
test <- iris[-trainIndex,]

Python - sklearn

from sklearn import datasets
from sklearn.model_selection import train_test_split

data = datasets.load_iris()

X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)
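
caret's createDataPartition balanced the class distributions within the splits; scikit-learn can do the same with the stratify argument. A short sketch (stratify is my addition, the split above does not use it):

# stratified split: class proportions are preserved in both subsets
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split

X, y = datasets.load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y)

print(np.bincount(y_train))  # roughly equal class counts in the training set
print(np.bincount(y_test))   # and in the test set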

Python - pandas

import pandas as pd
from sklearn import datasets

data = datasets.load_iris()
df = pd.DataFrame(data.data, columns=data.feature_names)

train = df.sample(frac=0.8, random_state=42)  # 80% of the rows, sampled at random
test = df.drop(train.index)                   # the remaining 20%

sklearn pipeline

from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn import datasets

data = datasets.load_iris()
X, y = data.data, data.target

# one way
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

svc = SVC()
svc.fit(X_scaled, y)
## SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
##     decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
##     kernel='rbf', max_iter=-1, probability=False, random_state=None,
##     shrinking=True, tol=0.001, verbose=False)

# or another - with pipeline
from sklearn.pipeline import Pipeline

svc = Pipeline([('scaler', StandardScaler()), ('SVM', SVC())])
svc.fit(X, y)
## Pipeline(memory=None,
##          steps=[('scaler',
##                  StandardScaler(copy=True, with_mean=True, with_std=True)),
##                 ('SVM',
##                  SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
##                      decision_function_shape='ovr', degree=3,
##                      gamma='auto_deprecated', kernel='rbf', max_iter=-1,
##                      probability=False, random_state=None, shrinking=True,
##                      tol=0.001, verbose=False))],
##          verbose=False)
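
Why bother with the pipeline at all? Mainly because during cross-validation the scaler is re-fitted on every training fold, so nothing from the held-out fold leaks into the scaling step. A quick sketch reusing svc, X and y from the block above (cv=5 is an arbitrary choice):

from sklearn.model_selection import cross_val_score

# each fold scales its own training part before fitting the SVM
print(cross_val_score(svc, X, y, cv=5))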

TODO: pd.get_dummies()
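
A quick sketch for the TODO: pandas' get_dummies() one-hot encodes object/categorical columns and leaves the numeric ones untouched (the Species column is built here only so there is something categorical to encode):

import pandas as pd
from sklearn import datasets

data = datasets.load_iris()
df = pd.DataFrame(data.data, columns=data.feature_names)
df["Species"] = pd.Categorical.from_codes(data.target, data.target_names)

# adds Species_setosa, Species_versicolor and Species_virginica columns
print(pd.get_dummies(df).head())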