R
# data.table
dt_iris <- data.table::as.data.table(iris)
# one_hot() turns the factor column (Species) into 0/1 indicator columns
mltools::one_hot(dt_iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species_setosa
## 1: 5.1 3.5 1.4 0.2 1
## 2: 4.9 3.0 1.4 0.2 1
## 3: 4.7 3.2 1.3 0.2 1
## 4: 4.6 3.1 1.5 0.2 1
## 5: 5.0 3.6 1.4 0.2 1
## ---
## 146: 6.7 3.0 5.2 2.3 0
## 147: 6.3 2.5 5.0 1.9 0
## 148: 6.5 3.0 5.2 2.0 0
## 149: 6.2 3.4 5.4 2.3 0
## 150: 5.9 3.0 5.1 1.8 0
## Species_versicolor Species_virginica
## 1: 0 0
## 2: 0 0
## 3: 0 0
## 4: 0 0
## 5: 0 0
## ---
## 146: 0 1
## 147: 0 1
## 148: 0 1
## 149: 0 1
## 150: 0 1
# caret
library(caret)
# dummyVars() builds the encoding recipe; predict() applies it to a data frame
dummy <- caret::dummyVars(" ~ .", data = iris)
head(predict(dummy, iris))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species.setosa
## 1 5.1 3.5 1.4 0.2 1
## 2 4.9 3.0 1.4 0.2 1
## 3 4.7 3.2 1.3 0.2 1
## 4 4.6 3.1 1.5 0.2 1
## 5 5.0 3.6 1.4 0.2 1
## 6 5.4 3.9 1.7 0.4 1
## Species.versicolor Species.virginica
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
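As a side note, if the dummy columns are going to feed a linear model, it can help to drop one level per factor to avoid perfect collinearity; a small sketch using dummyVars' fullRank argument:
# fullRank = TRUE keeps k-1 indicator columns per factor instead of k
dummy_fr <- caret::dummyVars(" ~ .", data = iris, fullRank = TRUE)
head(predict(dummy_fr, iris), 3)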
# dplyr is not that clever - there is no built-in one-hot encoder, so the dummy columns are spelled out by hand
library(dplyr)
iris %>%
mutate("Species_setosa" = ifelse(Species == "setosa", 1, 0)) %>%
mutate("Species_virgninica" = ifelse(Species == "virgninica", 1, 0)) %>%
mutate("Species_versicolor" = ifelse(Species == "versicolor", 1, 0)) %>%
head()
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species Species_setosa
## 1 5.1 3.5 1.4 0.2 setosa 1
## 2 4.9 3.0 1.4 0.2 setosa 1
## 3 4.7 3.2 1.3 0.2 setosa 1
## 4 4.6 3.1 1.5 0.2 setosa 1
## 5 5.0 3.6 1.4 0.2 setosa 1
## 6 5.4 3.9 1.7 0.4 setosa 1
## Species_virginica Species_versicolor
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
As you can see, caret recognised the factor variable (Species) and converted it into binary dummy variables.
TODO: library(dummies); library(onehot)
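Until that TODO is done, a rough sketch: the dummies and onehot calls below are written from memory of those packages' APIs (treat them as assumptions, hence commented out), while model.matrix is plain base R.
# base R: model.matrix() one-hot encodes factors; "- 1" drops the intercept column
head(model.matrix(~ Species - 1, data = iris))
# assumed APIs of the CRAN packages mentioned in the TODO (not verified here):
# dummies::dummy.data.frame(iris)   # expands every factor column into 0/1 columns
# enc <- onehot::onehot(iris)       # build an encoder ...
# head(predict(enc, iris))          # ... then apply it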
Python
# https://machinelearningmastery.com/how-to-one-hot-encode-sequence-data-in-python/
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn import datasets
data = datasets.load_iris()
y = [data.target_names[i] for i in data.target]
# integer encode
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(y)
print(integer_encoded[:5])
## [0 0 0 0 0]
# binary encode
# categories='auto' avoids a FutureWarning about integer categories in older sklearn
onehot_encoder = OneHotEncoder(categories='auto', sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
print(onehot_encoded[:5])
## [[1. 0. 0.]
## [1. 0. 0.]
## [1. 0. 0.]
## [1. 0. 0.]
## [1. 0. 0.]]
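As a side note, newer sklearn versions (0.20+) can one-hot encode the string labels directly, so the LabelEncoder step can be skipped; a minimal sketch:
import numpy as np
enc = OneHotEncoder(categories='auto')
# fit_transform returns a sparse matrix, hence toarray()
print(enc.fit_transform(np.array(y).reshape(-1, 1)).toarray()[:5])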
Scaling (standardising to zero mean and unit standard deviation) is extremely simple in R.
base R
scale(1:5)
## [,1]
## [1,] -1.2649111
## [2,] -0.6324555
## [3,] 0.0000000
## [4,] 0.6324555
## [5,] 1.2649111
## attr(,"scaled:center")
## [1] 3
## attr(,"scaled:scale")
## [1] 1.581139
mean(1:5)
## [1] 3
sd(1:5)
## [1] 1.581139
# scale() centres and scales each column; the means and sds are kept as attributes
sc <- scale(iris[, 1:4])
head(sc)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## [1,] -0.8976739 1.01560199 -1.335752 -1.311052
## [2,] -1.1392005 -0.13153881 -1.335752 -1.311052
## [3,] -1.3807271 0.32731751 -1.392399 -1.311052
## [4,] -1.5014904 0.09788935 -1.279104 -1.311052
## [5,] -1.0184372 1.24503015 -1.335752 -1.311052
## [6,] -0.5353840 1.93331463 -1.165809 -1.048667
attributes(sc)
## $dim
## [1] 150 4
##
## $dimnames
## $dimnames[[1]]
## NULL
##
## $dimnames[[2]]
## [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
##
##
## $`scaled:center`
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 5.843333 3.057333 3.758000 1.199333
##
## $`scaled:scale`
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 0.8280661 0.4358663 1.7652982 0.7622377
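The stored attributes make it easy to apply the same centring and scaling to new data (e.g. a test set); a quick sketch reusing sc from above:
# reuse the training means/sds on (hypothetically) new observations
new_obs <- iris[1:3, 1:4]
scale(new_obs,
      center = attr(sc, "scaled:center"),
      scale = attr(sc, "scaled:scale"))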
Python
from sklearn.preprocessing import scale, StandardScaler
from sklearn import datasets
import numpy as np
# simple approach
sc = scale(np.arange(1, 6))
print(np.std(sc))
## 0.9999999999999999
print(np.mean(sc))
## 0.0
# and a way compatible with pandas
data = datasets.load_iris()
X, y = data.data, data.target
scaler = StandardScaler()
scaled_df = scaler.fit_transform(X)
print(scaled_df.mean(axis=0))
## [-1.69031455e-15 -1.84297022e-15 -1.69864123e-15 -1.40924309e-15]
print(scaled_df.std(axis=0))
## [1. 1. 1. 1.]
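One practical note: the scaler should be fitted on training data only and then applied to new data with transform; a minimal sketch reusing scaler from above:
# apply the stored means/stds to (hypothetically) unseen rows
X_new = X[:3]
print(scaler.transform(X_new))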
The idea for the following train/test split solution comes from a post on Stack Overflow.
base R
train_test_split <- function(train_proportion = 0.75, dataset) {
  # note: the proportion refers to the size of the *training* set
  smp_size <- floor(train_proportion * nrow(dataset))
  train_ind <- sample(seq_len(nrow(dataset)), size = smp_size)
  train <- dataset[train_ind, ]
  test <- dataset[-train_ind, ]
  return(list(train = train, test = test))
}
library(gsubfn)
## Warning: no DISPLAY variable so Tk is not available
# gsubfn's list[...] idiom unpacks the returned list into two objects
list[train, test] <- train_test_split(0.8, iris)
caret R
library(caret)
# ..., the random sampling is done within the
# levels of ‘y’ when ‘y’ is a factor in an attempt to balance the class
# distributions within the splits.
# the package name is given before the function name for clarity
trainIndex <- caret::createDataPartition(iris$Species, p=0.7, list = FALSE,
times = 1)
train <- iris[trainIndex,]
test <- iris[-trainIndex,]
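For a reproducible split, it may help to set the seed first; a trivial sketch (the seed value is arbitrary):
set.seed(42)  # any value works; only needed for reproducibility
trainIndex <- caret::createDataPartition(iris$Species, p = 0.7, list = FALSE)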
Python - sklearn
from sklearn import datasets
from sklearn.model_selection import train_test_split
data = datasets.load_iris()
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.33, random_state=42)
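caret's createDataPartition stratifies by the outcome; sklearn does the same if the labels are passed to the stratify argument, e.g.:
# stratified variant - class proportions are preserved in both splits
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, stratify=y, random_state=42)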
Python - pandas
import pandas as pd
# wrap the features in a DataFrame, sample 80% of the rows for training
# and use the remaining indices as the test set
df = pd.DataFrame(X, columns=data.feature_names)
train = df.sample(frac=0.8, random_state=42)
test = df.drop(train.index)
Scaling is usually combined with a model; below, first by hand and then with an sklearn Pipeline, which keeps the preprocessing and the estimator in one object.
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn import datasets
data = datasets.load_iris()
X, y = data.data, data.target
# one way
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
svc = SVC()
svc.fit(X_scaled, y)
## SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
## decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
## kernel='rbf', max_iter=-1, probability=False, random_state=None,
## shrinking=True, tol=0.001, verbose=False)
# or another - with pipeline
from sklearn.pipeline import Pipeline
svc = Pipeline([('scaler', StandardScaler()), ('SVM', SVC())])
svc.fit(X, y)
## Pipeline(memory=None,
## steps=[('scaler',
## StandardScaler(copy=True, with_mean=True, with_std=True)),
## ('SVM',
## SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
## decision_function_shape='ovr', degree=3,
## gamma='auto_deprecated', kernel='rbf', max_iter=-1,
## probability=False, random_state=None, shrinking=True,
## tol=0.001, verbose=False))],
## verbose=False)
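The pipeline behaves like any other estimator, so it can be plugged straight into prediction or cross-validation; a small sketch:
from sklearn.model_selection import cross_val_score
# the scaler is re-fitted on each training fold, so there is no leakage
print(cross_val_score(svc, X, y, cv=5))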
TODO: pd.get_dummies()
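A minimal sketch for that TODO, assuming the species names are stored as a column of a pandas DataFrame:
import pandas as pd
from sklearn import datasets
data = datasets.load_iris()
df = pd.DataFrame(data.data, columns=data.feature_names)
df["Species"] = [data.target_names[i] for i in data.target]
# get_dummies() expands the string column into 0/1 indicator columns
print(pd.get_dummies(df, columns=["Species"]).head())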