12 Pre-processing

12.1 RT filtering

12.1.1 Function to filter by RT

Here you can see an example of how to use a function to filter by RT (a log-filtering version appears below).


We use the following dataset.


if (!require('readr')) install.packages('readr'); library('readr')
if (!require('dplyr')) install.packages('dplyr'); library('dplyr')
if (!require('purrr')) install.packages('purrr'); library('purrr')

# RT filtering function
Filter_FUN <- function(DB, Column, Low_PCT = 0.05, High_PCT = 0.95) {
  
  args = as.list(match.call())
  
  # Create Column_LOG (log of the selected column)
  DB = DB %>% mutate(Column_LOG = log(eval(args$Column, DB)))
  lowcut = quantile(DB$Column_LOG, Low_PCT, na.rm = TRUE)
  highcut = quantile(DB$Column_LOG, High_PCT, na.rm = TRUE)
  
  # Creates DB_Filtered in the global environment
  DB_Filtered <<- subset(DB, DB$Column_LOG > lowcut & DB$Column_LOG < highcut) 
  
  # return(DB_Filtered)
  cat("**** Data filtered using column", args$Column, ": ", nrow(DB) - nrow(DB_Filtered), "/", nrow(DB), "rows removed ***")
  
}

After creating the function, we read the data and apply the filtering. A data frame called DB_Filtered is created with the filtered rows.

dataset = read_csv("Data/10-Pre_processing/RT_filtering.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   ID = col_double(),
##   Dem_Gender = col_double(),
##   Dem_Age = col_double(),
##   Time = col_double(),
##   Formato = col_character(),
##   Accuracy = col_double()
## )
dataset
## # A tibble: 200 x 6
##       ID Dem_Gender Dem_Age  Time Formato Accuracy
##    <dbl>      <dbl>   <dbl> <dbl> <chr>      <dbl>
##  1     1          2      37  2.62 PR             0
##  2     2          2      32 23.3  PR             0
##  3     3          1     100  6.9  FN             0
##  4     4          2      34 12.9  FN             1
##  5     5          1      29 38.9  FN             1
##  6     6          2      25 42.5  FN             1
##  7     7          1      27  3.29 PR             0
##  8     8          1      29 16.4  PR             0
##  9     9          1      30 60.8  PR             1
## 10    10          2      28 26.2  PR             0
## # … with 190 more rows
Filter_FUN(dataset, Time)
## **** Data filtered using column Time :  20 / 200 rows removed ***
DB_Filtered
## # A tibble: 180 x 7
##       ID Dem_Gender Dem_Age  Time Formato Accuracy Column_LOG
##    <dbl>      <dbl>   <dbl> <dbl> <chr>      <dbl>      <dbl>
##  1     1          2      37  2.62 PR             0      0.962
##  2     2          2      32 23.3  PR             0      3.15 
##  3     3          1     100  6.9  FN             0      1.93 
##  4     4          2      34 12.9  FN             1      2.56 
##  5     5          1      29 38.9  FN             1      3.66 
##  6     6          2      25 42.5  FN             1      3.75 
##  7     7          1      27  3.29 PR             0      1.19 
##  8     8          1      29 16.4  PR             0      2.80 
##  9     9          1      30 60.8  PR             1      4.11 
## 10    10          2      28 26.2  PR             0      3.26 
## # … with 170 more rows
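
Filter_FUN assigns DB_Filtered to the global environment with <<-. A variant that simply returns the filtered data may be easier to reuse; a minimal sketch (Filter_FUN2 is just an illustrative name):

# Sketch: same percentile logic, but returning the filtered data instead of using <<-
Filter_FUN2 <- function(DB, Column, Low_PCT = 0.05, High_PCT = 0.95) {
  col_log <- log(eval(substitute(Column), DB))        # log of the selected column
  lowcut  <- quantile(col_log, Low_PCT, na.rm = TRUE)
  highcut <- quantile(col_log, High_PCT, na.rm = TRUE)
  DB[which(col_log > lowcut & col_log < highcut), ]   # keep rows within the cutoffs
}
# e.g. DB_Filtered2 = Filter_FUN2(dataset, Time)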

12.1.2 n SD from the median (MAD)3

Note that you probably should NOT use n SD around the mean; deviations around the median are more robust to outliers (Leys et al., 2013).


We use the following dataset.


dataset = read.csv("Data/10-Pre_processing/RT_filtering.csv")
head(dataset)
##   ID Dem_Gender Dem_Age   Time Formato Accuracy
## 1  1          2      37  2.618      PR        0
## 2  2          2      32 23.303      PR        0
## 3  3          1     100  6.900      FN        0
## 4  4          2      34 12.905      FN        1
## 5  5          1      29 38.895      FN        1
## 6  6          2      25 42.473      FN        1
  • Code
#SD criteria
SD_filter = 2.5

dataset_filtered = dataset %>% 
  filter(Time < (median(Time) + SD_filter * sd(Time)) & Time > (median(Time) - SD_filter * sd(Time)))

# dataset_filtered = dataset[dataset$Time < (median(dataset$Time) + SD_filter * sd(dataset$Time) )  & dataset$Time > (median(dataset$Time) - SD_filter * sd(dataset$Time) ), ]
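
The filter above uses n SD around the median. A version based on the MAD, as recommended by Leys et al. (2013), is sketched below (dataset_filtered_mad is just an illustrative name; mad() uses its default normal-consistency constant):

# MAD-based sketch: keep observations within median ± SD_filter * MAD
dataset_filtered_mad = dataset %>% 
  filter(Time < (median(Time) + SD_filter * mad(Time)) & Time > (median(Time) - SD_filter * mad(Time)))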
  • Plots
summary(dataset$Time)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.242   7.686  15.453  24.252  28.858 165.529
summary(dataset_filtered$Time)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.242   7.585  14.376  19.777  26.109  77.829
par(mfrow = c(1,2))
    #
    plot(density(dataset$Time), main = "Valid reaction time density")
    plot(density(dataset_filtered$Time), main = "Valid reaction time density")

par(mfrow = c(1,1))

12.1.3 Gamma law regression analyses4


We use the following dataset.


#Import to variable "dataset"
dataset = read.csv("Data/10-Pre_processing/RT_filtering.csv")
head(dataset)
##   ID Dem_Gender Dem_Age   Time Formato Accuracy
## 1  1          2      37  2.618      PR        0
## 2  2          2      32 23.303      PR        0
## 3  3          1     100  6.900      FN        0
## 4  4          2      34 12.905      FN        1
## 5  5          1      29 38.895      FN        1
## 6  6          2      25 42.473      FN        1
  • Code
# Gamma family
glm1 = glm(Time ~ Formato, data = dataset, family = Gamma)
summary(glm1)
## 
## Call:
## glm(formula = Time ~ Formato, family = Gamma, data = dataset)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0026  -0.9522  -0.4051   0.1645   2.7565  
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.042011   0.004558   9.217   <2e-16 ***
## FormatoPR   -0.001526   0.006330  -0.241     0.81    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 1.177074)
## 
##     Null deviance: 195.70  on 199  degrees of freedom
## Residual deviance: 195.63  on 198  degrees of freedom
## AIC: 1680.6
## 
## Number of Fisher Scoring iterations: 6
# Gaussian
glm2 = glm(Time ~ Formato, data = dataset)
summary(glm2)
## 
## Call:
## glm(formula = Time ~ Formato, data = dataset)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -22.811  -16.245   -8.387    4.293  140.829  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  23.8031     2.6346   9.035   <2e-16 ***
## FormatoPR     0.8972     3.7258   0.241     0.81    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 694.0939)
## 
##     Null deviance: 137471  on 199  degrees of freedom
## Residual deviance: 137431  on 198  degrees of freedom
## AIC: 1880.1
## 
## Number of Fisher Scoring iterations: 2
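
The AIC values reported above (1680.6 for the Gamma model vs. 1880.1 for the Gaussian one) already favour the Gamma fit for these reaction times; the two models can also be compared directly:

# Lower AIC indicates the better-fitting model
AIC(glm1, glm2)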

12.1.4 Logarithms

Log filtering may not be the best option available5


We use the following dataset.


#Import to variable "dataset"
dataset = read.csv("Data/10-Pre_processing/RT_filtering.csv")
head(dataset)
##   ID Dem_Gender Dem_Age   Time Formato Accuracy
## 1  1          2      37  2.618      PR        0
## 2  2          2      32 23.303      PR        0
## 3  3          1     100  6.900      FN        0
## 4  4          2      34 12.905      FN        1
## 5  5          1      29 38.895      FN        1
## 6  6          2      25 42.473      FN        1
  • Code
# Set low and high percentiles
    lowcutquant = 0.05
    highcutquant = 0.95
    
    # Add log RT
    dataset$logrt <- log(dataset$Time)
    
    # Take correct answers: we use the correct answers to set the time limits
    # (an alternative would be to compute the cutoffs on the full dataset)
    corr = subset(dataset, dataset$Accuracy == 1)
    lowcut = quantile(corr$logrt, lowcutquant)
    highcut = quantile(corr$logrt, highcutquant)
    dataset_filtered <- subset(dataset, dataset$logrt > lowcut & dataset$logrt < highcut)
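
The cutoffs are on the log scale; to see the limits in the original time units they can be back-transformed:

# Back-transform the log-scale cutoffs to the original time scale
exp(c(lowcut, highcut))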
  • Plots
plot(density(dataset_filtered$logrt), main = "Log reaction time density")

par(mfrow = c(1,2))

    #
    plot(density(dataset$Time[dataset$Time < 10]), main = "Density of small reaction times")
    plot(density(dataset_filtered$Time[dataset_filtered$Time < 10]), main = "Density of small reaction times")

    #
    plot(density(dataset$Time), main = "Valid reaction time density")
    plot(density(dataset_filtered$Time), main = "Valid reaction time density")

par(mfrow = c(1,1))

    #
    summary(dataset$Time)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.242   7.686  15.453  24.252  28.858 165.529
    summary(dataset_filtered$Time)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.770   9.318  16.484  21.066  26.824  64.141

12.1.5 References

12.2 Reliability

Below we show two ways to estimate reliability: Cronbach's alpha and omega.

12.2.1 Cronbach’s Alpha

https://www.ncbi.nlm.nih.gov/pubmed/28557467?dopt=Abstract

** TODO ** * Items negatively correlated * Alphadrop function to sort items

In R there are at least two functions:

  • cronbach.alpha() - library(ltm) - interferes with dplyr select()
  • alpha() - library(psych)
    • We use alpha() - it also gives item correlation information, etc.
  • Procedure ** WIP: references for this procedure (?) **
  1. Delete items with item-total correlations (r.drop) < 0.25 one by one (an automated sketch is shown at the end of this subsection):
    • Eliminate the item with the smallest correlation, min(a$item.stats[["r.drop"]])
    • Re-run the reliability analysis
    • Eliminate the next item with the smallest correlation (< 0.25!) - Keep at least 3 items!
  2. Reasonable Cronbach's alpha: > 0.6. Good: > 0.7
  • Load libraries
if (!require('psych')) install.packages('psych'); library('psych')
if (!require('readr')) install.packages('readr'); library('readr')
if (!require('dplyr')) install.packages('dplyr'); library('dplyr')

We use the following dataset.


temp = read_csv(here::here("Data/10-Pre_processing/Reliability.csv"))
## Warning: Missing column names filled in: 'X1' [1]
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   X1 = col_double(),
##   OA.OA01. = col_double(),
##   OA.OA02. = col_double(),
##   OA.OA03. = col_double(),
##   OA.OA04. = col_double(),
##   OA.OA05. = col_double(),
##   OA.OA06. = col_double()
## )
temp = temp[-1] 

# Convert everything to integer
# temp = temp %>% dplyr::mutate_each(funs(as.integer))
temp = as.data.frame(sapply(temp, as.integer))

head(temp)
##   OA.OA01. OA.OA02. OA.OA03. OA.OA04. OA.OA05. OA.OA06.
## 1        5        3        5        2        1        5
## 2        3        1        5        1        1        3
## 3        5        1        5        4        1        5
## 4        2        2        1        3        3        5
## 5        4        3        4        4        2        4
## 6        5        5        5        1        1        5
psych::alpha(temp)
## Warning in psych::alpha(temp): Some items were negatively correlated with the total scale and probably 
## should be reversed.  
## To do this, run the function again with the 'check.keys=TRUE' option
## Some items ( OA.OA02. OA.OA04. OA.OA05. ) were negatively correlated with the total scale and 
## probably should be reversed.  
## To do this, run the function again with the 'check.keys=TRUE' option
## 
## Reliability analysis   
## Call: psych::alpha(x = temp)
## 
##   raw_alpha std.alpha G6(smc) average_r   S/N   ase mean   sd median_r
##      0.062     0.087    0.22     0.016 0.096 0.094  3.5 0.48   -0.048
## 
##  lower alpha upper     95% confidence boundaries
## -0.12 0.06 0.25 
## 
##  Reliability if an item is dropped:
##          raw_alpha std.alpha G6(smc) average_r    S/N alpha se var.r  med.r
## OA.OA01.    -0.029    -0.075   0.090   -0.0141 -0.070    0.103 0.054 -0.058
## OA.OA02.    -0.011     0.073   0.218    0.0154  0.078    0.105 0.076 -0.059
## OA.OA03.     0.019    -0.053   0.068   -0.0102 -0.050    0.096 0.038 -0.025
## OA.OA04.    -0.013     0.064   0.214    0.0135  0.068    0.104 0.076 -0.024
## OA.OA05.     0.252     0.318   0.356    0.0852  0.466    0.078 0.042 -0.024
## OA.OA06.     0.045     0.022   0.116    0.0045  0.023    0.097 0.036 -0.024
## 
##  Item statistics 
##            n raw.r std.r r.cor r.drop mean   sd
## OA.OA01. 238  0.46  0.54  0.45  0.108  4.2 1.04
## OA.OA02. 238  0.55  0.43  0.11  0.071  3.2 1.40
## OA.OA03. 238  0.43  0.53  0.50  0.057  4.2 1.10
## OA.OA04. 238  0.50  0.43  0.12  0.079  2.7 1.25
## OA.OA05. 238  0.25  0.15 -0.38 -0.174  2.0 1.20
## OA.OA06. 238  0.30  0.47  0.39  0.038  4.6 0.76
## 
## Non missing response frequency for each item
##             1    2    3    4    5 miss
## OA.OA01. 0.01 0.08 0.14 0.21 0.55    0
## OA.OA02. 0.17 0.14 0.27 0.16 0.25    0
## OA.OA03. 0.02 0.10 0.08 0.21 0.58    0
## OA.OA04. 0.19 0.24 0.33 0.11 0.13    0
## OA.OA05. 0.50 0.21 0.16 0.07 0.05    0
## OA.OA06. 0.00 0.03 0.09 0.16 0.73    0
# temp2 = temp %>% select(-1, -OA.OA05.)
# alpha(temp2)
# temp2 = temp %>% select(-1, -OA.OA05., -OA.OA04.)
# alpha(temp2)
temp2 = temp %>% dplyr::select(-1, -OA.OA05., -OA.OA04., -OA.OA02.)
psych::alpha(temp2)
## 
## Reliability analysis   
## Call: psych::alpha(x = temp2)
## 
##   raw_alpha std.alpha G6(smc) average_r S/N   ase mean   sd median_r
##       0.57       0.6    0.43      0.43 1.5 0.051  4.4 0.79     0.43
## 
##  lower alpha upper     95% confidence boundaries
## 0.47 0.57 0.67 
## 
##  Reliability if an item is dropped:
##          raw_alpha std.alpha G6(smc) average_r  S/N alpha se var.r med.r
## OA.OA03.      0.63      0.43    0.19      0.43 0.76       NA     0  0.43
## OA.OA06.      0.30      0.43    0.19      0.43 0.76       NA     0  0.43
## 
##  Item statistics 
##            n raw.r std.r r.cor r.drop mean   sd
## OA.OA03. 238  0.90  0.85  0.55   0.43  4.2 1.10
## OA.OA06. 238  0.78  0.85  0.55   0.43  4.6 0.76
## 
## Non missing response frequency for each item
##             1    2    3    4    5 miss
## OA.OA03. 0.02 0.10 0.08 0.21 0.58    0
## OA.OA06. 0.00 0.03 0.09 0.16 0.73    0
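
The step-by-step drops above can be automated. Below is a minimal sketch of the procedure described at the start of this subsection (drop the item with the lowest r.drop while it is below 0.25 and more than 3 items remain); the object names items and worst are just illustrative:

# Automate the item-drop procedure from the list above
items = temp
repeat {
  a = psych::alpha(items)
  worst = which.min(a$item.stats[["r.drop"]])
  if (a$item.stats[["r.drop"]][worst] >= 0.25 | ncol(items) <= 3) break
  items = items[, -worst, drop = FALSE]
}
psych::alpha(items)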

12.2.2 Omega

  • Load libraries
# if (!require('pacman')) install.packages('pacman'); library('pacman')
# # p_load(p_depends(MBESS)$Depends, character.only = TRUE)
# install.packages("MBESS", dependencies = TRUE)

if (!require('psych')) install.packages('psych'); library('psych')
if (!require('readr')) install.packages('readr'); library('readr')
if (!require('dplyr')) install.packages('dplyr'); library('dplyr')

if (!require('MBESS')) install.packages('MBESS', dependencies = TRUE); library('MBESS')
if (!require('GPArotation')) install.packages('GPArotation'); library('GPArotation')

We use the following dataset.


temp = read_csv(here::here("Data/10-Pre_processing/Reliability.csv"))
## Warning: Missing column names filled in: 'X1' [1]
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   X1 = col_double(),
##   OA.OA01. = col_double(),
##   OA.OA02. = col_double(),
##   OA.OA03. = col_double(),
##   OA.OA04. = col_double(),
##   OA.OA05. = col_double(),
##   OA.OA06. = col_double()
## )
temp = temp[-1] 

# Convert everything to integer
# temp = temp %>% dplyr::mutate_each(funs(as.integer))
temp = as.data.frame(sapply(temp, as.integer))

head(temp)
##   OA.OA01. OA.OA02. OA.OA03. OA.OA04. OA.OA05. OA.OA06.
## 1        5        3        5        2        1        5
## 2        3        1        5        1        1        3
## 3        5        1        5        4        1        5
## 4        2        2        1        3        3        5
## 5        4        3        4        4        2        4
## 6        5        5        5        1        1        5

12.2.2.1 Method 1 - with Bootstrap 1

# Use at least (?) B = 1000

# Execution time is non-trivial
set.seed(1)
# ci.reliability(data = temp, type = "omega", conf.level = 0.95, interval.type = "bca", B = 1000) 

12.2.2.2 Method 2 2

# See options
omega(temp)

## Omega 
## Call: omegah(m = m, nfactors = nfactors, fm = fm, key = key, flip = flip, 
##     digits = digits, title = title, sl = sl, labels = labels, 
##     plot = plot, n.obs = n.obs, rotate = rotate, Phi = Phi, option = option, 
##     covar = covar)
## Alpha:                 0.57 
## G.6:                   0.56 
## Omega Hierarchical:    0.17 
## Omega H asymptotic:    0.24 
## Omega Total            0.7 
## 
## Schmid Leiman Factor loadings greater than  0.2 
##               g   F1*   F2*   F3*   h2   u2   p2
## OA.OA01.-       -0.48             0.26 0.74 0.04
## OA.OA02.   0.23              0.35 0.19 0.81 0.29
## OA.OA03.-       -0.63             0.44 0.56 0.08
## OA.OA04.   0.34        0.94       1.00 0.00 0.12
## OA.OA05.   0.23 -0.43             0.26 0.74 0.21
## OA.OA06.-  0.28 -0.62             0.47 0.53 0.16
## 
## With eigenvalues of:
##    g  F1*  F2*  F3* 
## 0.35 1.20 0.89 0.18 
## 
## general/max  0.29   max/min =   6.81
## mean percent general =  0.15    with sd =  0.09 and cv of  0.61 
## Explained Common Variance of the general factor =  0.13 
## 
## The degrees of freedom are 0  and the fit is  0 
## The number of observations was  238  with Chi Square =  0.01  with prob <  NA
## The root mean square of the residuals is  0 
## The df corrected root mean square of the residuals is  NA
## 
## Compare this with the adequacy of just a general factor and no group factors
## The degrees of freedom for just the general factor are 9  and the fit is  0.42 
## The number of observations was  238  with Chi Square =  97.83  with prob <  4.3e-17
## The root mean square of the residuals is  0.19 
## The df corrected root mean square of the residuals is  0.24 
## 
## RMSEA index =  0.204  and the 10 % confidence intervals are  0.169 0.242
## BIC =  48.58 
## 
## Measures of factor score adequacy             
##                                                   g  F1*  F2*   F3*
## Correlation of scores with factors             0.47 0.79 0.94  0.42
## Multiple R square of scores with factors       0.22 0.62 0.89  0.18
## Minimum correlation of factor score estimates -0.56 0.24 0.78 -0.64
## 
##  Total, General and Subset omega for each subset
##                                                  g  F1*  F2*  F3*
## Omega total for total scores and subscales    0.70 0.67 1.00 0.18
## Omega general for total scores and subscales  0.17 0.08 0.12 0.05
## Omega group for total scores and subscales    0.50 0.59 0.88 0.12
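
If only the numeric estimates are needed, they can be extracted from the object that omega() returns; a minimal sketch (assuming the component names omega_h, omega.tot and alpha described in the psych documentation):

# Store the omega results and pull out the main reliability estimates
om = omega(temp, plot = FALSE)
om$alpha      # Cronbach's alpha
om$omega_h    # omega hierarchical
om$omega.tot  # omega total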

12.3 Inter rater reliability

if (!require('readr')) install.packages('readr'); library('readr')
if (!require('irr')) install.packages('irr'); library('irr')
## Loading required package: irr
## Loading required package: lpSolve
if (!require('dplyr')) install.packages('dplyr'); library('dplyr')

DF_IRR = temp %>% select(OA.OA01., OA.OA02.)
kappa2(DF_IRR, "unweighted")
##  Cohen's Kappa for 2 Raters (Weights: unweighted)
## 
##  Subjects = 238 
##    Raters = 2 
##     Kappa = 0.0599 
## 
##         z = 1.96 
##   p-value = 0.0504
# Interpretation
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3900052/

# Cohen suggested the Kappa result be interpreted as follows: 
  # - values ≤ 0 as indicating no agreement 
  # - 0.01–0.20 as none to slight
  # - 0.21–0.40 as fair
  # - 0.41–0.60 as moderate
  # - 0.61–0.80 as substantial
  # - 0.81–1.00 as almost perfect agreement
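
Since the responses here are ordinal (1-5), a weighted kappa may be more informative than the unweighted version; a minimal sketch with the same DF_IRR ("equal" and "squared" are the weighting schemes accepted by kappa2()):

# Weighted kappa with squared weights (penalizes large disagreements more)
kappa2(DF_IRR, "squared")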

12.3.1 References

https://www.ncbi.nlm.nih.gov/pubmed/28557467?dopt=Abstract

1 http://onlinelibrary.wiley.com/doi/10.1111/bjop.12046/pdf

2

12.4 Impute missing data

12.4.1 Impute with mice

TODO: see https://www.r-bloggers.com/imputing-missing-data-with-r-mice-package/

FROM: https://www.r-bloggers.com/graphical-presentation-of-missing-data-vim-package/

if (!require('VIM')) install.packages('VIM'); library('VIM')
## Loading required package: VIM
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
if (!require('mice')) install.packages('mice'); library('mice')
## Loading required package: mice
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
if (!require('dplyr')) install.packages('dplyr'); library('dplyr')
if (!require('tibble')) install.packages('tibble'); library('tibble')
## Loading required package: tibble
# Read data
dat <- read.csv(url("https://goo.gl/4DYzru"), header = TRUE, sep = ",")
head(dat)
##    Age Gender Cholesterol SystolicBP  BMI Smoking Education
## 1 67.9 Female       236.4      129.8 26.4     Yes      High
## 2 54.8 Female       256.3      133.4 28.4      No    Medium
## 3 68.4   Male       198.7      158.5 24.1     Yes      High
## 4 67.9   Male       205.0      136.0 19.9      No       Low
## 5 60.9   Male       207.7      145.4 26.7      No    Medium
## 6 44.9 Female       222.5      130.6 30.6      No       Low
# Create some missings
set.seed(10)
missing = rbinom(250, 1, 0.3)
dat$Cholesterol = with(dat, ifelse(BMI >= 30 & missing == 1, NA, Cholesterol))
sum(is.na(dat$Cholesterol))
## [1] 16
# Impute
init = mice(dat, maxit = 0) 
## Warning: Number of logged events: 3
meth = init$method
predM = init$predictorMatrix
meth[c("Cholesterol")] = "pmm" 
set.seed(101)
imputed = mice(dat, method = meth, predictorMatrix = predM, m = 1)
## 
##  iter imp variable
##   1   1  Cholesterol
##   2   1  Cholesterol
##   3   1  Cholesterol
##   4   1  Cholesterol
##   5   1  Cholesterol
imp = complete(imputed)

# Prepare data for visualization
dt1 = dat %>% 
  select(Cholesterol, BMI) %>% 
  dplyr::rename(Cholesterol_imp = Cholesterol) %>% 
  mutate(
    # TRUE where Cholesterol was missing before imputation
    Cholesterol_imp = is.na(Cholesterol_imp)
  ) %>% 
  rownames_to_column()

dt2 = imp %>% 
  select(Cholesterol, BMI) %>% 
  rownames_to_column()

dt = left_join(dt1, dt2)
## Joining, by = c("rowname", "BMI")
head(dt)
##   rowname Cholesterol_imp  BMI Cholesterol
## 1       1           FALSE 26.4       236.4
## 2       2           FALSE 28.4       256.3
## 3       3           FALSE 24.1       198.7
## 4       4           FALSE 19.9       205.0
## 5       5           FALSE 26.7       207.7
## 6       6           FALSE 30.6       222.5
# Visualize how imputation went
vars <- c("BMI","Cholesterol","Cholesterol_imp")
marginplot(dt[,vars], delimiter = "_imp", alpha = 0.6, pch = c(19))
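
It can also help to inspect the overall pattern of missing values (for example before choosing an imputation method); a minimal sketch with functions from the packages already loaded (mice::md.pattern() and VIM::aggr()):

# Tabulate which combinations of variables are missing together
md.pattern(dat)
# Plot the proportion of missings per variable and per combination
aggr(dat, numbers = TRUE, sortVars = TRUE)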

12.4.2 Impute with recipes

if (!require('recipes')) install.packages('recipes'); library('recipes')
## Loading required package: recipes
## 
## Attaching package: 'recipes'
## The following object is masked from 'package:VIM':
## 
##     prepare
## The following object is masked from 'package:gtsummary':
## 
##     all_numeric
## The following object is masked from 'package:Matrix':
## 
##     update
## The following object is masked from 'package:stringr':
## 
##     fixed
## The following object is masked from 'package:stats':
## 
##     step
# Read data
dat <- read.csv(url("https://goo.gl/4DYzru"), header = TRUE, sep = ",")
head(dat)
##    Age Gender Cholesterol SystolicBP  BMI Smoking Education
## 1 67.9 Female       236.4      129.8 26.4     Yes      High
## 2 54.8 Female       256.3      133.4 28.4      No    Medium
## 3 68.4   Male       198.7      158.5 24.1     Yes      High
## 4 67.9   Male       205.0      136.0 19.9      No       Low
## 5 60.9   Male       207.7      145.4 26.7      No    Medium
## 6 44.9 Female       222.5      130.6 30.6      No       Low
# Create some missings
set.seed(10)
missing = rbinom(250, 1, 0.3)
dat$Cholesterol = with(dat, ifelse(BMI >= 30 & missing == 1, NA, Cholesterol))
sum(is.na(dat$Cholesterol))
## [1] 16
# Impute
impute_recipe = recipe(Cholesterol ~ ., dat) %>% 
  # step_meanimpute(Cholesterol) %>% 
  step_knnimpute(Cholesterol)
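  # NOTE: in newer versions of recipes, step_knnimpute() has been renamed step_impute_knn()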
  # step_knnimpute(all_predictors())

imputed = prep(impute_recipe) %>% juice()


# Prepare data for visualization
dt1 = dat %>% 
  select(Cholesterol, BMI) %>% 
  dplyr::rename(Cholesterol_imp = Cholesterol) %>% 
  mutate(
    # TRUE where Cholesterol was missing before imputation
    Cholesterol_imp = is.na(Cholesterol_imp)
  ) %>% 
  rownames_to_column()

dt2 = imputed %>% 
  select(Cholesterol, BMI) %>% 
  rownames_to_column()

dt = left_join(dt1, dt2)
## Joining, by = c("rowname", "BMI")
head(dt)
##   rowname Cholesterol_imp  BMI Cholesterol
## 1       1           FALSE 26.4       236.4
## 2       2           FALSE 28.4       256.3
## 3       3           FALSE 24.1       198.7
## 4       4           FALSE 19.9       205.0
## 5       5           FALSE 26.7       207.7
## 6       6           FALSE 30.6       222.5
# Visualize how imputation went

vars <- c("BMI","Cholesterol","Cholesterol_imp")
marginplot(dt[,vars], delimiter = "_imp", alpha = 0.6, pch = c(19))


  1. Leys, C., Ley, C., Klein, O., Bernard, P. & Licata, L. Detecting outliers: Do not use standard deviation around the mean, use absolute deviation around the median. J. Exp. Soc. Psychol. 49, 764–766 (2013).↩︎

  2. See Serban C. Musca post↩︎

  3. See Serban C. Musca post↩︎