impute median plus jitter - r

I would like to efficiently impute missing values with a slightly different value in each cell.
for example:
df <- data_frame(x = rnorm(100), y = rnorm(100))
df[1:5,1] <- NA
df[1:5, 2] <- NA
df %<>% mutate_all(funs(ifelse(is.na(.), jitter(median(., na.rm = TRUE)), .)))
However, this imputes with the same number in all cells.
How can I add a different noise to each cell?
Of course, I could do this with a loop, but my data frame is huge and I would like to do this efficiently

We can use rep with n()
library(dplyr)
library(magrittr)
df %<>%
mutate_all(list(~ case_when(is.na(.) ~ jitter(rep(median(., na.rm = TRUE), n())),
TRUE ~ .)))

Related

Better & faster way to sum & ifelse for a large set of columns in a big data frame using ddply R

Question
I am trying to sum each column in a data frame by group and set the value as 1 if the sum is not 0. I tried to use max function instead of the combo (sum & ifelse), but I kept getting Inf values. However, the combo takes too much time to compute, where I have 1.5m rows and 500 dummy variables to summarize.
Is there a better way to achieve this?
Example dataset
library(tidyverse)
library(tibble)
library(data.table)
rename <- dplyr::rename
select <- dplyr::select
set.seed(10002)
id <- sample(1:20, 1000, replace=T)
set.seed(10003)
group1 <- sample(0:1, 1000, replace=T)
set.seed(10004)
group2 <- sample(0:1, 1000, replace=T)
dummies <-
data.frame(id, group1, group2)
Current Approach
# I am trying to sum each column in a data frame by group and
# set the value as 1 if the sum is not 0.
dummies %>%
ddply('id', function(x){
x %>%
select_if(is.numeric) %>%
summarise_each(list(sum)) %>%
mutate_if(is.numeric, ~ifelse(.x > 0,1,.x))
}, .progress = 'text') # It takes too much time
We could possibly reduce the time by switching to dplyr. Also, instead of doing the sum and then using ifelse to check and reconvert, this can be directly done by checking any value greater than 0
library(dplyr)
dummies %>%
dplyr::select(id, where(is.numeric)) %>%
dplyr::group_by(id) %>%
dplyr::summarise(across(everything(), ~ +(any(. > 0, na.rm = TRUE))))
or using data.table
library(data.table)
setDT(dummies)[, lapply(.SD, function(x)
+(any(x > 0, na.rm = TRUE))), id, .SDcols = patterns('group')]

Reduce a data frame by combining like rows according to two qualitative factors

I have a dataframe like the following:
observations<- data.frame(X=c("00KS089001","00KS089001","00KS089002","00KS089002","00KS089003","00KS089003","00KS105001","00KS105001", "00KS177011","00KS177011","00P0006","00P006","00P006","00P006"), hzdept = c(0,20,0,15,0,13,0,20,0,16,0,6,13,29), hzdepb = c(20,30,15,30,13,30,20,30,16,30,6,13,29,30),Y=c("Red","White","Red","White","Green","Red","Red","Blue", "Black","Black","Red","White","White","White"), Z = c(0.67,0.33,0.5,0.5,0.43,0.57,0.67,0.33,0.53,0.47,0.2,0.23,0.53,0.04))
I want to be able to reduce this so that anytime X and Y are the same for two rows, the observations are combined i.e.
data.frame(X=c("00KS089001","00KS089001","00KS089002","00KS089002","00KS089003","00KS089003","00KS105001","00KS105001", "00KS177011","00P0006","00P006"), hzdept = c(0,20,0,15,0,13,0,20,0,0,6), hzdepb = c(20,30,15,30,13,30,20,30,30,6,30),Y=c("Red","White","Red","White","Green","Red","Red","Blue", "Black","Red","White"), Z = c(0.67,0.33,0.5,0.5,0.43,0.57,0.67,0.33,1.00,0.20,0.80))
Any suggestions on how to best go about this?
Edit: ok, now that I see how hzdept and hzdepb are supposed to be combined from your commment above:
library(tidyverse)
df <- observations %>% count(X,Y,wt = Z,name = "Z")
df_hzdept <- observations %>%
arrange(hzdept) %>%
distinct(X,Y,.keep_all = T) %>%
select(X,Y,hzdept)
df_hzdepb <- observations %>%
arrange(desc(hzdepb)) %>%
distinct(X,Y,.keep_all = T) %>%
select(X,Y,hzdepb)
df <- df %>% left_join(df_hzdept) %>% left_join(df_hzdepb)
Using dplyr
Here is how you would group by two columns and summarize using the minimum, max, and sum other columns in a dataframe:
library(magrittr) # For the pipe: %>%
observations %>%
dplyr::group_by(X, Y) %>%
dplyr::summarise(hzdept = min(hzdept),
hzdepb = max(hzdepb),
Z = sum(Z), .groups = 'drop')

Replace numeric columns of dataset from other dataset

I want to replace specifically the numerical columns of one dataset, with the numerical columns of a corresponding transformed dataset. How can I do it (without using code specific to the particular dataset)?
e.g. toy example from mpg in library ggplot2:
mpg0 <- mpg
set.seed(123)
mpg0[sample(nrow(mpg),70,replace=FALSE),3] <- NA
mpg0[sample(nrow(mpg),70,replace=FALSE),8] <- NA
mpg0[sample(nrow(mpg),70,replace=FALSE),9] <- NA
sampled <- sample(nrow(mpg),50,replace=FALSE)
mpg_test <- mpg0[sampled,]
mpg_train <- mpg0[-sampled,]
mpg_mean <- mpg_train %>% group_by(cyl) %>% summarise_if(is.numeric,mean,na.rm=TRUE)
temp1 <- mpg_test %>% left_join(mpg_mean, by = 'cyl')
Now I would like to replace the missing values in the numeric columns of mpg_test (columns displ, cty, hwy--there are no NAs in the other numeric columns) with the values in the corresponding column from the left-join. I can do it with
temp1 <- as.data.frame(temp1)
temp1[c(3,8,9)][is.na(temp1[c(3,8,9)])] <- temp1[c(12,14,15)][is.na(temp[c(3,8,9)])]
But that is specific to this dataset. Problem with mutate_if is that I don't know what function to put in. Is there a good general way of doing this, i.e. mutating the numeric columns to get the means, replacing NA with values in the same row from the corresponding left-joined columns?
(dplyr only please)
You can do this by altering your left join and using case_when:
library(dplyr)
temp1 <- left_join(mpg_test, mpg_mean, by = "cyl")
temp1 %>%
mutate_if(is.integer, as.numeric) %>%
mutate(displ.x =
case_when(
is.na(displ.x) ~ displ.y,
TRUE ~ displ.x
),
cty.x =
case_when(
is.na(cty.x) ~ cty.y,
TRUE ~ cty.x
),
hwy.x =
case_when(
is.na(hwy.x) ~ hwy.y,
TRUE ~ hwy.x
)) %>%
select(-c(displ.y, year.y, cty.y, hwy.y)) %>%
rename(displ = displ.x,
year = year.x,
cty = cty.x,
hwy = hwy.x)
You can use coalesce :
library(dplyr)
mpg_test %>%
left_join(mpg_mean, by = 'cyl') %>%
mutate(displ = coalesce(displ.x, displ.y),
cty = coalesce(displ.x, displ.y),
hwy = coalesce(hwy.x, hwy.y)) %>%
select(-matches('\\.x|\\.y'))

How to conditionally replace values with NA across multiple columns

I would like to replace outliers in each column of a dataframe with NA.
If for example we define outliers as being any value greater than 3 standard deviations from the mean I can achieve this per variable with the code below.
Rather than specify each column individually I'd like to perform the same operation on all columns of df in one call. Any pointers on how to do this?!
Thanks!
library(dplyr)
data("iris")
df <- iris %>%
select(Sepal.Length, Sepal.Width, Petal.Length)%>%
head(10)
# add a clear outlier to each variable
df[1, 1:3] = 99
# replace values above 3 SD's with NA
df_cleaned <- df %>%
mutate(Sepal.Length = replace(Sepal.Length, Sepal.Length > (abs(3 * sd(df$Sepal.Length, na.rm = TRUE))), NA))
You need to use mutate_all(), i.e.
library(dplyr)
df %>%
mutate_all(funs(replace(., . > (abs(3 * sd(., na.rm = TRUE))), NA)))
Another option is base R
df[] <- lapply(df, function(x) replace(x, . > (abs(3 * sd(x, na.rm = TRUE))), NA))
or with colSds from matrixStats
library(matrixStats)
df[df > abs(3 * colSds(as.matrix(df), na.rm = TRUE))] <- NA

Using replace_na for multiple data subsets

I'm trying to replace the NAs in multiple column variables with randomly generated values from each student_id's subset row data:
data snapshot
so for student 3, systolic needs two NAs replaced. I used the min and max values for each variable within the student 3 subset to generate random values.
library(dplyr)
library(tidyr)
library(tibble)
library(tidyverse)
dplyr::filter(exercise, student_id == "3") %>% replace_na(list(systolic= round(sample(runif(1000, 125,130),2),0),
diastolic =round(sample(runif(1000, 85,85),3),0), heart_rate= round(sample(runif(1000, 79,86),2),0),
phys_score = round(sample(runif(1000, 8,9),2),0)
However it works only when one NA needs replacing: successfully replaced systolic NA values. When I try to replace more than one NAs, this error comes up.
Error: Replacement for `systolic` is length 2, not length 1
Is there a way to fix this? I tried converting the column variables to data frames instead of the vectors they are now, but it only returned the original data without any replacement changes.
Are there any simpler ways to this? Any suggestions/comments would be appreciated. Thanks.
A solution that makes things a little more automated but may be unnecessarily complex.
Generated some grouped missing data from the mtcars dataset
library(magrittr)
library(purrr)
library(dplyr)
library(stringr)
library(tidyr)
## Generate some missing data with a subset of car make
mtcars_miss <- mtcars %>%
as_tibble(rownames = "car") %>%
select(car) %>%
separate(car, c("make", "name"), " ") %>%
bind_cols(mtcars[, -1] %>%
map_df(~.[sample(c(TRUE, NA), prob = c(0.8, 0.2),
size = length(.), replace = TRUE)])) %>%
filter(make %in% c("Mazda", "Hornet", "Merc"))
Function to replace na values from a given variable by sampling within the min and max and depending on some group (here make).
replace_na_sample <- function(df_miss, var, group = "make") {
var <- enquo(var)
df_miss %>%
group_by(.dots = group) %>%
mutate(replace_var := round(runif(n(), min(!!var, na.rm = T),
max(!!var, na.rm = T)), 0)) %>%
rowwise %>%
mutate_at(.vars = vars(!!var),
.funs = funs(replace_na(., replace_var))) %>%
select(-replace_var) %>%
ungroup
}
Example replacing several missing values in multiple columns.
mtcars_replaced <- mtcars_miss %>%
replace_na_sample(cyl, group = "make") %>%
replace_na_sample(disp, group = "make") %>%
replace_na_sample(hp, group = "make")

Resources