Related
I'm trying to recode several dummy variables at once but am struggling to come up with a functioning vectorized solution (alternatively a for loop).
reprex:
library(tidyverse)
library(magrittr)
library(dummies)
library(janitor)
df_raw <- data.frame(
species = as.factor(c("cat", "dog", NA, "dog", "dog")),
weight = rnorm(5, mean = 5, sd = 1),
sex = as.factor(c("m", NA, "f", "f", "m"))
)
df_raw
species weight sex
1 cat 3.025896 m
2 dog 3.223064 <NA>
3 <NA> 5.230367 f
4 dog 4.231511 f
5 dog 5.819032 m
I split the factor variables (species and sex) into dummies but the NA get their own indicators (species_na and sex_na)
df_dummy <- dummies::dummy.data.frame(df_raw,
dummy.classes = "factor",
sep = "_",
omit.constants = TRUE,
all = TRUE) %>%
janitor::clean_names()
species_cat species_dog species_na weight sex_f sex_m sex_na
1 1 0 0 3.025896 0 1 0
2 0 1 0 3.223064 0 0 1
3 0 0 1 5.230367 1 0 0
4 0 1 0 4.231511 1 0 0
5 0 1 0 5.819032 0 1 0
My problem: how do I efficiently recode all of the factor dummies ("indexed" by the prefix, e.g. species_) to NA conditional on value of the _na dummy in the respective group of dummies? In other words, I need to mutate all dummies with the prefix species_ as NA whenever the species_na == 1 etc.
I have come up with the solution below, but I haven't been able to generalize the last step to the entire dataset
factor_vars <- dplyr::select_if(df_raw, is.factor) %>% colnames()
na_labs <- paste(factor_vars,
"na",
sep = "_")
df_dummy <- df_dummy %>%
dplyr::mutate(across(all_of(na_labs),
.fns = list(var = ~ . == 1),
.names = "{fn}_{col}" ))
# --- trial run for one variable only
test <- df_dummy %>%
mutate(species_cat = ifelse(var_species_na == TRUE,
NA,
species_cat))
Any help is appreciated!
How about this?
df_dummy <- df_dummy %>%
mutate(across(c(starts_with("species")), ~ factor(ifelse(species_na == 1, NA, .)))) %>%
mutate(across(c(starts_with("sex")), ~ factor(ifelse(sex_na == 1, NA, .))))
df_dummy
species_cat species_dog species_na weight sex_f sex_m sex_na
1 1 0 0 4.879161 0 1 0
2 0 1 0 5.960176 <NA> <NA> <NA>
3 <NA> <NA> <NA> 5.189566 1 0 0
4 0 1 0 5.165760 1 0 0
5 0 1 0 5.952365 0 1 0
You can try -
library(dplyr)
library(purrr)
df_dummy <- dummies::dummy.data.frame(df_raw,
dummy.classes = "factor",
sep = "_",
omit.constants = TRUE,
all = TRUE) %>%
janitor::clean_names()
factor_vars <- dplyr::select_if(df_raw, is.factor) %>% colnames()
na_labs <- paste(factor_vars,
"na",
sep = "_")
# For each original factor, take its group of dummy columns and set every
# dummy in that group to NA in rows where the factor's `<name>_na` indicator
# is 1. Explicit `function(...)` arguments are used instead of nested `~`
# lambdas: inside a nested formula, `.x` refers to the innermost lambda's
# argument (the column handed over by across()), not to the factor name
# supplied by map_dfc().
map_dfc(factor_vars, function(factor_name) {
  # Row mask computed once, before any column of the group is modified.
  is_missing <- df_dummy[[paste0(factor_name, "_na")]] == 1
  df_dummy %>%
    # Prefix match, so a factor name that happens to be a substring of an
    # unrelated column name does not pull that column in.
    select(starts_with(paste0(factor_name, "_"))) %>%
    mutate(across(everything(), function(dummy) ifelse(is_missing, NA, dummy)))
})
# species_cat species_dog species_na sex_f sex_m sex_na
#1 1 0 0 0 1 0
#2 0 1 0 NA NA NA
#3 NA NA NA 1 0 0
#4 0 1 0 1 0 0
#5 0 1 0 0 1 0
I have a package on GitHub, {dplyover}, which can create dummy variables in an across-like manner. Below we select all factor variables with where(is.factor) and apply dist_values to each column; dist_values is a wrapper around unique that returns all non-NA values. The function in .fns takes each selected column as .x and each of the unique values returned by dist_values as .y.
library(dplyr)
library(dplyover) # https://github.com/TimTeaFan/dplyover
df_raw %>%
mutate(crossover(where(is.factor),
dist_values,
.fns = ~ if_else(.y == .x, 1, 0)))
#> species weight sex species_cat species_dog sex_f sex_m
#> 1 cat 5.281178 m 1 0 0 1
#> 2 dog 4.343656 <NA> 0 1 NA NA
#> 3 <NA> 4.555380 f NA NA 1 0
#> 4 dog 4.990039 f 0 1 1 0
#> 5 dog 4.988497 m 0 1 0 1
Created on 2021-09-13 by the reprex package (v2.0.1)
I have a quite big dataframe and I'm trying to add a new variable which is the sum of the three previous rows on a running basis, also it should be grouped by ID. The first three rows per ID should be 0. Here's what it should look like.
ID Var1 VarNew
1 2 0
1 2 0
1 3 0
1 0 7
1 4 5
1 1 7
Here's an example dataframe
# Example data: a single ID with six observations of Var1.
Var1 <- c(2, 2, 3, 0, 4, 1)
ID <- rep(1, times = 6)
df <- data.frame(ID = ID, Var1 = Var1)
You can use any package that provides a rolling-calculation function: compute the rolling sum with a window size of 3 and lag the result. For example, with zoo::rollsumr:
library(dplyr)
# Per ID: rolling sum of Var1 over a window of 3 (positions without a full
# window are filled with 0), then shifted down one row with lag() so that each
# row holds the sum of the three rows before it; the first row defaults to 0.
df %>%
group_by(ID) %>%
mutate(VarNew = lag(zoo::rollsumr(Var1, 3, fill = 0), default = 0)) %>%
ungroup
# ID Var1 VarNew
# <dbl> <dbl> <dbl>
#1 1 2 0
#2 1 2 0
#3 1 3 0
#4 1 0 7
#5 1 4 5
#6 1 1 7
You can use filter in ave.
# Per ID: sum of the three preceding rows, with 0 for the first three rows.
# - Works on the group vector `x` supplied by ave() rather than on the whole
#   `df$Var1` column, so every ID is processed on its own values.
# - Calls stats::filter() explicitly because dplyr::filter() masks it once
#   dplyr is attached, and spells the `sides` argument out in full.
df$VarNew <- ave(df$Var1, df$ID, FUN = function(x) {
  n <- length(x)
  # Groups with at most three rows have no row with three predecessors.
  if (n <= 3) {
    return(rep(0, n))
  }
  # One-sided moving sum of width 3 over x[1:(n - 1)]; its first two entries
  # are NA (incomplete window) and are dropped, leaving values for rows 4..n.
  rolled <- stats::filter(head(x, -1), c(1, 1, 1), sides = 1)
  c(0, 0, 0, as.numeric(rolled)[-(1:2)])
})
df
# ID Var1 VarNew
#1 1 2 0
#2 1 2 0
#3 1 3 0
#4 1 0 7
#5 1 4 5
#6 1 1 7
or using cumsum in combination with head and tail.
# Per ID: sum of the three preceding rows via differences of a cumulative sum.
# `prior[i]` is the total of all rows before row i, so prior[i] - prior[i - 3]
# equals x[i - 3] + x[i - 2] + x[i - 1]. Differencing cumsum(x) itself would
# instead give x[i - 2] + x[i - 1] + x[i], a window that includes the current
# row (5, 7, 5 rather than the expected 7, 5, 7 for the example data).
df$VarNew <- ave(df$Var1, df$ID, FUN = function(x) {
  n <- length(x)
  # Groups with at most three rows have no row with three predecessors.
  if (n <= 3) {
    return(rep(0, n))
  }
  prior <- cumsum(c(0, head(x, -1)))
  c(0, 0, 0, tail(prior, -3) - head(prior, -3))
})
Library runner also helps
library(runner)
# Rolling sum of the three preceding rows (window k = 3, shifted by lag = 1),
# computed within each ID so that a window never spans two IDs. Rows without a
# full window are NA because na_pad = TRUE.
df %>%
  group_by(ID) %>%
  mutate(var_new = sum_run(Var1, k = 3, na_pad = TRUE, lag = 1)) %>%
  ungroup() %>%
  # Back to a plain data.frame so it prints like the output shown below.
  as.data.frame()
ID Var1 var_new
1 1 2 NA
2 1 2 NA
3 1 3 NA
4 1 0 7
5 1 4 5
6 1 1 7
The NAs can easily be replaced with 0 if desired.
I have a column named group, with the values group1 and group2, in a data frame.
group <- c( "group1", "group1", "group2", "group1", "group2" )
value<- c(1:5)
dat <- data.frame(value, group)
I want to make it like this-
group1 <- c(1, 1, 0, 1, 0)
group2 <- c(0, 0, 1, 0, 1)
dat<- data.frame(value, group1, group2)
I tried this but have to remove the group column later
dat<- dat %>%
mutate( group1 = ifelse(data1$group =="group1", 1, 0 ),
group2 = ifelse(data1$group =="group2", 1, 0 ) )
Is there any other nice way to do this job.
Thanks in advance for your help.
You could create a dummy column and get data in wide format.
library(dplyr)
library(tidyr)
dat %>%
mutate(n = 1) %>%
pivot_wider(names_from = group, values_from = n, values_fill = 0) -> result
# value group1 group2
# <int> <dbl> <dbl>
#1 1 1 0
#2 2 1 0
#3 3 0 1
#4 4 1 0
#5 5 0 1
Or in base R use table :
table(dat)
# group
#value group1 group2
# 1 1 0
# 2 1 0
# 3 0 1
# 4 1 0
# 5 0 1
A base R option using reshape
# Add a constant indicator column `q`, spread it into one column per group
# (kept in `out`), then display the wide table with the missing group/value
# combinations shown as 0.
out <- reshape(
  cbind(dat, q = 1),
  idvar = "value",
  timevar = "group",
  direction = "wide"
)
replace(out, is.na(out), 0)
giving
value q.group1 q.group2
1 1 1 0
2 2 1 0
3 3 0 1
4 4 1 0
5 5 0 1
We can use data.table
library(data.table)
dcast(setDT(dat), value ~ group, length)
# value group1 group2
#1: 1 1 0
#2: 2 1 0
#3: 3 0 1
#4: 4 1 0
#5: 5 0 1
Or this can be done with pivot_wider in a single step by specifying values_fn
library(dplyr)
library(tidyr)
dat %>%
pivot_wider(names_from = group, values_from = group,
values_fn = length, values_fill = 0)
# A tibble: 5 x 3
# value group1 group2
# <int> <int> <int>
#1 1 1 0
#2 2 1 0
#3 3 0 1
#4 4 1 0
#5 5 0 1
Insert %>% select(!"group") at the end of the dplyr pipe. Also remove data1$ from it - you probably meant dat, but even that is not needed inside mutate().
dat %>%
mutate(group1 = ifelse(group =="group1", 1, 0 ),
group2 = ifelse(group =="group2", 1, 0 )) %>%
select(!"group")
value group1 group2
1 1 1 0
2 2 1 0
3 3 0 1
4 4 1 0
5 5 0 1
My current data is like below,
df<-data.frame(id=c(1:5),t1=c(NA,1,0,0,0),t2=c(0,1,0,1,0),
t3=c(NA,0,0,0,1),t4=c(NA,NA,NA,0,0))
And the way I'm trying to restructure this is,
for each id, if there's a "1" in that row, all the 0s in the subsequent columns would change to 1. (but leaving the NA as an NA).
So for id#1, nothing would change since there's no 1 in that row, but for id#2, after 1 in the column t2, any 0s afterwards would be replaced by 1.
i.e., this is what I'm trying to get at the end:
final<-data.frame(id=c(1:5),t1=c(0,1,0,0,0),t2=c(0,1,0,1,0),
t3=c(NA,1,0,1,1),t4=c(NA,NA,NA,1,1))
I've been trying different ways but nothing seems to work... I'd really appreciate any help!!!
In base R we can apply the cummax by row after changing the NA to a lower value and then replace the value back to NA
# Row-wise running maximum over every column except the first (`id`).
# NAs are temporarily replaced by -999 (assumed to be lower than any real
# value in the data) before cummax() is applied to each row; apply() returns
# its per-row results as columns, hence the t().
# Multiplying by NA^(is.na(...)) puts the NAs back: NA^0 is 1 (value kept),
# NA^1 is NA (an originally missing cell becomes NA again).
df[-1] <- t(apply(replace(df[-1], is.na(df[-1]), -999), 1, cummax)) *
NA^(is.na(df[-1]))
df
# id t1 t2 t3 t4
#1 1 NA 0 NA NA
#2 2 1 1 1 NA
#3 3 0 0 0 NA
#4 4 0 1 1 1
#5 5 0 0 1 1
Or use rowCummaxs from matrixStats
library(matrixStats)
df[-1] <- rowCummaxs(as.matrix(replace(df[-1], is.na(df[-1]), -999))) *
NA^(is.na(df[-1]))
With tidyverse you can try:
library(tidyverse)
# Reshape to long form, compute a per-id running maximum, turn every 0 that
# follows a 1 into 1, and reshape back to wide form.
df %>%
  pivot_longer(
    cols = starts_with("t"),
    names_to = "Time",
    values_to = "Value"
  ) %>%
  group_by(id) %>%
  # cummax() returns NA from the first NA onwards, which would hide a 1 that
  # appears after a missing value; treat NA as 0 for the running maximum only.
  mutate(Cummax = cummax(coalesce(Value, 0))) %>%
  # %in% is FALSE (not NA) for missing values, so NA cells are left untouched.
  mutate(Value = replace(Value, Value %in% 0 & Cummax == 1, 1)) %>%
  pivot_wider(id_cols = id, names_from = "Time", values_from = "Value")
Output
# A tibble: 5 x 5
# Groups: id [5]
id t1 t2 t3 t4
<int> <dbl> <dbl> <dbl> <dbl>
1 1 NA 0 NA NA
2 2 1 1 1 NA
3 3 0 0 0 NA
4 4 0 1 1 1
5 5 0 0 1 1
Another approach in base R using apply row-wise could be to find out column number where first 1 occurs and replace all the 0 values after it with 1.
# For each row, find the first column holding a 1 and turn every later 0 into
# 1; NAs, and rows that contain no 1 at all, are returned unchanged.
df[-1] <- t(apply(df[-1], 1, function(x) {
  # which(...)[1] is NA (still length 1) when the row contains no 1, so the
  # "no 1 found" case has to be tested with is.na(), not with length().
  first_one <- which(x == 1)[1]
  if (is.na(first_one)) {
    return(x)
  }
  # %in% yields FALSE for NA cells, keeping the logical index free of NAs.
  replace(x, x %in% 0 & seq_along(x) > first_one, 1)
}))
df
# id t1 t2 t3 t4
#1 1 NA 0 NA NA
#2 2 1 1 1 NA
#3 3 0 0 0 NA
#4 4 0 1 1 1
#5 5 0 0 1 1
I have a longitudinal dataframe with a lot of missing values that looks like this.
ID = c(1,1,1,1,1,2,2,2,2,2,3,3,3,3,3)
date = c(1,2,3,4,5,1,2,3,4,5,1,2,3,4,5)
cond = c(0,0,0,1,0,0,0,0,1,0,0,0,0,0,0)
var = c(1, NA , 2, 0,NA, NA, 3, NA,0, NA, 2, NA, 1,NA,NA)
df = data.frame(ID, date, cond,var)
I would like to carry forward the last observation based on two conditions:
1) when cond=0, it should carry forward the higher value of the variable of interest.
2) when cond=1, it should carry forward the lower value of the variable of interest.
Does anyone have an idea on how I could do this in an elegant way?
The final dataset should look like this
ID = c(1,1,1,1,1,2,2,2,2,2,3,3,3,3,3)
date = c(1,2,3,4,5,1,2,3,4,5,1,2,3,4,5)
cond = c(0,0,0,1,0,0,0,0,1,0,0,0,0,0,0)
var = c(1, 1 , 2, 0, 0, NA, 3, 3, 0, 0,2,2,2,2,2)
final = data.frame(ID, date, cond,var)
So far I was able to carry forward the last observation, but I was unable to impose the conditions
library(zoo)
df <- df %>%
group_by(ID) %>%
mutate(var =
na.locf(var, na.rm = F))
any suggestion is welcomed
This is the use of accumulate2 ie
df%>%
group_by(ID)%>%
mutate(d = unlist(accumulate2(var,cond[-1],function(z,x,y) if(y) min(z,x,na.rm=TRUE) else max(z,x,na.rm=TRUE))))
# A tibble: 15 x 5
# Groups: ID [3]
ID date cond var d
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 0 1 1
2 1 2 0 NA 1
3 1 3 0 2 2
4 1 4 1 0 0
5 1 5 0 NA 0
6 2 1 0 NA NA
7 2 2 0 3 3
8 2 3 0 NA 3
9 2 4 1 0 0
10 2 5 0 NA 0
11 3 1 0 2 2
12 3 2 0 NA 2
13 3 3 0 1 2
14 3 4 0 NA 2
15 3 5 0 NA 2
If I understand correctly, I think this is what you are after:
# Example panel: three IDs, each observed at five dates.
ID <- rep(c(1, 2, 3), each = 5)
date <- rep(c(1, 2, 3, 4, 5), times = 3)
cond <- c(
  0, 0, 0, 1, 0,
  0, 0, 0, 1, 0,
  0, 0, 0, 0, 0
)
var <- c(
  1, NA, 2, 0, NA,
  NA, 3, NA, 0, NA,
  2, NA, 1, NA, NA
)
df <- data.frame(ID, date, cond, var)
Using case_when you can do some conditional checks. I'm unsure if you mean to return the minimum for all of the "ID" field, but this will look at the condition and then lag or lead to find a non missing value
library(dplyr)
df %>%
mutate(var_imput = case_when(
cond == 0 & is.na(var)~lag(x = var, n = 1, default = NA),
cond == 1 & is.na(var)~lead(x = var, n = 1, default = NA),
TRUE~var
))
Which yields:
ID date cond var var_imput
1 1 1 0 1 1
2 1 2 0 NA 1
3 1 3 0 2 2
4 1 4 1 0 0
5 1 5 0 NA 0
6 2 1 0 NA NA
7 2 2 0 3 3
8 2 3 0 NA 3
9 2 4 1 0 0
10 2 5 0 NA 0
11 3 1 0 2 2
12 3 2 0 NA 2
13 3 3 0 1 1
14 3 4 0 NA 1
15 3 5 0 NA NA
If you want to group by ID then you could generate an impute table by ID, then join it with the original table like this:
# Generate the imputation table: one row per ID and cond holding the value to
# fill in. Following the rule stated in the question, cond == 0 takes the
# group's highest observed value and cond == 1 the lowest.
input_table <- df %>%
  group_by(ID) %>%
  summarise(
    min = min(var, na.rm = TRUE),
    max = max(var, na.rm = TRUE)
  ) %>%
  # pivot_longer() replaces the superseded gather().
  pivot_longer(cols = c(min, max), names_to = "cond", values_to = "value") %>%
  # "max" -> cond 0, "min" -> cond 1 (numeric, to match df$cond in the join).
  mutate(cond = ifelse(cond == "max", 0, 1))
# Attach each row's fill-in value (matched on ID and cond) and use it wherever
# `var` is missing; observed values are kept as they are.
df %>%
  left_join(input_table, by = c("ID", "cond")) %>%
  mutate(var_imput = coalesce(var, value))