I'm a beginner with R and I'm trying to do a for-loop to recode many variables: when "test" modality is missing, then have "test.v1" modality. It looked very easy to do, but I can't get it:
VEC_1 <- c("test1","test2","test3","test4","test5","test6","test7","test8","test9")
VEC_2 <- c("test1.v1","test2.v1","test3.v1","test4.v1","test5.v1","test6.v1","test7.v1","test8.v1","test9.v1")
for (i in 1:(min(length(VEC_1), length(VEC_2)))){
df2 <- df1 %>%
mutate(
VEC_1[i] = case_when(
is.na(VEC_1[i]) & !is.na(VEC_2[i]) ~ VEC_2[i],
TRUE ~ VEC_1[i])
)
}
I have this error
Unexpected error : '=' in:
" mutate(
VEC_1[i] ="
Does anyone have an idea ?
EDIT: df1 is like :
test1 <- c("A","B","A","A",NA,"B","A",NA,"A")
test1.v1 <- c("B",NA,"B","B","A","B","B",NA,"A")
test2 <- c("B","B","B","B",NA,"C","C","C","C")
test2.v1 <- c("C",NA,"A","A","B","B","C",NA,"C")
test3 <- c("A","B","B","B",NA,"C","C",NA,"C")
test3.v1 <- c("B","A","B",NA,"A","A","A","A",NA)
test4 <- c(NA,"B","B","A",NA,"B","A",NA,"A")
test4.v1 <- c("B","B","B","A","A","B","B","B","B")
df1 <- data.frame(test1,test1.v1,test2,test2.v1,test3,test3.v1,test4,test4.v1)
Based on the example data.frame df1, I'm wondering if you might try putting your data into long form, then grouping by row number and test, then substituting missing values.
library(tidyverse)
df1 %>%
mutate(rn = row_number()) %>%
pivot_longer(cols = -rn, names_to = c("test", "mode"), names_pattern = "test(\\d+)([.v1]*)") %>%
group_by(rn, test) %>%
mutate(value = ifelse(mode == "" & is.na(value), value[mode == ".v1"], value)) %>%
pivot_wider(id_cols = rn, names_from = c(test, mode), values_from = value, names_prefix = "test", names_sep = "")
Output
rn test1 test1.v1 test2 test2.v1 test3 test3.v1 test4 test4.v1
<int> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 1 A B B C A B B B
2 2 B NA B NA B A B B
3 3 A B B A B B B B
4 4 A B B A B NA A A
5 5 A A B B A A A A
6 6 B B C B C A B B
7 7 A B C C C A A B
8 8 NA NA C NA A A B B
9 9 A A C C C NA A B
I'm assuming VEC_1 and VEC_2 are the same length here. you don't need to use a for-loop, mutate affects the whole column of the dataframe, it kinda behaves like a for-loop. I have rewritten your code like this and changed the data for testing purposes:
library(dplyr)
VEC_1 <- c("test1","test2","test3","test4",NA,"test6","test7",NA,"test9")
VEC_2 <- c("test1.v1",NA,"test3.v1","test4.v1","test5.v1","test6.v1","test7.v1",NA,"test9.v1")
df <- data.frame(VEC_1,VEC_2)
df %>% mutate(VEC_1 = if_else(is.na(VEC_1) & !is.na(VEC_2),VEC_2,VEC_1))
which is equal to
df <- data.frame(VEC_1,VEC_2)
for (i in 1:nrow(df)){
if(is.na(df$VEC_1[i]) & !is.na(df$VEC_2[i])){
df$VEC_1[i] = df$VEC_2[i]
}
}
Output:
> df
VEC_1 VEC_2
1 test1 test1.v1
2 test2 <NA>
3 test3 test3.v1
4 test4 test4.v1
5 test5.v1 test5.v1
6 test6 test6.v1
7 test7 test7.v1
8 <NA> <NA>
9 test9 test9.v1
i also changed from case_when to if_else, because you only check one condition.
?if_else
if_else(condition, true, false, missing = NULL)
ifelse/if_else will be a good solution, unless you have multiple conditions, then case_when is your friend. A loop isn't necessary:
library(dplyr)
VEC_1 <- c("test1","test2",NA,NA)
VEC_2 <- c("test1.v1",NA,NA,"test4.v1")
df <- tibble(VEC_1, VEC_2)
df %>%
mutate(VEC_1 = case_when(
is.na(VEC_2) ~ VEC_1,
is.na(VEC_1) ~ VEC_2,
TRUE ~ VEC_1)
)
# A tibble: 4 × 2
VEC_1 VEC_2
<chr> <chr>
1 test1 test1.v1
2 test2 NA
3 NA NA
4 NA test4.v1
# A tibble: 4 × 2
VEC_1 VEC_2
<chr> <chr>
1 test1 test1.v1
2 test2 NA
3 NA NA
4 test4.v1 test4.v1
Related
I need to get the common columns of a data frame list separated in different data frames. Please look at the following example:
df1 <- data.frame(Dates = c('01-01-2020','02-01-2020','03-01-2020'), col1 = c(1,2,3), col2 = c(3,2,1))
df2 <- data.frame(Dates = c('01-01-2020','02-01-2020','03-01-2020'), col1 = c(4,5,6), col2 = c(6,5,4))
df3 <- data.frame(Dates = c('01-01-2020','02-01-2020'), col1 = c(7,8), col2 = c(8,7))
ldf <- list(df1, df2, df3)
The desired output would be the following two data frames:
df_col1:
Date df1 df2 df3
01-01-2020 1 4 7
02-01-2020 2 5 8
03-01-2020 3 6 NA
df_col2:
Date df1 df2 df3
01-01-2020 3 6 8
02-01-2020 2 5 7
03-01-2020 1 4 NA
Of course, ldf is actually way longer, but the number of columns is fixed to 5, so the number of outputs is also fixed (4). This means I wouldn't mind if I use a block of code for each output.
I've tried several things but none seems to work. I'm using base R and hope to find a solution wihtout additional packages.
Thanks a lot for your time!
We bind the list elements with bind_rows from dplyr, then loop over the 'col' columns, along with the common 'Dates', reshape to 'wide' format with pivot_wider and rename if needed
library(dplyr)
library(purrr)
library(tidyr)
library(stringr)
newdf <- bind_rows(ldf)
out <- map(names(newdf)[-1], ~
newdf %>%
select(Dates, .x) %>%
mutate(rn = rowid(Dates)) %>%
pivot_wider(names_from =rn, values_from = !! rlang::sym(.x)) %>%
rename_at(-1, ~ str_c('df', seq_along(.))))
-output
out
#[[1]]
# A tibble: 3 x 4
# Dates df1 df2 df3
# <chr> <dbl> <dbl> <dbl>
#1 01-01-2020 1 4 7
#2 02-01-2020 2 5 8
#3 03-01-2020 3 6 NA
#[[2]]
# A tibble: 3 x 4
# Dates df1 df2 df3
# <chr> <dbl> <dbl> <dbl>
#1 01-01-2020 3 6 8
#2 02-01-2020 2 5 7
#3 03-01-2020 1 4 NA
Or using base R
newdf <- do.call(rbind, ldf)
f1 <- function(dat, colName) {
lst1 <- split(dat[[colName]], dat$Dates)
do.call(rbind, lapply(lst1, `length<-`, max(lengths(lst1))))
}
f1(newdf, 'col1')
f1(newdf, 'col2')
Another Base R option is to do:
m <- Reduce(function(x,y)merge(x, y, by='Dates', all=TRUE), ldf)
lapply(split.default(m[-1], sub("\\..*", "", names(m[-1]))), cbind, m[1])
Another wordy approach using base R:
#Code
names(ldf) <- paste0('df',1:length(ldf))
#Function
myfun <- function(x) {
y <- reshape(x,direction = 'long',
v.names='col',
idvar = 'Dates',varying = list(2:3))
return(y)
}
z <- do.call(rbind,lapply(ldf,myfun))
z$Data <- gsub("\\..*","",rownames(z))
rownames(z) <- NULL
#Reshape
z2 <- reshape(z,idvar = c('Dates','time'),timevar = 'Data')
#List
List <- split(z2,z2$time)
List
Output:
List
$`1`
Dates time col.df1 col.df2 col.df3
1 01-01-2020 1 1 4 7
2 02-01-2020 1 2 5 8
3 03-01-2020 1 3 6 NA
$`2`
Dates time col.df1 col.df2 col.df3
4 01-01-2020 2 3 6 8
5 02-01-2020 2 2 5 7
6 03-01-2020 2 1 4 NA
I have a tibble with the explicit "id" and colnames I need to convert to NA's. Is there anyway I can create the NA's without making my df a long dataset? I considered using the new rows_update function, but I'm not sure if this is correct because I only want certain columns to be NA.
library(dplyr)
to_na <- tribble(~x, ~col,
1, "z",
3, "y"
)
df <- tibble(x = c(1,2,3),
y = c(1,1,1),
z = c(2,2,2))
# desired output:
#> # A tibble: 3 x 3
#> x y z
#> <dbl> <dbl> <dbl>
#> 1 1 1 NA
#> 2 2 1 2
#> 3 3 NA 2
Created on 2020-07-03 by the reprex package (v0.3.0)
This definitely isn't the most elegant solution, but it gets the output you want.
library(dplyr)
library(purrr)
to_na <- tribble(~x, ~col,
1, "z",
3, "y"
)
df <- tibble(x = c(1,2,3),
y = c(1,1,1),
z = c(2,2,2))
map2(to_na$x, to_na$col, #Pass through these two objects in parallel
function(xval_to_missing, col) df %>% #Two objects above matched by position here.
mutate_at(col, #mutate_at the specified cols
~if_else(x == xval_to_missing, NA_real_, .) #if x == xval_to_missing, make NA, else keep as is.
) %>%
select(x, col) #keep x and the modified column.
) %>% #end of map2
reduce(left_join, by = "x") %>% #merge within the above list, by x.
relocate(x, y, z) #Keep your ordering
Output:
# A tibble: 3 x 3
x y z
<dbl> <dbl> <dbl>
1 1 1 NA
2 2 1 2
3 3 NA 2
We can use row/column indexing to assign the values to NA in base R
df <- as.data.frame(df)
df[cbind(to_na$x, match(to_na$col, names(df)))] <- NA
df
# x y z
#1 1 1 NA
#2 2 1 2
#3 3 NA 2
If we want to use rows_update
library(dplyr)
library(tidyr)
library(purrr)
lst1 <- to_na %>%
mutate(new = NA_real_) %>%
split(seq_len(nrow(.))) %>%
map(~ .x %>%
pivot_wider(names_from = col, values_from = new))
for(i in seq_along(lst1)) df <- rows_update(df, lst1[[i]])
df
# A tibble: 3 x 3
# x y z
# <dbl> <dbl> <dbl>
#1 1 1 NA
#2 2 1 2
#3 3 NA 2
There is my problem that I can't solve it:
Data:
df <- data.frame(f1=c("a", "a", "b", "b", "c", "c", "c"),
v1=c(10, 11, 4, 5, 0, 1, 2))
data.frame:f1 is factor
f1 v1
a 10
a 11
b 4
b 5
c 0
c 1
c 2
# What I want is:(for example, fetch data with the number of element of some level == 2, then to data.frame)
a b
10 4
11 5
Thanks in advance!
I might be missing something simple here , but the below approach using dplyr works.
library(dplyr)
nlevels = 2
df1 <- df %>%
add_count(f1) %>%
filter(n == nlevels) %>%
select(-n) %>%
mutate(rn = row_number()) %>%
spread(f1, v1) %>%
select(-rn)
This gives
# a b
# <int> <int>
#1 10 NA
#2 11 NA
#3 NA 4
#4 NA 5
Now, if you want to remove NA's we can do
do.call("cbind.data.frame", lapply(df1, function(x) x[!is.na(x)]))
# a b
#1 10 4
#2 11 5
As we have filtered the dataframe which has only nlevels observations, we would have same number of rows for each column in the final dataframe.
split might be useful here to split df$v1 into parts corresponding to df$f1. Since you are always extracting equal length chunks, it can then simply be combined back to a data.frame:
spl <- split(df$v1, df$f1)
data.frame(spl[lengths(spl)==2])
# a b
#1 10 4
#2 11 5
Or do it all in one call by combining this with Filter:
data.frame(Filter(function(x) length(x)==2, split(df$v1, df$f1)))
# a b
#1 10 4
#2 11 5
Here is a solution using unstack :
unstack(
droplevels(df[ave(df$v1, df$f1, FUN = function(x) length(x) == 2)==1,]),
v1 ~ f1)
# a b
# 1 10 4
# 2 11 5
A variant, similar to #thelatemail's solution :
data.frame(Filter(function(x) length(x) == 2, unstack(df,v1 ~ f1)))
My tidyverse solution would be:
library(tidyverse)
df %>%
group_by(f1) %>%
filter(n() == 2) %>%
mutate(i = row_number()) %>%
spread(f1, v1) %>%
select(-i)
# # A tibble: 2 x 2
# a b
# * <dbl> <dbl>
# 1 10 4
# 2 11 5
or mixing approaches :
as_tibble(keep(unstack(df,v1 ~ f1), ~length(.x) == 2))
Using all base functions (but you should use tidyverse)
# Add count of instances
x$len <- ave(x$v1, x$f1, FUN = length)
# Filter, drop the count
x <- x[x$len==2, c('f1','v1')]
# Hacky pivot
result <- data.frame(
lapply(unique(x$f1), FUN = function(y) x$v1[x$f1==y])
)
colnames(result) <- unique(x$f1)
> result
a b
1 10 4
2 11 5
I'd like code this, may it helps for you
library(reshape2)
library(dplyr)
aa = data.frame(v1=c('a','a','b','b','c','c','c'),f1=c(10,11,4,5,0,1,2))
cc = aa %>% group_by(v1) %>% summarise(id = length((v1)))
dd= merge(aa,cc) #get the level
ee = dd[dd$aa==2,] #select number of level equal to 2
ee$id = rep(c(1,2),nrow(ee)/2) # reset index like (1,2,1,2)
dcast(ee, id~v1,value.var = 'f1')
all done!
I am looking to check the pattern of missing values according to a class label (dependent variable) in my data. The output I want is the class labels and the number of missing values in the class.
library(tidyverse)
fakeData <- data.frame(var1 = c(1,2,NA,4,NA,6,7,8,9,10),
var2=c(11,NA,NA,14,NA,16,17,NA,19,NA),
Class = c(rep("A", 5), rep("B", 5)))
fakeData %>% group_by(Class) %>% summarize(numMissing = sum(is.na()))
Error in summarise_impl(.data, dots) :
Evaluation error: 0 arguments passed to 'is.na' which requires 1.
What is wrong with my approach here?
I think this is a cleaner solution, using tidyverse only. You don't need to know the number of columns. You can also use ?select_helpers in gather() to select columns, eg. starts_with("var").
fakeData %>%
group_by(Class) %>%
gather(variable, value, -Class) %>% # all except Class
summarise(missing_n = sum(is.na(value)))
# A tibble: 2 x 2
Class missing_n
<fctr> <int>
1 A 5
2 B 2
Perhaps, we can do
fakeData %>%
group_by(Class) %>%
summarise_all(funs(sum(is.na(.)))) %>%
transmute(Class, numMissing = var1 + var2)
If we have many columns, then use purrr::reduce
fakeData %>%
group_by(Class) %>%
summarise_all(funs(sum(is.na(.)))) %>%
transmute(Class, numMissing = .[-1] %>% reduce(`+`))
#or with rowSums
#transmute(Class, numMissing = rowSums(.[-1]))
I would suggest melting dataset in long format using reshape lib. Then just use aggregate function by Class variable.
library(reshape)
fakeData <- data.frame(var1 = c(1,2,NA,4,NA,6,7,8,9,10),
var2=c(11,NA,NA,14,NA,16,17,NA,19,NA),
Class = c(rep("A", 5), rep("B", 5)))
fData <- melt(fakeData, measure.vars = c("var1", "var2"))
fData
Class variable value
1 A var1 1
2 A var1 2
3 A var1 NA
4 A var1 4
5 A var1 NA
6 B var1 6
7 B var1 7
8 B var1 8
9 B var1 9
10 B var1 10
11 A var2 11
12 A var2 NA
13 A var2 NA
14 A var2 14
15 A var2 NA
16 B var2 16
17 B var2 17
18 B var2 NA
19 B var2 19
20 B var2 NA
with(fData, aggregate(value, list(Class), function(x) { sum(is.na(x)) }))
Group.1 x
1 A 5
2 B 2
Given a situation such as the following
library(dplyr)
myData <- tbl_df(data.frame( var1 = rnorm(100),
var2 = letters[1:3] %>%
sample(100, replace = TRUE) %>%
factor(),
var3 = LETTERS[1:3] %>%
sample(100, replace = TRUE) %>%
factor(),
var4 = month.abb[1:3] %>%
sample(100, replace = TRUE) %>%
factor()))
I would like to group `myData' to eventually find summary data grouping by all possible combinations of var2, var3, and var4.
I can create a list with all possible combinations of variables as character values with
groupNames <- names(myData)[2:4]
myGroups <- Map(combn,
list(groupNames),
seq_along(groupNames),
simplify = FALSE) %>%
unlist(recursive = FALSE)
My plan was to make separate data sets for each variable combination with a for() loop, something like
### This Does Not Work
for (i in 1:length(myGroups)){
assign( myGroups[i]%>%
unlist() %>%
paste0(collapse = "")%>%
paste0("Data"),
myData %>%
group_by_(lapply(myGroups[[i]], as.symbol)) %>%
summarise( n = length(var1),
avgVar2 = var2 %>%
mean()))
}
Admittedly I am not very good with lists, and looking up this issue was a bit challenging since dpyr updates have altered how grouping works a bit.
If there is a better way to do this than separate data sets I would love to know.
I've gotten a loop similar to above working when I am only grouping by a single variable.
Any and all help is greatly appreciated! Thank you!
This seems convulated, and there's probably a way to simplify or fancy it up with a do, but it works. Using your myData and myGroups,
results = lapply(myGroups, FUN = function(x) {
do.call(what = group_by_, args = c(list(myData), x)) %>%
summarise( n = length(var1),
avgVar1 = mean(var1))
}
)
> results[[1]]
Source: local data frame [3 x 3]
var2 n avgVar1
1 a 31 0.38929738
2 b 31 -0.07451717
3 c 38 -0.22522129
> results[[4]]
Source: local data frame [9 x 4]
Groups: var2
var2 var3 n avgVar1
1 a A 11 -0.1159160
2 a B 11 0.5663312
3 a C 9 0.7904056
4 b A 7 0.0856384
5 b B 13 0.1309756
6 b C 11 -0.4192895
7 c A 15 -0.2783099
8 c B 10 -0.1110877
9 c C 13 -0.2517602
> results[[7]]
# I won't paste them here, but it has all 27 rows, grouped by var2, var3 and var4.
I changed your summarise call to average var1 since var2 isn't numeric.
I have created a function based on the answer of #Gregor and the comments that followed:
library(magrittr)
myData <- tbl_df(data.frame( var1 = rnorm(100),
var2 = letters[1:3] %>%
sample(100, replace = TRUE) %>%
factor(),
var3 = LETTERS[1:3] %>%
sample(100, replace = TRUE) %>%
factor(),
var4 = month.abb[1:3] %>%
sample(100, replace = TRUE) %>%
factor()))
Function combSummarise
combSummarise <- function(data, variables=..., summarise=...){
# Get all different combinations of selected variables (credit to #Michael)
myGroups <- lapply(seq_along(variables), function(x) {
combn(c(variables), x, simplify = FALSE)}) %>%
unlist(recursive = FALSE)
# Group by selected variables (credit to #konvas)
df <- eval(parse(text=paste("lapply(myGroups, function(x){
dplyr::group_by_(data, .dots=x) %>%
dplyr::summarize_( \"", paste(summarise, collapse="\",\""),"\")})"))) %>%
do.call(plyr::rbind.fill,.)
groupNames <- c(myGroups[[length(myGroups)]])
newNames <- names(df)[!(names(df) %in% groupNames)]
df <- cbind(df[, groupNames], df[, newNames])
names(df) <- c(groupNames, newNames)
df
}
Call of combSummarise
combSummarise (myData, var=c("var2", "var3", "var4"),
summarise=c("length(var1)", "mean(var1)", "max(var1)"))
or
combSummarise (myData, var=c("var2", "var4"),
summarise=c("length(var1)", "mean(var1)", "max(var1)"))
or
combSummarise (myData, var=c("var2", "var4"),
summarise=c("length(var1)"))
etc
Inspired by the answers by Gregor and dimitris_ps, I wrote a dplyr style function that runs summarise for all combinations of group variables.
summarise_combo <- function(data, ...) {
groupVars <- group_vars(data) %>% map(as.name)
groupCombos <- map( 0:length(groupVars), ~combn(groupVars, ., simplify=FALSE) ) %>%
unlist(recursive = FALSE)
results <- groupCombos %>%
map(function(x) {data %>% group_by(!!! x) %>% summarise(...)} ) %>%
bind_rows()
results %>% select(!!! groupVars, everything())
}
Example
library(tidyverse)
mtcars %>% group_by(cyl, vs) %>% summarise_combo(cyl_n = n(), mean(mpg))
Using unite to create a new column is the simplest way
library(tidyverse)
df = tibble(
a = c(1,1,2,2,1,1,2,2),
b = c(3,4,3,4,3,4,3,4),
val = c(1,2,3,4,5,6,7,8)
)
print(df)#output1
df_2 = unite(df, 'combined_header', a, b, sep='_', remove=FALSE) #remove=F doesn't remove existing columns
print(df_2)#output2
df_2 %>% group_by(combined_header) %>%
summarize(avg_val=mean(val)) %>% print()#output3
#avg 1_3 = mean(1,5)=3 avg 1_4 = mean(2, 6) = 4
RESULTS
Output:
output1
a b val
<dbl> <dbl> <dbl>
1 1 3 1
2 1 4 2
3 2 3 3
4 2 4 4
5 1 3 5
6 1 4 6
7 2 3 7
8 2 4 8
output2
combined_header a b val
<chr> <dbl> <dbl> <dbl>
1 1_3 1 3 1
2 1_4 1 4 2
3 2_3 2 3 3
4 2_4 2 4 4
5 1_3 1 3 5
6 1_4 1 4 6
7 2_3 2 3 7
8 2_4 2 4 8
output3
combined_header avg_val
<chr> <dbl>
1 1_3 3
2 1_4 4
3 2_3 5
4 2_4 6