I want to produce output that shows my df sorted by the number of NAs in each row (as in the df_rows_sorted_by_NAs column below) but that keeps the original row name/number (df col). The combination would look like column 3 below:
# df_rows_sorted_by_NAs df desired_output
# Row 1 : 38 Row 442 : 37 Row 3112 : 38
# Row 2 : 38 Row 3112 : 38 Row 3113 : 38
# Row 3 : 37 Row 3113 : 38 Row 442 : 37
# Row 18 : 30 Row 1128 : 30 Row 1128 : 30
I get the first output with this:
# Sort df by num of NAs
df_rows_sorted_by_NAs <- df[order(rowSums(is.na(df)), decreasing = TRUE), drop = FALSE, ]
# View obs with >=30 NAs
for (row_name in row.names(df_rows_sorted_by_NAs)) {
if (rowSums(is.na(df_rows_sorted_by_NAs[row_name,])) >= 30) {
cat("Row ", row_name, ": ",
rowSums(is.na(df_rows_sorted_by_NAs[row_name,])), "\n")
}
}
I get the second output with this:
for (row_name in row.names(df)) {
if (rowSums(is.na(df[row_name,])) >= 30) {
cat("Row ", row_name, ": ", rowSums(is.na(df[row_name,])), "\n")
}
}
I tried drop = FALSE for order but got the same result. Any suggestions on how to keep the row names when I create the new df?
This seems to work for me:
a <- c(1, 2, 3)
b<- c(1, NA, 3)
c <- c(NA, NA, 3)
d <- c(1, NA, NA)
e <- c(NA, 2, 3)
df <- data.frame(a, b, c, d, e)
df
df <- df[order(rowSums(is.na(df)), decreasing = TRUE),]
df
gives
a b c d e
1 1 1 NA 1 NA
2 2 NA NA NA 2
3 3 3 3 NA 3
then
a b c d e
2 2 NA NA NA 2
1 1 1 NA 1 NA
3 3 3 3 NA 3
and then
df[rowSums(is.na(df)) >1,]
a b c d e
2 2 NA NA NA 2
1 1 1 NA 1 NA
Is the actual question how do you put "Row:" in front?
# Build one "Row <name>: <NA count>" label per row that has more than one NA.
# The counts are computed once and filtered with the same mask as the row
# names, so both vectors have the same length and paste0() never recycles.
na_counts <- rowSums(is.na(df))
keep <- na_counts > 1
paste0("Row ", row.names(df)[keep], ": ", na_counts[keep])
Gives you the vector with the strings, you can make that print vertically but that's a different question than getting the sort done.
The tidyverse package is good for these tasks:
library(tidyverse)
An example dataframe:
df <- tribble(
~Length, ~Width, ~Mass, ~Date,
10.3, 3.1, 0.021, "2018-11-28",
NA, 3.3, NA, "2018-11-29",
10.5, NA, 0.025, "2018-11-30"
)
With package dplyr, you can create an ID column and "number of NAs" column with row_number() and rowSums. Of course, if you already have a row ID column, then you can remove ID = row_number() from mutate:
df %>%
mutate(ID = row_number(), noNAs = rowSums(is.na(.)))
... results in ...
# A tibble: 3 x 6
Length Width Mass Date ID noNAs
<dbl> <dbl> <dbl> <chr> <int> <dbl>
1 10.3 3.1 0.021 2018-11-28 1 0
2 NA 3.3 NA 2018-11-29 2 2
3 10.5 NA 0.025 2018-11-30 3 1
... adding select by ID and noNAs, arranging by noNAs (in descending order):
df <- df %>%
mutate(ID = row_number(), noNAs = rowSums(is.na(.)))%>%
select(ID, noNAs) %>%
arrange(desc(noNAs))
... results in ...
# A tibble: 3 x 2
ID noNAs
<int> <dbl>
1 2 2
2 3 1
3 1 0
Finally, if you wanted to filter for rows with more than 30 NAs, then:
df %>% filter(noNAs > 30)
Related
I have the following dataframe df and list l (dput below):
> df
group value
1 A 1
2 B 2
3 C 3
> l
$A
[1] 999
$B
[1] 55
I would like to join the values of the list to the dataframe based on the names in the list with the group variable of the dataframe and call it "value_l". The expected output should look like this:
group value value_l
1 A 1 999
2 B 2 55
3 C 3 NA
So I was wondering if anyone knows how to join a list to a dataframe based on their names?
dput df and l:
df <- structure(list(group = c("A", "B", "C"), value = c(1, 2, 3)), class = "data.frame", row.names = c(NA,
-3L))
l <- list(A = 999, B = 55)
You can use match. If df$group is a character vector (which is the case here), it can also be used directly to subset the list.
df$value_l <- l[match(df$group, names(l))]
#df$value_l <- l[df$group] #Short alternative by #akrun works only in case df$group is a character, but not for factor or numeric
#df$value_l <- l[as.character(df$group)] #Maybe more secure
df
# group value value_l
#1 A 1 999
#2 B 2 55
#3 C 3 NULL
In case there is a need for NA, instead of NULL use in addition:
df$value_l[vapply(df$value_l, is.null, TRUE)] <- NA
df
# group value value_l
#1 A 1 999
#2 B 2 55
#3 C 3 NA
Or make it in single steps:
. <- match(df$group, names(l))
df$value_l <- l[.]
is.na(df$value_l) <- is.na(.)
Here we have joined a list to a data.frame.
str(df)
#'data.frame': 3 obs. of 3 variables:
# $ group : chr "A" "B" "C"
# $ value : num 1 2 3
# $ value_l:List of 3
# ..$ A : num 999
# ..$ B : num 55
# ..$ NA: logi NA
If the list can be transformed to a vector, you can call unlist first (thanks to #G. Grothendieck for the comment). Note that in this case we have joined a vector, not a list, to the data.frame.
df$value_l <- unlist(l)[match(df$group, names(l))]
#df$value_l <- unlist(l)[as.character(df$group)] #Option like shown above
df
# group value value_l
#1 A 1 999
#2 B 2 55
#3 C 3 NA
str(df)
#'data.frame': 3 obs. of 3 variables:
# $ group : chr "A" "B" "C"
# $ value : num 1 2 3
# $ value_l: num 999 55 NA
Another option, which also joins a vector to the data.frame, is to use merge.
merge(df, unlist(l), by.x="group", by.y=0, all.x = TRUE)
# group value y
#1 A 1 999
#2 B 2 55
#3 C 3 NA
Note: For the given list the results look similar but this will not be the case if the list looks e.g. like:
l <- list(A = 999, B = c(7, 55), A = 9)
A potential solution might be:
Taking first match:
df$value_l <- l[as.character(df$group)]
df
# group value value_l
#1 A 1 999
#2 B 2 7, 55
#3 C 3 NULL
Making a left Join
merge(df, list2DF(list(group = names(l), value_l = l)), all.x=TRUE)
#merge(df, data.frame(group = names(l), value_l = I(l)), all.x=TRUE) #Alternative
# group value value_l
#1 A 1 999
#2 A 1 9
#3 B 2 7, 55
#4 C 3 NA
Other options.
merge(df, list2DF(list(group = names(l), value_l = l))) #Inner
merge(df, list2DF(list(group = names(l), value_l = l)), all=TRUE) #Outer
merge(df, list2DF(list(group = names(l), value_l = l)), all.y=TRUE) #Right
For other options please have a look at How to join (merge) data frames (inner, outer, left, right).
Use merge from base R
merge(df, stack(l), by.x = 'group', by.y = 'ind', all.x = TRUE)
group value values
1 A 1 999
2 B 2 55
3 C 3 NA
Or with dplyr
library(dplyr)
df %>%
rowwise %>%
mutate(value_l = if(group %in% names(l)) l[[group]] else NA) %>%
ungroup
-output
# A tibble: 3 × 3
group value value_l
<chr> <dbl> <dbl>
1 A 1 999
2 B 2 55
3 C 3 NA
Or using enframe/unnest
library(tidyr)
library(tibble)
enframe(l, name = 'group', value = 'value_l') %>%
unnest(value_l) %>%
left_join(df, .)
group value value_l
1 A 1 999
2 B 2 55
3 C 3 NA
Or if it can be a list column
df$value_l <- l[df$group]
> df
group value value_l
1 A 1 999
2 B 2 55
3 C 3 NULL
You can do:
library(tidyverse)
l |>
as.data.frame() |>
pivot_longer(cols = everything(),
names_to = "group",
values_to = "value_1") |>
left_join(x = df,
y = _,
by = "group")
which gives:
group value value_1
1 A 1 999
2 B 2 55
3 C 3 NA
Update:
Maybe this one:
library(dplyr)
stack(unlist(l)) %>%
full_join(df, by=c("ind"="group"))
values ind value
1 999 A 1
2 55 B 2
3 NA C 3
First answer:
Slightly different:
library(dplyr)
library(tidyr)
bind_rows(l) %>%
pivot_longer(everything()) %>%
full_join(df, by=c("name"="group")) %>%
select(name, value = value.y, value_l=value.x)
name value value_l
<chr> <dbl> <dbl>
1 A 1 999
2 B 2 55
3 C 3 NA
This is a simpler version of what GKi suggested with unlist(). If your list always has a name and a single numeric value, you can convert it to a named vector and then use it as a lookup vector, which is simpler than doing merges or matches:
temp_vec = unlist(l)
df$l_value = temp_vec[df$group]
df
group value l_value
1 A 1 999
2 B 2 55
3 C 3 NA
Without the intermediate variable for a single line solution:
df$l_value = unlist(l)[df$group]
df
group value l_value
1 A 1 999
2 B 2 55
3 C 3 NA
Depending on what else you need the list for, it may even make sense just to use a named vector instead of a list in the first place.
I have made a very complex solution to something I feel should have a much simpler solution.
In short what I want:
I want to compute a new column containing the minimum value across 3 columns
I want to ignore zeros and NAs
If I only have zeros and NAs I want a zero
If I have only NAs I want a NA
Here is my solution, it works, but it is very complex and produces a warning.
> library(dplyr)
> df <- data.frame(
+ id = c(1, 2, 3, 4),
+ test1 = c( NA, NA, 2 , 3),
+ test2 = c( NA, 0, 1 , 1),
+ test3 = c(NA, NA, 0 , 2)
+ )
> df2 <- df %>%
+ mutate(nieuw = apply(across(test1:test3), 1, function(x) min(x[x>0]))) %>%
+ rowwise() %>%
+ mutate(nieuw = if_else(is.na(nieuw), max(across(test1:test3), na.rm = TRUE), nieuw)) %>%
+ mutate(nieuw = ifelse(is.infinite(nieuw), NA, nieuw))
> df
id test1 test2 test3
1 1 NA NA NA
2 2 NA 0 NA
3 3 2 1 0
4 4 3 1 2
> df2
# A tibble: 4 x 5
# Rowwise:
id test1 test2 test3 nieuw
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 NA NA NA NA
2 2 NA 0 NA 0
3 3 2 1 0 1
4 4 3 1 2 1
Warning message:
Problem while computing `nieuw = if_else(...)`.
i no non-missing arguments to max; returning -Inf
i The warning occurred in row 1.
You can create a helper function and then apply it rowwise:
library(dplyr)
# Apply a summary function to the strictly positive entries of a vector,
# with explicit handling of vectors that contain nothing to summarise.
#
# Arguments:
#   x   - vector of values (e.g. one row); may contain NA and 0.
#   f   - summary function that accepts `na.rm`, such as min or max.
#   ... - further arguments forwarded to `f`.
#
# Returns NA when every entry is NA, 0 when every entry is NA or 0, and
# otherwise f() of the entries greater than zero with NAs removed.
safe <- function(x, f, ...) {
  missing_mask <- is.na(x)
  # Nothing observed at all.
  if (all(missing_mask)) {
    return(NA)
  }
  # Only zeros (and possibly NAs) observed.
  if (all(missing_mask | x == 0)) {
    return(0)
  }
  # Plain if/else instead of ifelse(): the conditions are scalar, and
  # ifelse() would strip attributes and truncate the result to length one.
  f(x[x > 0], na.rm = TRUE, ...)
}
df %>%
rowwise() %>%
mutate(a = safe(c_across(test1:test3), min))
# A tibble: 4 × 5
# Rowwise:
id test1 test2 test3 a
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 NA NA NA NA
2 2 NA 0 NA 0
3 3 2 1 0 1
4 4 3 1 2 1
Here is another option. It leverages making zeros and NA's very large and then recodes them at the end:
library(tidyverse)
# Row-wise minimum over the columns `cols` of `data`, ignoring NAs and zeros.
#
# Arguments:
#   data - a data frame (or tibble).
#   cols - columns to compare, as accepted by `data[cols]`.
#
# Returns a numeric vector with one entry per row:
#   NA  when every selected value in the row is NA,
#   0   when the row holds only zeros and NAs,
#   otherwise the smallest non-zero, non-NA value.
#
# No sentinel values such as 1e6 / 1e5 are used, so genuinely large values in
# the data are handled correctly, and columns outside `cols` are never touched.
get_min <- function(data, cols) {
  columns <- unname(as.list(data[cols]))
  # TRUE where a value was observed / may take part in the minimum.
  observed <- lapply(columns, function(v) !is.na(v))
  usable <- lapply(columns, function(v) !is.na(v) & v != 0)
  # Mask everything that must be ignored with Inf so pmin() skips over it.
  masked <- Map(
    function(v, ok) replace(as.numeric(v), !ok, Inf),
    columns,
    usable
  )
  result <- do.call(pmin, unname(masked))
  # Rows without any usable value: 0 if something was observed, NA otherwise.
  result[!Reduce(`|`, usable)] <- 0
  result[!Reduce(`|`, observed)] <- NA_real_
  result
}
df %>%
mutate(nieuw = get_min(., c("test1", "test2", "test3")))
#> id test1 test2 test3 nieuw
#> 1 1 NA NA NA NA
#> 2 2 NA 0 NA 0
#> 3 3 2 1 0 1
#> 4 4 3 1 2 1
lets say I have the following data frame:
dt <- data.frame(id= c(1),
parameter= c("a","b","c"),
start_day = c(1,8,4),
end_day = c(16,NA,30))
I need to combine the start_day and end_day columns (let's call the new column day) such that I preserve all the other columns. I also need to create another column that indicates whether each row shows start_day or end_day. To clarify, I am looking to create the following data frame
I am creating the above data frame using the following code:
dt1 <- subset(dt, select = -c(end_day))
dt1 <- dt1 %>% rename(day = start_day)
dt1$start <- 1
dt2 <- subset(dt, select = -c(start_day))
dt2 <- dt2 %>% rename(day = end_day)
dt2$end <- 1
dt <- bind_rows(dt1, dt2)
dt <- dt[order(dt$id, dt$parameter),]
Although my code works, I am not happy with my solution. I am certain that there is a better and cleaner way to do this. I would appreciate any input on better alternatives for tackling this problem.
(tidyr::pivot_longer(dt, cols = c(start_day, end_day), values_to = "day")
|> dplyr::mutate(start = ifelse(name == "start_day", 1, NA),
end = ifelse(name == "end_day", 1, NA))
)
Result:
# A tibble: 6 × 6
id parameter name day start end
<dbl> <chr> <chr> <dbl> <dbl> <dbl>
1 1 a start_day 1 1 NA
2 1 a end_day 16 NA 1
3 1 b start_day 8 1 NA
4 1 b end_day NA NA 1
5 1 c start_day 4 1 NA
6 1 c end_day 30 NA 1
You could get rid of the name column, but maybe it would be more useful than your new start/end columns?
using base R (faster than data.table up to ~300 rows; faster than tidyr up to ~1k rows) :
cbind(dt[1:2], day = c(dt$start_day,dt$end_day)) |>
(\(x) x[order(x$id, x$parameter),])() |>
(`[[<-`)("start", value = c(1, NA)) |>
(`[[<-`)("end", value = c(NA, 1))
id parameter day start end
1 1 a 1 1 NA
4 1 a 16 NA 1
2 1 b 8 1 NA
5 1 b NA NA 1
3 1 c 4 1 NA
6 1 c 30 NA 1
using the data.table package (faster than tidyr up to ~500k rows) :
dt <- as.data.table(dt)
# Stack start_day and end_day into a single `day` column per (id, parameter)
# group. c(start_day, end_day) lists all start values first and then all end
# values, so the indicator columns use `each = .N` to stay aligned with `day`
# even when a group holds more than one row.
dt[, .(day = c(start_day, end_day),
       start = rep(c(1, NA), each = .N),
       end = rep(c(NA, 1), each = .N)),
   by = .(id, parameter)]
id parameter day start end
1: 1 a 1 1 NA
2: 1 a 16 NA 1
3: 1 b 8 1 NA
4: 1 b NA NA 1
5: 1 c 4 1 NA
6: 1 c 30 NA 1
Slightly difficult to phrase, as far as I saw none of the similar questions answered my problem.
I have a data.frame such as:
df1 <- data.frame(id = rep(c("a", "b","c"), each = 4),
val = c(NA, NA, NA, NA, 1, 2, 2, 3,NA,2,NA,3))
df1
id val
1 a NA
2 a NA
3 a NA
4 a NA
5 b 1
6 b 2
7 b 2
8 b 3
9 c NA
10 c 2
11 c NA
12 c 3
and I want to get rid of all the NA values (easy enough using e.g. filter() ) but make sure that if this removes all of one id value (in this case it removes every instance of "a") that one extra row is inserted of (e.g.) a = 0
so that:
id val
1 a 0
2 b 1
3 b 2
4 b 2
5 b 3
6 c 2
7 c 3
obviously easy enough to do this in a roundabout way but I was wondering if there's a tidy/elegant way to do this. I thought tidyr::complete() might help but not entirely sure how to apply it to a case like this
I don't care about the order of the rows
Cheers!
edit: updated with clearer desired output. might make desired answers submitted before that a bit less clear
Another idea using dplyr,
library(dplyr)
df1 %>%
group_by(id) %>%
mutate(val = ifelse(row_number() == 1 & all(is.na(val)), 0, val)) %>%
na.omit()
which gives,
# A tibble: 5 x 2
# Groups: id [2]
id val
<fct> <dbl>
1 a 0
2 b 1
3 b 2
4 b 2
5 b 3
We may do
df1 %>% group_by(id) %>% do(if(all(is.na(.$val))) replace(.[1, ], 2, 0) else na.omit(.))
# A tibble: 5 x 2
# Groups: id [2]
# id val
# <fct> <dbl>
# 1 a 0
# 2 b 1
# 3 b 2
# 4 b 2
# 5 b 3
After grouping by id, if everything in val is NA, then we leave only the first row with the second element replaced by 0, otherwise the same data is returned after applying na.omit.
In a more readable format that would be
df1 %>% group_by(id) %>%
do(if(all(is.na(.$val))) data.frame(id = .$id[1], val = 0) else na.omit(.))
(Here I presume that you indeed want to get rid of all NA values; otherwise there is no need for na.omit.)
# Flag the ids whose values are all NA and give only those a val of 0.
all_na <- as.logical(ave(is.na(df1$val), df1$id, FUN = all))
df1$val[all_na] <- 0
# Keep every non-NA row, but only the first row of each all-NA id. NAs of
# partially observed ids are dropped and genuine zeros are left untouched.
df1[!is.na(df1$val) & !(all_na & duplicated(df1$id)), ]
id val
1 a 0
5 b 1
6 b 2
7 b 2
8 b 3
Base R option is to find groups with all NAs and transform them by changing their val to 0 and select only unique rows so that there is only one row per group. We rbind this dataframe with the groups which are !all_NA.
# TRUE for every row that belongs to an id whose values are all NA.
all_NA <- with(df1, ave(is.na(val), id, FUN = all))
# One zero-valued row per all-NA id, followed by the non-NA rows of the
# remaining ids (the NA rows of partially observed ids are dropped as well).
rbind(unique(transform(df1[all_NA, ], val = 0)),
      df1[!all_NA & !is.na(df1$val), ])
# id val
#1 a 0
#5 b 1
#6 b 2
#7 b 2
#8 b 3
The dplyr option looks ugly, but one way is to make two groups of dataframes: one with the groups whose values are all NA and the other with the groups whose values are all non-NA. For the groups with all NA values, we add a row with its id and val as 0 and bind this to the other group.
library(dplyr)
bind_rows(
  # Every non-missing observation. Filtering row-wise (instead of with
  # all(!is.na(val))) keeps the valid rows of ids that are only partly NA.
  df1 %>%
    group_by(id) %>%
    filter(!is.na(val)),
  # One zero-valued row for each id whose values are all NA.
  df1 %>%
    group_by(id) %>%
    filter(all(is.na(val))) %>%
    ungroup() %>%
    distinct(id) %>%
    mutate(val = 0)
) %>%
  arrange(id)
# id val
# <fct> <dbl>
#1 a 0
#2 b 1
#3 b 2
#4 b 2
#5 b 3
Changed the df to make example more exhaustive -
df1 <- data.frame(id = rep(c("a", "b","c"), each = 4),
val = c(NA, NA, NA, NA, 1, 2, 2, 3,NA,2,NA,3))
library(dplyr)
df1 %>%
group_by(id) %>%
mutate(case=sum(is.na(val))==n(), row_num=row_number() ) %>%
mutate(val=ifelse(is.na(val)&case,0,val)) %>%
filter( !(case&row_num!=1) ) %>%
select(id, val)
Output
id val
<fct> <dbl>
1 a 0
2 b 1
3 b 2
4 b 2
5 b 3
6 c NA
7 c 2
8 c NA
9 c 3
Another base approach, one that doesn't maintain the order of the rows and takes advantage of factors remembering lost values:
df1 <- na.omit(df1)
df1 <- rbind(
df1,
data.frame(
id = levels(df1$id)[!levels(df1$id) %in% df1$id],
val = 0)
)
I do personally prefer the dplyr approach given by Sotos, as I don't like rbind-ing data.frames back together so it's a matter of taste, but this isn't unbearably complicated by my eye. It's easy enough to adapt to a character id column with a unique(df1$id) variable.
Here is an option too:
df1 %>%
mutate_if(is.factor,as.character) %>%
mutate_all(funs(replace(.,is.na(.),0))) %>%
slice(4:nrow(.))
This gives:
id val
1 a 0
2 b 1
3 b 2
4 b 2
5 b 3
Alternative:
df1 %>%
mutate_if(is.factor,as.character) %>%
mutate_all(funs(replace(.,is.na(.),0))) %>%
unique()
UPDATE based on other requirements:
Some users suggested to test on this dataframe. Of course this answer assumes you'll look at everything by hand. Might be less useful if you have to look at everything by "hand" but here goes:
df1 <- data.frame(id = rep(c("a", "b","c"), each = 4), val = c(NA, NA, NA, NA, 1, 2, 2, 3,NA,2,NA,3))
df1 %>%
mutate_if(is.factor,as.character) %>%
mutate(val=ifelse(id=="a",0,val)) %>%
slice(4:nrow(.))
This yields:
id val
1 a 0
2 b 1
3 b 2
4 b 2
5 b 3
6 c NA
7 c 2
8 c NA
9 c 3
Here is a base R solution.
# Process each id separately: an id whose values are all NA collapses to a
# single row with val = 0; any other id simply loses its NA rows.
res <- lapply(split(df1, df1$id), function(DF) {
  missing_val <- is.na(DF$val)
  # nrow() guard: split() yields empty pieces for unused factor levels.
  if (nrow(DF) > 0 && all(missing_val)) {
    DF <- DF[1, , drop = FALSE]
    DF$val <- 0
  } else {
    DF <- DF[!missing_val, , drop = FALSE]
  }
  DF
})
# Stitch the per-id pieces back together and renumber the rows.
res <- do.call(rbind, res)
row.names(res) <- NULL
res
# id val
#1 a 0
#2 b 1
#3 b 2
#4 b 2
#5 b 3
Edit.
A dplyr solution could be the following.
It was tested with the original dataset posted by the OP, with the dataset in Vivek Kalyanarangan's answer and with the dataset in markus' comment, renamed df2 and df3, respectively.
library(dplyr)
# Remove NA values of `val` per `id`, keeping a single val = 0 row for every
# id whose values are all NA.
#
# DF: data frame with columns `id` and `val`.
# Returns a tibble grouped by `id`. Rows of partially observed ids that hold
# NA are dropped; genuine zeros in the data are kept as they are.
na2zero <- function(DF) {
  DF %>%
    group_by(id) %>%
    mutate(all_na = all(is.na(val))) %>%
    # Keep observed values, plus the first row of each all-NA id.
    filter(!is.na(val) | (all_na & row_number() == 1)) %>%
    mutate(val = ifelse(all_na, 0, val)) %>%
    select(-all_na)
}
na2zero(df1)
na2zero(df2)
na2zero(df3)
One may try this :
df1 = data.frame(id = rep(c("a", "b","c"), each = 4),
val = c(NA, NA, NA, NA, 1, 2, 2, 3,NA,2,NA,3))
df1
# id val
#1 a NA
#2 a NA
#3 a NA
#4 a NA
#5 b 1
#6 b 2
#7 b 2
#8 b 3
#9 c NA
#10 c 2
#11 c NA
#12 c 3
Task is to remove all rows corresponding to any id IFF val for the corresponding id is all NAs and add new row with this id and val = 0.
In this example, id = a.
Note : val for c also has NAs but all the val corresponding to c are not NA therefore we need to remove the corresponding row for c where val = NA.
So let's create another column, say val2, which is 0 when all the values for the id are NA and 1 otherwise.
library(dplyr)
df1 = df1 %>%
group_by(id) %>%
mutate(val2 = if_else(condition = all(is.na(val)),true = 0, false = 1))
df1
# A tibble: 12 x 3
# Groups: id [3]
# id val val2
# <fct> <dbl> <dbl>
#1 a NA 0
#2 a NA 0
#3 a NA 0
#4 a NA 0
#5 b 1 1
#6 b 2 1
#7 b 2 1
#8 b 3 1
#9 c NA 1
#10 c 2 1
#11 c NA 1
#12 c 3 1
Get the list of ids with corresponding val = NA for all.
all_na = unique(df1$id[df1$val2 == 0])
Then remove the rows with val = NA from the dataframe df1.
df1 = na.omit(df1)
df1
# A tibble: 6 x 3
# Groups: id [2]
# id val val2
# <fct> <dbl> <dbl>
# 1 b 1 1
# 2 b 2 1
# 3 b 2 1
# 4 b 3 1
# 5 c 2 1
# 6 c 3 1
And create a new dataframe with ids in all_na and val = 0
all_na_df = data.frame(id = all_na, val = 0)
all_na_df
# id val
# 1 a 0
then combine these two dataframes.
df1 = bind_rows(all_na_df, df1[,c('id', 'val')])
df1
# id val
# 1 a 0
# 2 b 1
# 3 b 2
# 4 b 2
# 5 b 3
# 6 c 2
# 7 c 3
Hope this helps and Edits are most welcomed :-)
I have a dataframe with a lot of variables seen in multiple conditions. I'd like to merge each variable by condition.
The example data frame is a simplified version of what I have (3 variables over 2 conditions).
VAR.B_1 <- c(1, 2, 3, 4, 5, 'NA', 'NA', 'NA', 'NA', 'NA')
VAR.B_2 <- c(2, 2, 3, 4, 5,'NA', 'NA', 'NA', 'NA', 'NA')
VAR.B_3 <- c(1, 1, 1, 1, 1,'NA', 'NA', 'NA', 'NA', 'NA')
VAR.E_1 <- c(NA, NA, NA, NA, NA, 1, 1, 1, 1, 1)
VAR.E_2 <- c(NA, NA, NA, NA, NA, 1, 2, 3, 4, 5)
VAR.E_3 <- c(NA, NA, NA, NA, NA, 1, 1, 1, 1, 1)
Condition <- c("B", "B","B","B","B","E","E","E","E","E")
#Example dataset
data<-as.data.frame(cbind(VAR.B_1,VAR.B_2,VAR.B_3, VAR.E_1,VAR.E_2, VAR.E_3, Condition))
I want to end up with this, appended to the original data frame:
VAR_1 VAR_2 VAR_3
1 2 1
2 2 1
3 3 1
4 4 1
5 5 1
1 1 1
1 2 1
1 3 1
1 4 1
1 5 1
I understand that R won't work with i inside the variable name, but I have an example of the kind of for loop I was trying to do. I would rather not call variables by column location, since there will be a lot of variables.
##Example of how I want to merge - this code does not work
for(i in 1:3) {
data$VAR_[,i] <-ifelse(data$Condition == "B", VAR.B_[,i],
ifelse(data$Condition == "E", VAR.E_[,i], NA))
}
This might work for your situation:
library(tidyverse)
library(stringr)
data %>%
mutate_all(as.character) %>%
gather(key, value, -Condition) %>%
filter(!is.na(value), value != "NA") %>%
mutate(key = str_replace(key, paste0("\\.", Condition), "")) %>%
group_by(Condition, key) %>%
mutate(rowid = 1:n()) %>%
spread(key, value) %>%
bind_cols(data)
#> # A tibble: 10 x 12
#> # Groups: Condition [2]
#> Condition rowid VAR_1 VAR_2 VAR_3 VAR.B_1 VAR.B_2 VAR.B_3 VAR.E_1
#> <chr> <int> <chr> <chr> <chr> <fctr> <fctr> <fctr> <fctr>
#> 1 B 1 1 2 1 1 2 1 NA
#> 2 B 2 2 2 1 2 2 1 NA
#> 3 B 3 3 3 1 3 3 1 NA
#> 4 B 4 4 4 1 4 4 1 NA
#> 5 B 5 5 5 1 5 5 1 NA
#> 6 E 1 1 1 1 NA NA NA 1
#> 7 E 2 1 2 1 NA NA NA 1
#> 8 E 3 1 3 1 NA NA NA 1
#> 9 E 4 1 4 1 NA NA NA 1
#> 10 E 5 1 5 1 NA NA NA 1
#> # ... with 3 more variables: VAR.E_2 <fctr>, VAR.E_3 <fctr>,
#> # Condition1 <fctr>
data.frame(lapply(split.default(data[-NCOL(data)], gsub("\\D+", "", head(names(data), -1))),
function(a){
a = sapply(a, function(x) as.numeric(as.character(x)))
rowSums(a, na.rm = TRUE)
}))
# X1 X2 X3
#1 1 2 1
#2 2 2 1
#3 3 3 1
#4 4 4 1
#5 5 5 1
#6 1 1 1
#7 1 2 1
#8 1 3 1
#9 1 4 1
#10 1 5 1
#Warning messages:
#1: In FUN(X[[i]], ...) : NAs introduced by coercion
#2: In FUN(X[[i]], ...) : NAs introduced by coercion
#3: In FUN(X[[i]], ...) : NAs introduced by coercion
Your data appears to have two kinds of NA values in it. It has NA, or R's NA value, and it also has the string 'NA'. In my solution below, I replace both with zero, cast each column in the data frame to numeric, and then just sum together like-numbered VAR columns. Then, drop the original columns which you don't want anymore.
data <- as.data.frame(cbind(VAR.B_1,VAR.B_2,VAR.B_3, VAR.E_1,VAR.E_2, VAR.E_3),
stringsAsFactors=FALSE)
data[is.na(data)] <- 0
data[data == 'NA'] <- 0
data <- as.data.frame(lapply(data, as.numeric))
data$VAR_1 <- data$VAR.B_1 + data$VAR.E_1
data$VAR_2 <- data$VAR.B_2 + data$VAR.E_2
data$VAR_3 <- data$VAR.B_3 + data$VAR.E_3
data <- data[c("VAR_1", "VAR_2", "VAR_3")]
Demo