# Example data: one row per student. CAT, DOG and MOUSE hold the candidate
# values; WANT is the hand-written expected result.
data <- data.frame(
  STUDENT = as.numeric(1:8),
  CAT = c(NA, NA, 1, 2, 3, NA, NA, 0),
  DOG = c(NA, NA, 2, 3, 2, NA, 1, NA),
  MOUSE = c(2, 3, NA, NA, NA, NA, NA, NA),
  WANT = c(2, 3, 2, 2, 3, NA, NA, NA)
)
I have 'data' and wish to create the 'WANT' variable: for each row, it takes the first non-NA value that is not equal to '1' or '0' and stores it in 'WANT'. The code above shows an example of what I hope to get.
We can use coalesce after changing the values 0, 1 in the selected columns to NA, then bind the column with the original dataset
library(dplyr)
# Blank out 0/1 in CAT..MOUSE, take the first remaining non-NA value per row,
# and attach it to the original data as WANT2.
data %>%
# keep only CAT:MOUSE, with every 0 or 1 replaced by NA
transmute(across(CAT:MOUSE, ~ replace(., . %in% 0:1, NA))) %>%
# splice the cleaned columns into coalesce(), which returns the first non-NA per row
transmute(WANT2 = coalesce(!!! .)) %>%
# `.` is now the one-column WANT2 frame; bind it to the right of the original data
bind_cols(data, .)
# STUDENT CAT DOG MOUSE WANT WANT2
#1 1 NA NA 2 2 2
#2 2 NA NA 3 3 3
#3 3 1 2 NA 2 2
#4 4 2 3 NA 2 2
#5 5 3 2 NA 3 3
#6 6 NA NA NA NA NA
#7 7 NA 1 NA NA NA
#8 8 0 NA NA NA NA
Or use data.table with fcoalesce: convert the 'data.frame' to a 'data.table' (setDT(data)), specify the columns of interest in .SDcols, loop over .SD to replace the values 0 and 1 with NA, apply fcoalesce, and assign (:=) the result to create the new column 'WANT2'.
library(data.table)
# Convert `data` to a data.table by reference and add WANT2 in place:
# for each column in .SDcols, replace 0/1 with NA, then pass the cleaned
# columns to fcoalesce() to pick the first non-NA value per row.
setDT(data)[, WANT2 := do.call(fcoalesce, lapply(.SD, function(x)
replace(x, x %in% 0:1, NA))), .SDcols = CAT:MOUSE]
Or, with base R, we can use a vectorized option with row/column indexing to extract the first non-NA element after replacing the values 0 and 1 with NA
# Candidate columns in priority order (positions 2:4 = CAT, DOG, MOUSE).
vals <- as.matrix(data[2:4])
# Treat 0 and 1 as missing so they can never be selected; %in% never yields NA.
vals[vals %in% 0:1] <- NA
# TRUE where a usable value remains.
m1 <- !is.na(vals)
# max.col(..., "first") gives the first usable column in each row. Rows with
# no usable value fall back to column 1, which is NA in the cleaned matrix, so
# extracting from `vals` (not the raw data) needs no post-hoc patching.
data$WANT2 <- vals[cbind(seq_len(nrow(m1)), max.col(m1, "first"))]
Try this:
# For each row, take the first value among CAT/DOG/MOUSE that is neither NA,
# 0 nor 1. Columns are selected by name so that helper columns added earlier
# (e.g. WANT2) cannot leak into the search.
data$Want2 <- apply(data[, c("CAT", "DOG", "MOUSE")], 1, function(x) {
  # which() keeps only TRUE positions; [1] yields NA when nothing qualifies,
  # which avoids the warning raised by min() on an empty vector.
  x[which(!is.na(x) & x != 0 & x != 1)[1]]
})
STUDENT CAT DOG MOUSE WANT Want2
1 1 NA NA 2 2 2
2 2 NA NA 3 3 3
3 3 1 2 NA 2 2
4 4 2 3 NA 2 2
5 5 3 2 NA 3 3
6 6 NA NA NA NA NA
7 7 NA 1 NA NA NA
8 8 0 NA NA NA NA
Related
I need to create a new column named "condition" (which is not there initially) based on the first three columns. If the values are from cond1 then it should be 1 in my condition column and so on. Any suggestions.
cond_test = read.csv("https://www.dropbox.com/s/du76g4vlfz2uaph/cond_test.csv?dl=1")
cond_test
#> ï..cond1 cond2 cond3 condition
#> 1 2 NA NA 1
#> 2 4 NA NA 1
#> 3 NA 3 NA 2
#> 4 NA 5 NA 2
#> 5 NA 4 NA 2
#> 6 NA NA 1 3
#> 7 NA NA 4 3
#> 8 NA NA 7 3
You can use max.col to get first non-NA value in each row.
# Column index of the first non-NA entry in each row. ties.method = "first"
# makes the choice deterministic; the default ("random") picks an arbitrary
# non-NA column when a row has more than one.
max.col(!is.na(cond_test), ties.method = "first")
#[1] 1 1 2 2 2 3 3 3
If you have more than one non-NA value in the row you can look at ties.method argument in ?max.col on how to handle ties.
In dplyr you can use rowwise :
library(dplyr)
# For each row, record the position of the first non-NA column as `condition`.
cond_test %>%
# evaluate the following mutate() one row at a time
rowwise() %>%
# c_across() gathers the row's columns into one vector; which.max() on the
# logical vector returns the index of its first TRUE
mutate(condition = which.max(!is.na(c_across())))
I tried the following code and it is working, but any more elegant solutions are welcome.
cond_test$condition = ifelse(!is.na(cond_test$ï..cond1), 1,
ifelse(!is.na(cond_test$cond2), 2, 3))
Suppose I have this dataframe
# Two columns with complementary NAs: each row has a value in exactly one of
# x and y.
df <- data.frame(
  x = c(1, NA, NA, 4, 5, NA),
  y = c(NA, 2, 3, NA, NA, 6)
)
which looks like this
x y
1 1 NA
2 NA 2
3 NA 3
4 4 NA
5 5 NA
6 NA 6
How can I merge the two columns into one? Basically the NA values are in complementary rows. It would be nice to also obtain (in the process) a flag column containing 0 if the entry comes from x and 1 if the entry comes from y.
We can try using the coalesce function from the dplyr package:
# coalesce() comes from dplyr; it is namespace-qualified so this snippet runs
# even when dplyr has not been attached with library().
df$merged <- dplyr::coalesce(df$x, df$y)
# flag is 1 where y is non-NA (value taken from y) and 0 otherwise.
df$flag <- ifelse(is.na(df$y), 0, 1)
df
x y merged flag
1 1 NA 1 0
2 NA 2 2 1
3 NA 3 3 1
4 4 NA 4 0
5 5 NA 5 0
6 NA 6 6 1
We can also use base R methods with max.col on the logical matrix to get the column index, cbind with row index and extract the values that are not NA
# Work on x and y only, so helper columns already present in df (such as
# merged or flag) cannot be picked up by max.col().
xy <- as.matrix(df[c("x", "y")])
# Index of the first non-NA column per row. "first" makes ties deterministic
# (the default is "random") and prefers x when both are present.
src <- max.col(!is.na(xy), ties.method = "first")
# Matrix indexing with (row, column) pairs extracts one value per row.
df$merged <- xy[cbind(seq_len(nrow(xy)), src)]
# Unary + turns the logical into 0/1: 1 where y is present, 0 otherwise.
df$flag <- +(!is.na(df$y))
df
# x y merged flag
#1 1 NA 1 0
#2 NA 2 2 1
#3 NA 3 3 1
#4 4 NA 4 0
#5 5 NA 5 0
#6 NA 6 6 1
Or we can use fcoalesce from data.table which is written in C and is multithreaded for numeric and factor types.
library(data.table)
# Convert df to a data.table by reference and add both columns in one := call:
# merged = first non-NA of x and y; flag = 1 where y is non-NA, else 0.
setDT(df)[, c('merged', 'flag' ) := .(fcoalesce(x, y), +(!is.na(y)))]
df
# x y merged flag
#1: 1 NA 1 0
#2: NA 2 2 1
#3: NA 3 3 1
#4: 4 NA 4 0
#5: 5 NA 5 0
#6: NA 6 6 1
You can do that using dplyr as follows;
library(dplyr)
# Creating dataframe
df <-
data.frame(
x = c(1, NA, NA, 4, 5, NA),
y = c(NA, 2, 3, NA, NA, 6))
# Add `merged` (x, falling back to y) and `flag` (which column supplied it).
df %>%
# merged: x where it is not NA, otherwise y
mutate(merged = coalesce(x, y),
# flag: 1 when x is NA (value taken from y), otherwise 0
flag = if_else(is.na(x), 1, 0))
# x y merged flag
# 1 NA 1 0
# NA 2 2 1
# NA 3 3 1
# 4 NA 4 0
# 5 NA 5 0
# NA 6 6 1
I want to set the value of column A to NA in each row where column B is 2:
data
A B
1 2
2 4
NA 5
6 2
output
A B
NA 2
2 4
NA 5
NA 2
The first and last rows of B were 2, so A became NA in those rows.
Here's a way using ifelse in base R -
df$A <- ifelse(df$B == 2, NA_real_, df$A)
# Reproducible toy data: five draws (with replacement) from 1:10 per column.
set.seed(0)
df <- data.frame(
  # TRUE is spelled out because T is an ordinary variable that can be reassigned.
  A = sample(1:10, size = 5, replace = TRUE),
  B = sample(1:10, size = 5, replace = TRUE)
)
df
A B
1 9 7
2 4 2
3 7 3
4 1 1
5 2 5
df$A[df$B == 2] <- NA
df
A B
1 9 7
2 NA 2
3 7 3
4 1 1
5 2 5
I am trying to create a function which takes a dataframe and the columns by which I want to sort as arguments. This is what I have come up with:
#' Sort a data frame by one or more columns.
#'
#' @param df A data frame.
#' @param columns Vector of column names (or positions) to sort by, in
#'   priority order.
#' @param decreasing Sort descending instead of ascending? Defaults to `FALSE`.
#' @return `df` with its rows reordered; rows whose key is `NA` are placed last.
sortDf <- function(df, columns, decreasing = FALSE) {
  if (length(columns) == 0L) {
    return(df)
  }
  # order() needs every sort key as its own vector argument. Handing it the
  # sub-data-frame directly (order(df[, columns])) does not sort by each
  # column and is what produced the spurious all-NA rows.
  # unname() keeps a column called e.g. "decreasing" from being matched to
  # one of order()'s own arguments.
  keys <- unname(as.list(df[columns]))
  row_order <- do.call(order, c(keys, list(decreasing = decreasing)))
  df[row_order, , drop = FALSE]
}
This is my usecase:
set.seed(24)
dataset <- matrix(sample(c(NA, 1:5), 25, replace = TRUE), 5)
df <- as.data.frame(dataset)
sortedDf <- sortDf(df, c('V1', 'V2'))
However, I get this as a result:
V1 V2 V3 V4 V5
3 1 1 5 3 4
5 1 5 2 5 2
NA NA NA NA NA NA
NA.1 NA NA NA NA NA
NA.2 NA NA NA NA NA
NA.3 NA NA NA NA NA
1 5 2 1 2 5
4 5 2 1 2 1
NA.4 NA NA NA NA NA
2 NA 4 NA 1 4
The dataframe is kinda sorted but where does the 'NA' come from and how can I remove them? What do I do wrong? I want to sort descending. Thanks in advance.
We can create a different function
# Order the rows of `dat` by the columns listed in `cols` (first entry is the
# primary key). Each selected column is handed to order() as its own argument.
f1 <- function(dat, cols) {
  sort_keys <- dat[cols]
  row_order <- do.call(order, sort_keys)
  dat[row_order, ]
}
f1(df, c("V1", "V2"))
# V1 V2 V3 V4 V5
#2 1 1 2 1 3
#1 1 5 3 5 NA
#5 3 1 1 NA 1
#4 3 4 4 3 NA
#3 4 4 4 NA 4
In the OP's code, the order is applied on a data.frame instead of a vector. It can be used either separately or within do.call i.e.
df[order(df$V1, df$V2),]
# V1 V2 V3 V4 V5
#2 1 1 2 1 3
#1 1 5 3 5 NA
#5 3 1 1 NA 1
#4 3 4 4 3 NA
#3 4 4 4 NA 4
gives the same result as f1. So either the columns can be mentioned individually (which would not be easy when there are many columns), or we can use do.call.
This can also be implemented using the devel version of dplyr (soon to be released 0.6.0) with quosures. After taking the input vector, it is converted to quosures (parse_quosures) and then evaluated by unquoting (!!!) it in arrange
library(dplyr)
# Sort `dat` by the columns named in the character vector `cols`.
f2 <- function(dat, cols) {
  # Turn each column name into a symbol and splice them into arrange().
  # rlang::syms() replaces the deprecated rlang::parse_quosures() and, unlike
  # pasting names into a string to be parsed as code, also copes with
  # non-syntactic column names.
  dat %>%
    arrange(!!!rlang::syms(cols))
}
f2(df, c("V1", "V2"))
# V1 V2 V3 V4 V5
#1 1 1 2 1 3
#2 1 5 3 5 NA
#3 3 1 1 NA 1
#4 3 4 4 3 NA
#5 4 4 4 NA 4
data
set.seed(24)
df <- as.data.frame(matrix(sample(c(NA, 1:5), 25, replace = TRUE), 5))
df <- data.frame(x=c(1,2,1,2,3,3), y = c(letters[1:5],'a'), val = c(1:5, 9))
print(df)
x y val
1 a 1
2 b 2
1 c 3
2 d 4
3 e 5
3 a 9
I want to create a function fun(df, rowname, colname, valname) that takes a dataframe, rowname, colname, and value inputs and returns a data.frame or matrix with row names, column names and values as shown below.
fun(df, "x","y","val") should return
1 2 3
a 1 NA 9
b NA 2 NA
c 3 NA NA
d NA 4 NA
e NA NA 5
The reshape2 library allows this kind of manipulation:
library(reshape2)
# Cast long to wide: one row per value of y, one column per value of x,
# cells filled from val.
dcast(data=df, y~x, value.var = "val")
y 1 2 3
1 a 1 NA 9
2 b NA 2 NA
3 c 3 NA NA
4 d NA 4 NA
5 e NA NA 5