Combine of information based on two dataframes in R - r

This is my sample data
> data.frame
a b c d
W_1_N NA NA NA NA
W_1_E 2 2 2 4
W_1_C 4 2 2 4
W_1_D NA NA NA NA
First I had to combine elements from matrix to get pairs of column names of them, where one of element is 4 and another is 2 in the same row.
In a result it looks like this
W_1_E.1 d a
W_1_E.2 d b
W_1_E.3 d c
W_1_C.1 a b
W_1_C.2 a c
W_1_C.3 d b
W_1_C.4 d c
I wanted only pairs where one element is 4 and other is 2 in the same row. W_1_N and W_1_D have only NA so was ommited. W_1_E appears in 3 rows because there are 3 pairs of (4,2) in row in sample data.W_1_C has 4 pairs.
This is code:
lst=data.frame(df) %>%
rownames_to_column("rn") %>%
drop_na() %>%
gather(key, value, -rn) %>%
group_by(rn, value) %>%
summarise(l = list(unique(key))) %>%
split(.$rn)
pair=do.call("rbind", lapply(lst, function(x) expand.grid(x$l[[1]],
x$l[[2]])))
It works perfectly, but now I have second data.frame:
a b c d
W_1_N 0 1 1 1
W_1_E 1 1 0 0
W_1_C 1 1 1 0
W_1_D 1 0 1 1
Here is my problem, I want to get only this pairs where value of both elements of pair is 1 in second data.frame. For example first pair of my result W_1_E.1 d a should be eliminated because d has value 0 in W_1_E row in second data.frame.
The output should be:
W_1_C.1 a b
W_1_C.2 a c
d has value 0 in W_1_E row, so all rows with W_1_E in my result data.frame were eliminated (all pars were with d). The last two rows were eliminated because d is also 0 in W_1_C row in second dataframe.
Thanks for your help

How's this?
x <- "N a b c d
W_1_N NA NA NA NA
W_1_E 2 2 2 4
W_1_C 4 2 2 4
W_1_D NA NA NA NA "
x1 <- read.table(text = x, header = TRUE)
x <- "N a b c d
W_1_N 0 1 1 1
W_1_E 1 1 0 0
W_1_C 1 1 1 0
W_1_D 1 0 1 1 "
x2 <- read.table(text = x, header = TRUE)
df <- merge(x1, x2, by="N")
df$a <- ifelse(df$a.y == 0,NA,df$a.x)
df$b <- ifelse(df$b.y == 0,NA,df$b.x)
df$c <- ifelse(df$c.y == 0,NA,df$c.x)
df$d <- ifelse(df$d.y == 0,NA,df$d.x)
df <- df[ , c(1,10:13)]
library(tidyr)
df_all <- df %>%
gather(key = key1, value, 2:5)
df2 <- df_all[!is.na(df_all$value) & df_all$value == 2,]
df4 <- df_all[!is.na(df_all$value) & df_all$value == 4,]
merge(df2[,1:2], df4[1:2], by = "N", all.x = FALSE, all.y = FALSE)

Related

Find unique max values for each row in DF with equal numbers

I have a data frame that looks like this:
A <- rep(1, times = 3)
B <- 1:3
C <- c(1,3,2)
DF <- data.frame(A,B,C)
Which makes:
> DF
A B C
1 1 1 1
2 1 2 3
3 1 3 2
I would like to create a new column that indicates the columname in which the max value for each row can be found but only if they are unique, otherwise I would like to give it an NA.
I have tried various options, however this one for example would always use the first column name in which the value was found as the max:
DF$max <- colnames(DF)[max.col(DF, ties.method = "first")]
Reulting in:
A C B
I would like to have
NA C B
You can count the number of max values in each row using rowSums, turn the output to NA if they are more than 1.
col <- colnames(DF)[max.col(DF)]
col[rowSums(DF == do.call(pmax, DF)) > 1] <- NA
DF$max <- col
DF
# A B C max
#1 1 1 1 <NA>
#2 1 2 3 C
#3 1 3 2 B
You can test if the result of ties.method = "first" is equal to the result when ties.method = "last" is used.
i <- max.col(DF, ties.method = "first")
j <- max.col(DF, ties.method = "last")
DF$max <- colnames(DF)[i]
DF$max[i != j] <- NA
DF
# A B C max
#1 1 1 1 <NA>
#2 1 2 3 C
#3 1 3 2 B
We can also use pmap for this purpose:
library(dplyr)
library(purrr)
DF %>%
mutate(Max = pmap_chr(DF, ~ {
x <- c(...)
if(sum(x == max(x, na.rm = TRUE)) > 1) {
NA_character_
} else {
names(DF)[which(x == max(x, na.rm = TRUE))]
}
}
))
A B C Max
1 1 1 1 <NA>
2 1 2 3 C
3 1 3 2 B
We can use
DF$max <- names(DF)[max.col(DF, "first")*NA^(rowSums(DF == do.call(pmax, DF)) > 1)]
DF$max
[1] NA "C" "B"

How to merge elements of atomic vector in R?

I wanted to merge different elements of atomic vectors by elements names stored in list. See example:
ls = list(a = c(a = 1, b = 2, d = 2), b = c(b = 2, c = 3), c = c(a = 1, b = 2))
Now, I wanted to get output like this:
a b c
a 1 NA 1
b 2 2 2
c NA 3 NA
d 2 NA NA
I tried Reduce, but it is not working. I do not want to use any external package for this problem.
Thanks
You can use [ in sapply after you have extracted all elements names.
i <- sort(unique(unlist(lapply(ls, names))))
x <- sapply(ls, "[", i)
rownames(x) <- i
x
# a b c
#a 1 NA 1
#b 2 2 2
#c NA 3 NA
#d 2 NA NA
We could also use bind_rows here
library(dplyr)
library(tibble)
bind_rows(ls, .id = 'x') %>%
column_to_rownames('x') %>%
t
a b c
a 1 NA 1
b 2 2 2
d 2 NA NA
c NA 3 NA
Or using base R
xtabs(values ~ ind + x, do.call(rbind, Map(cbind, x = names(ls), lapply(ls, stack))))
x
ind a b c
a 1 0 1
b 2 2 2
d 2 0 0
c 0 3 0
A data.table option using rbindlist
> t(rbindlist(Map(function(x) data.table(t(x)), lst), fill = TRUE))
[,1] [,2] [,3]
a 1 NA 1
b 2 2 2
d 2 NA NA
c NA 3 NA

Dispatch values in list column to separate columns

I have a data.table with a list column "c":
df <- data.table(a = 1:3, c = list(1L, 1:2, 1:3))
df
a c
1: 1 1
2: 2 1,2
3: 3 1,2,3
I want to create separate columns for the values in "c".
I create a set of new columns F_1, F_2, F_3:
mmax <- max(df$a)
flux <- paste("F", 1:mmax, sep = "_")
df[, (flux) := 0]
df
a c F_1 F_2 F_3
1: 1 1 0 0 0
2: 2 1,2 0 0 0
3: 3 1,2,3 0 0 0
I want to dispatch values in "c" to columns F_1, F_2, F_3 like this:
df
a c F_1 F_2 F_3
1: 1 1 1 0 0
2: 2 1,2 1 2 0
3: 3 1,2,3 1 2 3
What I have tried:
comp_vect <- function(vec, mmax){
vec <- vec %>% unlist()
n <- length(vec)
answr <- c(vec, rep(0, l = mmax -n))
}
df[ , ..flux := mapply(comp_vect, c, mmax)]
The expected data.table is :
> df
a c F_1 F_2 F_3
1: 1 1 1 0 0
2: 2 1,2 1 2 0
3: 3 1,2,3 1 2 3
I followed a radically different approach. I rbinded the list column and then dcasted it, obtaining the desired result. Last part is to set the names.
library(data.table)
df <- data.table(a = 1:3, d = list(1L, c(1L, 2L), c(1L, 2L, 3L)))
df2 <- df[, rbind(d), by = a][, dcast(.SD, a ~ V1, fill = 0)]
setnames(df2, 2:4, flux)[]
a F_1 F_2 F_3
1: 1 1 0 0
2: 2 1 2 0
3: 3 1 2 3
where flux is the variable of names that you defined in your question.
Please notice that avoided using the column name c, as it may be confused with the function c().
Solution :
for(idx in seq(max(sapply(df$c, length)))){ # maximum number of values according to all the elements of the list
set(x = df,
i = NULL,
j = paste0("F_",idx), # column's name
value = sapply(df$c, function(x){
if(is.na(x[idx])){
return(0) # 0 instead of NA
} else {
return(x[idx])
}
})
)
}
Explications :
We can extract the values from a list like this :
sapply(df$c, function(ll) return(ll[1])) # first value
[1] 1 1 1
sapply(df$c, function(ll) return(ll[2])) # second value
[1] NA 2 2
sapply(df$c, function(ll) return(ll[3])) # third value
[1] NA NA 3
We see that if there is no value, we have a NA.
We need an iterator to extract all values at the position idx. For that, we'll find the number of values in each element of df$c (the list) and keep the maximum.
max(sapply(df$c, length))
[1] 3
If we want zeros instead of NAs, we need to create a function in the sapply to convert them :
vec <- c(NA, 5, 1, NA)
> sapply(vec, function(x) if(is.na(x)) return(0) else return(x))
[1] 0 5 1 0

Merge data.frame columns on set number of columns removing na's unless not enough values in row

I'd like to remove the NA values from my columns, merge all columns into four columns, while keeping NA's if there is not 4 values in each row.
Say I have data like this,
df <- data.frame('a' = c(1,4,NA,3),
'b' = c(3,NA,3,NA),
'c' = c(NA,2,NA,NA),
'd' = c(4,2,NA,NA),
'e'= c(NA,5,3,NA),
'f'= c(1,NA,NA,4),
'g'= c(NA,NA,NA,4))
#> a b c d e f g
#> 1 1 3 NA 4 NA 1 NA
#> 2 4 NA 2 2 5 NA NA
#> 3 NA 3 NA NA 3 NA NA
#> 4 3 NA NA NA NA 4 4
My desired outcome would be,
df.desired <- data.frame('a' = c(1,4,3,3),
'b' = c(3,2,3,4),
'c' = c(4,2,NA,4),
'd' = c(1,5,NA,NA))
df.desired
#> a b c d
#> 1 1 3 4 1
#> 2 4 2 2 5
#> 3 3 3 NA NA
#> 4 3 4 4 NA
You could've probably explored a bit more on SO to tweak two answers 1 & 2.
Shifting all the Numbers with NAs
Remove the columns where you've got All NAs
Result:
df <- data.frame('a' = c(1,4,NA,3),
'b' = c(3,NA,3,NA),
'c' = c(NA,2,NA,NA),
'd' = c(4,2,NA,NA),
'e'= c(NA,5,3,NA),
'f'= c(1,NA,NA,4),
'g'= c(NA,NA,NA,4))
df.new<-do.call(rbind,lapply(1:nrow(df),function(x) t(matrix(df[x,order(is.na(df[x,]))])) ))
colnames(df.new)<-colnames(df)
df.new
df.new[,colSums(is.na(df.new))<nrow(df.new)]
Output:
> df.new[,colSums(is.na(df.new))<nrow(df.new)]
a b c d
[1,] 1 3 4 1
[2,] 4 2 2 5
[3,] 3 3 NA NA
[4,] 3 4 4 NA
I believe there are more efficient ways, anyhow that is my try:
x00=sapply(1:nrow(df),function(x) df[x,][!is.na( df[x,])])
x01=lapply(x00,function(x) x=c(x,rep(NA,7-length(x)-1)))
x02=as.data.frame(do.call("rbind",x01))
x02 <- x02[,colSums(is.na(x02))<nrow(x02)]
I have following solution:
df <- data.frame('a' = c(1,4,NA,3),
'b' = c(3,NA,3,NA),
'c' = c(NA,2,NA,NA),
'd' = c(4,2,NA,NA),
'e'= c(NA,5,3,NA),
'f'= c(1,NA,NA,4),
'g'= c(NA,NA,NA,4))
df
x <-list()
for(i in 1:nrow(df)){
x[[i]] <- df[i,]
x[[i]] <- x[[i]][!is.na(x[[i]])]
# x[[i]] <- as.data.frame(x[[i]], stringsAsFactors = FALSE)
x[[i]] <- c(x[[i]], rep(0, 5 -length(x[[i]])))
}
result <- do.call(rbind, x)
result

Replacing 0 values with NA in data.frame conditionally

dat <- data.frame(A=c("name1", "name2", "name3"),
B=c(0,1,0), C=c(0,0,5), D= c(4,4,0), E=c(1,0,0), F=c(4,0,0) )
desiredresult <- data.frame(A=c("name1", "name2", "name3"),
B=c(NA,1,NA), C=c(NA,0,5), D= c(4,4,0), E=c(1,0,NA), F=c(4,NA,NA))
I want to replace 0 values with NA in every row until a positive value is encountered (no negative values in dataset). In addition to that I want to replace all values if their ending are all zeros leaving first 0 in place after last positive value. etc 5,0,0,0 -> 5,0,NA,NA
Provided example data with desiredresult. I was trying approach something like this, but there would need to be 5+ conditions to cover it all. Is there a better way to do this? Maybe with data.table?
dat$B[dat$B == 0 & (dat$C!=0 | dat$D!=0)] <- NA
dat$C[dat$C == 0 & dat$D!=0 & is.na(dat$B)] <- NA
Using the data.table-package, you could approach this as follows:
cols <- names(dat)[2:6]
library(data.table)
setDT(dat)[, (cols) := {x <- unlist(.SD);
x[cumsum(x)==0] <- NA;
l <- c(tail(cumsum(rev(x)),-1),1) == 0;
x[rev(l)] <- NA;
names(x) <- cols;
as.list(x)},
by = A]
you get:
> dat
A B C D E F
1: name1 NA NA 4 1 4
2: name2 1 0 4 0 NA
3: name3 NA 5 0 NA NA
The same kind of thinking, but then with base R:
dl <- as.data.frame(t(dat[,-1]))
idx1 <- cumsum(dl) == 0
idx2 <- sapply(dl, function(x) {
l <- c(tail(cumsum(rev(x)),-1),1) == 0
l[is.na(l)] <- FALSE
rev(l)
})
dl[idx1 | idx2] <- NA
dat[,-1] <- t(dl)
which will get you the same result:
> dat
A B C D E F
1 name1 NA NA 4 1 4
2 name2 1 0 0 4 0
3 name3 NA 5 0 NA NA
New example data:
dat <- data.frame(A=c("name1", "name2", "name3"),
B=c(0,1,0), C=c(0,0,5), D=c(4,0,0), E=c(1,4,0), F=c(4,0,0) )
This should work:
#Apply the first rule: convert 0 to NA until we find a non negative
res1<-t(apply(dat[,-1], 1, function(x) {
xc <- cumsum(x) #cumulative sum
x[xc==0]<-NA #NA where cumulative sum iz 0
x
}))
# Apply the second rule
res2<-t(apply(res1, 1, function(x) {
xc <- cumsum(rev(x)) #reverse the sum
xc<-c(tail(xc,-1),1) # shift the sum
res<-rev(x) #reverse the vector
res[xc==0]<-NA
rev(res)
}))
#Reconstruct the data frame
cbind(data.frame(name=dat[,1]),res2)
# name B C D E F
#1 name1 NA NA 4 1 4
#2 name2 1 0 4 0 NA
#3 name3 NA 5 0 NA NA

Resources