I have 2 dataframes
DF x:
ID A B C
1 x y z
2 x y z
DF y:
ID A B C
1 NA d f
2 e NA NA
I want to join them in such a way that the value of x gets overwritten by the value of y, but only if there is a value in y for the matching column in x.
Hence, the outcome of the above should be:
ID A B C
1 x d f
2 e y z
One option is coalesce
library(dplyr)
# Join on ID, then for every value column prefer the dfy value (suffix ".y")
# and fall back to the dfx value (suffix ".x") where dfy holds NA.
dfx %>%
  left_join(dfy, by = "ID") %>%
  transmute(
    ID,
    A = coalesce(A.y, A.x),
    B = coalesce(B.y, B.x),
    C = coalesce(C.y, C.x)
  )
# ID A B C
#1 1 x d f
#2 2 e y z
Or if there are many columns, reshape it to 'long' format, do the coalesce and then reshape into 'wide' format
library(tidyr)
# Long-format variant: no need to spell out each column by hand.
# After the join the columns are named "<col>.x" / "<col>.y"; splitting the
# names on "." yields one row per (ID, column) with the two sources in the
# columns `x` and `y`.
dfx %>%
  left_join(dfy, by = "ID") %>%
  pivot_longer(
    cols = -ID,
    names_to = c("group", ".value"),
    names_sep = "\\."
  ) %>%
  # keep y where present, otherwise x; then drop the helper column y
  mutate(x = coalesce(y, x), y = NULL) %>%
  pivot_wider(names_from = group, values_from = x)
Or another option is to bind_rows the two datasets and then do a group_by summarise (assuming one row per 'ID')
# Stack dfy on top of dfx so that, within each ID, dfy's row comes first;
# then keep the first non-NA value of every column per ID.
# across() replaces the superseded summarise_at()/vars() interface; inside a
# grouped summarise, everything() already excludes the grouping column.
bind_rows(dfy, dfx) %>%
  group_by(ID) %>%
  summarise(across(everything(), ~ first(.x[!is.na(.x)])))
Or using a join on data.table
library(data.table)
# Value columns to update: every column of dfx except the first one (ID).
nm1 <- names(dfx)[-1]
# Same names with an "i." prefix; inside the join below this prefix addresses
# the columns coming from the table supplied as `i` (here dfx).
nm2 <- paste0("i.", nm1)
# Update join on ID: each value column of dfy keeps its own value and takes the
# matching dfx value only where dfy is NA.
# NOTE: setDT() and `:=` modify dfy by reference, so dfy itself is changed.
# NOTE(review): coalesce() is presumably dplyr's (loaded earlier in this
# answer) -- confirm dplyr is attached before running this snippet on its own.
setDT(dfy)[dfx, (nm1) := Map(coalesce, mget(nm1), mget(nm2)), on = .(ID)]
dfy
data
dfx <- structure(list(ID = 1:2, A = c("x", "x"), B = c("y", "y"), C = c("z",
"z")), class = "data.frame", row.names = c(NA, -2L))
dfy <- structure(list(ID = 1:2, A = c(NA, "e"), B = c("d", NA), C = c("f",
NA)), class = "data.frame", row.names = c(NA, -2L))
Using base R, we can get the index of non-NA values in dfy and replace the corresponding values of dfx with it.
# If dfy's columns are ordered differently from dfx's, align them first:
# dfy <- dfy[names(dfx)]
# (row, column) positions of all non-NA cells in dfy, ignoring the ID column
filled <- which(!is.na(dfy[-1]), arr.ind = TRUE)
# overwrite exactly those cells in dfx with dfy's values
dfx[-1][filled] <- dfy[-1][filled]
dfx
# ID A B C
#1 1 x d f
#2 2 e y z
Related
I want to fill df2 with information from df1.
df1 as below
ID Mutation
1 A
2 B
2 C
3 A
df2 as below
ID A B C
1
2
3
For example, if mutation A is found in ID 1, then I want it in df2 it marked as "Y".
So the df2 result should be
ID A B C
1 Y
2 Y Y
3 Y
I have hundreds of IDs and more than 20 mutations. How can I efficiently achieve this in R? Thanks!
Using data.table you can try
# Reshape the long ID/Mutation table to one row per ID and one column per
# mutation, then mark observed mutations with "Y" and absent ones with " ".
setDT(df)
# value.var is stated explicitly so dcast() does not have to guess it.
df2 <- dcast(df, formula = ID ~ Mutation, value.var = "Mutation")
# Derive the mutation columns from the result instead of hard-coding
# c("A", "B", "C"), so the same code works for any number of mutations.
mutation_cols <- setdiff(names(df2), "ID")
# The replacement is element-wise, so no grouping by ID is needed.
df2[, (mutation_cols) := lapply(.SD, function(x) ifelse(is.na(x), " ", "Y")),
    .SDcols = mutation_cols]
df2
#Output
ID A B C
1: 1 Y
2: 2 Y Y
3: 3 Y
Create a new column with value 'Y' and cast the data in wide format.
library(dplyr)
library(tidyr)
# Tag every observed (ID, Mutation) pair with "Y", then spread the mutations
# into columns; combinations that never occur are filled with "".
pivot_wider(
  mutate(df, value = "Y"),
  names_from = Mutation,
  values_from = value,
  values_fill = ""
)
# ID A B C
# <int> <chr> <chr> <chr>
#1 1 "Y" "" ""
#2 2 "" "Y" "Y"
#3 3 "Y" "" ""
data
df <- structure(list(ID = c(1L, 2L, 2L, 3L), Mutation = c("A", "B",
"C", "A")), class = "data.frame", row.names = c(NA, -4L))
This is my input data
key col_a col_b
a QQQ <NA>
a QQC <NA>
b <NA> ACQ
b <NA> ACQ
I'd like to create this output
key col_a col_b
a 2 0
b 0 1
I tried to do this with length(unique(x$col_a)) but it counts the NA as values
I'm creating this object with data.tables and it comes from an ifelse() statement.
Can I change the value I'm putting in the ifelse statement to something else or count unique ignoring NAs?
For each key we can find out unique values in columns with n_distinct
library(dplyr)
# Per key, count the distinct non-NA values in each of col_a..col_b.
# na.rm is passed inside a lambda: forwarding extra arguments through
# across(...) is deprecated in current dplyr.
df %>%
  group_by(key) %>%
  summarise(across(col_a:col_b, ~ n_distinct(.x, na.rm = TRUE)))
In data.table this can be done as :
library(data.table)
# Per key, count the distinct non-NA values of each column in col_a:col_b.
setDT(df)[, lapply(.SD, function(v) uniqueN(v, na.rm = TRUE)),
          by = key, .SDcols = col_a:col_b]
key col_a col_b
1: a 2 0
2: b 0 1
We can use base R methods for this
# Number of distinct non-NA values in a vector.
count_distinct <- function(x) length(unique(x[!is.na(x)]))
# na.action = NULL keeps rows containing NA so the helper can handle them.
aggregate(. ~ key, data = df, FUN = count_distinct, na.action = NULL)
# key col_a col_b
#1 a 2 0
#2 b 0 1
Or with tidyverse using anonymous function
library(dplyr)
# Same count for every non-grouping column, using an explicit function.
df %>%
  group_by(key) %>%
  summarise(across(everything(), function(col) n_distinct(col, na.rm = TRUE)))
data
df <- structure(list(key = c("a", "a", "b", "b"), col_a = c("QQQ",
"QQC", NA, NA), col_b = c(NA, NA, "ACQ", "ACQ")),
class = "data.frame", row.names = c(NA,
-4L))
I'm trying to do multiple merges/joins onto different columns in the same dataset, but when I do so the output is entirely wrong.
df1 df2
P1 P2 P3 P4 P Output
A B C C 1
A B B 2
E F G H H 3
E E 4
I'm trying to merge df2 onto df1 and the output I would like to get would look like
df3
P1 P2 P3 P4 Output
A B C NA 1
A B NA NA 2
E F G H 3
E NA NA NA 4
I've tried
df3<- merge(df1,df2, by.x = "P1", by.y = "P", all.x = T, all.y = T)
df3<- merge(df1,df2, by.x = "P2", by.y = "P", all.x = T, all.y = T)
df3<- merge(df1,df2, by.x = "P3", by.y = "P", all.x = T, all.y = T)
df3<- merge(df1,df2, by.x = "P4", by.y = "P", all.x = T, all.y = T)
however it doesn't work the way I think it should. Is there an easier function that can cleanly merge like this that I am not aware of?
Based on the output shown, it seems that for each row, we need to get the last non-NA element and do a match with the second data.frame 'P' column to get the corresponding 'Output'. If that is the logic,
# Named lookup vector mapping each P value to its Output. It is built once
# here instead of being rebuilt for every row inside apply().
output_by_p <- setNames(df2$Output, df2$P)
df3 <- df1
# For each row of df1, take the last non-NA entry and look up its Output.
df3$Output <- apply(df1, 1, function(x) output_by_p[tail(x[!is.na(x)], 1)])
Or with tidyverse
library(dplyr)
library(tidyr)
# Number the rows, go long while dropping NA cells, keep the last remaining
# cell of every original row, and use it as the key into df2.
df1 %>%
  mutate(rn = row_number()) %>%
  pivot_longer(cols = -rn, values_drop_na = TRUE) %>%
  group_by(rn) %>%
  filter(row_number() == n()) %>%
  ungroup() %>%
  left_join(df2, by = c("value" = "P")) %>%
  select(Output) %>%
  # put the looked-up Output next to the original columns
  bind_cols(df1, .)
data
df1 <- structure(list(P1 = c("A", "A", "E", "E"), P2 = c("B", "B", "F",
NA), P3 = c("C", NA, "G", NA), P4 = c(NA, NA, "H", NA)), class = "data.frame",
row.names = c(NA,
-4L))
df2 <- structure(list(P = c("C", "B", "H", "E"), Output = 1:4),
class = "data.frame", row.names = c(NA,
-4L))
You can use coalesce from the dplyr package to create a new field in df1 which will be the key between the two datasets.
library(dplyr)
# Build the key column P: coalesce() scans P4, P3, P2, P1 in that order and
# keeps the first non-NA, i.e. the right-most filled column of each row.
df1[["P"]] <- coalesce(df1[["P4"]], df1[["P3"]], df1[["P2"]], df1[["P1"]])
# Join the data frames on P
df3 <- inner_join(df1, df2, by = "P")
# Remove the helper key P from df3
df3[["P"]] <- NULL
>> df3
P1 P2 P3 P4 Output
1 A B C <NA> 1
2 A B <NA> <NA> 2
3 E F G H 3
4 E <NA> <NA> <NA> 4
I have two data frames. dfOne is made like this:
X Y Z T J
3 4 5 6 1
1 2 3 4 1
5 1 2 5 1
and dfTwo is made like this
C.1 C.2
X Z
Y T
I want to obtain a new dataframe containing only the rows where the X, Y, Z and T values are simultaneously greater than a specific threshold.
Example. I need simultaneously (in the same row):
X, Y > 2
Z, T > 4
I need to use the second data frame to reach my objective, I expect something like:
dfTwo$C.1>2
so the result would be a new dataframe with this structure:
X Y Z T J
3 4 5 6 1
How could I do it?
Here is a base R method with Map and Reduce.
# build lookup table of thresholds relative to variable name
# Named threshold vector: one threshold per variable name listed in dat2
vals <- setNames(c(2, 2, 4, 4), unlist(dat2))
# One logical vector per listed column: is the value above its threshold?
above_threshold <- Map(">", dat[names(vals)], vals)
# Keep the rows for which every comparison holds
dat[Reduce("&", above_threshold), ]
X Y Z T J
1 3 4 5 6 1
Here, Map returns a list of length 4 with logical variables corresponding to each comparison. This list is passed to Reduce which returns a single logical vector with length corresponding to the number of rows in the data.frame, dat. This logical vector is used to subset dat.
data
dat <-
structure(list(X = c(3L, 1L, 5L), Y = c(4L, 2L, 1L), Z = c(5L,
3L, 2L), T = c(6L, 4L, 5L), J = c(1L, 1L, 1L)), .Names = c("X",
"Y", "Z", "T", "J"), class = "data.frame", row.names = c(NA,
-3L))
dat2 <-
structure(list(C.1 = structure(1:2, .Label = c("X", "Y"), class = "factor"),
C.2 = structure(c(2L, 1L), .Label = c("T", "Z"), class = "factor")), .Names = c("C.1",
"C.2"), class = "data.frame", row.names = c(NA, -2L))
We can use the purrr package
Here is the input data.
# Data frame from lmo's solution
dat <-
structure(list(X = c(3L, 1L, 5L), Y = c(4L, 2L, 1L), Z = c(5L,
3L, 2L), T = c(6L, 4L, 5L), J = c(1L, 1L, 1L)), .Names = c("X",
"Y", "Z", "T", "J"), class = "data.frame", row.names = c(NA,
-3L))
# A numeric vector to show the threshold values
# Notice that columns without any requirements need NA
vals <- c(X = 2, Y = 2, Z = 4, T = 4, J = NA)
Here is the implementation
library(purrr)
# Column by column: keep a value when it exceeds its threshold (or when the
# column has no threshold, i.e. NA); otherwise blank it out with NA.
thresholded <- map2_dfc(dat, vals, ~ifelse(.x > .y | is.na(.y), .x, NA))
# Rows that received at least one NA failed a condition and are dropped.
na.omit(thresholded)
# A tibble: 1 x 5
X Y Z T J
<int> <int> <int> <int> <int>
1 3 4 5 6 1
map2_dfc loops through each column in dat and each value in vals one by one with a defined function. ~ifelse(.x > .y | is.na(.y), .x, NA) means if the number in each column is larger than the corresponding value in vals, or vals is NA, the output should be the original value from the column. Otherwise, the value is replaced with NA. The output of map2_dfc(dat, vals, ~ifelse(.x > .y | is.na(.y), .x, NA)) is a data frame with NA values in some rows indicating that the condition is not met. Finally, na.omit removes those rows.
Update
Here I demonstrate how to convert the dfTwo data frame to the vals vector in my example.
First, let's create the dfTwo data frame.
dfTwo <- read.table(text = "C.1 C.2
X Z
Y T",
header = TRUE, stringsAsFactors = FALSE)
dfTwo
C.1 C.2
1 X Z
2 Y T
To complete the task, I load the dplyr and tidyr package.
library(dplyr)
library(tidyr)
Now I begin the transformation of dfTwo. The first step is to use stack function to convert the format.
dfTwo2 <- dfTwo %>%
stack() %>%
setNames(c("Col", "Group")) %>%
mutate(Group = as.character(Group))
dfTwo2
Col Group
1 X C.1
2 Y C.1
3 Z C.2
4 T C.2
The second step is to add the threshold information. One way to do this is to create a look-up table showing the association between Group and Value
threshold_df <- data.frame(Group = c("C.1", "C.2"),
Value = c(2, 4),
stringsAsFactors = FALSE)
threshold_df
Group Value
1 C.1 2
2 C.2 4
And then we can use the left_join function to combine the data frame.
# Attach each column's threshold by joining on Group. The lookup table is
# named threshold_df (the original referenced an undefined `threshold_dt`).
dfTwo3 <- dfTwo2 %>% left_join(threshold_df, by = "Group")
dfTwo3
Col Group Value
1 X C.1 2
2 Y C.1 2
3 Z C.2 4
4 T C.2 4
Now it is the third step. Notice that there is a column called J which does not need any threshold. So we need to add this information to dfTwo3. We can use the complete function from tidyr. The following code completes the data frame by adding Col in dat but not in dfTwo3 and NA to the Value.
dfTwo4 <- dfTwo3 %>% complete(Col = colnames(dat))
dfTwo4
# A tibble: 5 x 3
Col Group Value
<chr> <chr> <dbl>
1 J <NA> NA
2 T C.2 4
3 X C.1 2
4 Y C.1 2
5 Z C.2 4
The fourth step is to arrange dfTwo4 in the right order. We can achieve this by turning Col into a factor and assigning the levels based on the order of the column names in dat.
dfTwo5 <- dfTwo4 %>%
mutate(Col = factor(Col, levels = colnames(dat))) %>%
arrange(Col) %>%
mutate(Col = as.character(Col))
dfTwo5
# A tibble: 5 x 3
Col Group Value
<chr> <chr> <dbl>
1 X C.1 2
2 Y C.1 2
3 Z C.2 4
4 T C.2 4
5 J <NA> NA
We are almost there. Now we can create vals from dfTwo5.
vals <- dfTwo5$Value
names(vals) <- dfTwo5$Col
vals
X Y Z T J
2 2 4 4 NA
Now we are ready to use the purrr package to filter the data.
The above is a breakdown of the steps. We can combine all these steps into the following code for simplicity.
library(dplyr)
library(tidyr)
# Lookup table: the threshold that applies to each group (column of dfTwo).
threshold_df <- data.frame(Group = c("C.1", "C.2"),
Value = c(2, 4),
stringsAsFactors = FALSE)
# Turn dfTwo into one row per variable name, attach thresholds, and order the
# rows like the columns of dat.
dfTwo2 <- dfTwo %>%
# long format: one row per (variable name, dfTwo column)
stack() %>%
setNames(c("Col", "Group")) %>%
# stack() returns the group indicator as a factor; make it character for the join
mutate(Group = as.character(Group)) %>%
# add the threshold Value of each Group
left_join(threshold_df, by = "Group") %>%
# add the columns of dat that have no threshold (their Value becomes NA)
complete(Col = colnames(dat)) %>%
# sort the rows in the column order of dat, then convert Col back to character
mutate(Col = factor(Col, levels = colnames(dat))) %>%
arrange(Col) %>%
mutate(Col = as.character(Col))
# Named threshold vector, one entry per column of dat
vals <- dfTwo2$Value
names(vals) <- dfTwo2$Col
# Row indices passing each individual condition
passing_rows <- list(
  which(dfOne["X"] > 2),
  which(dfOne["Y"] > 2),
  which(dfOne["Z"] > 4),
  which(dfOne["T"] > 4)
)
# Keep only the rows that pass all of them
dfOne[Reduce(intersect, passing_rows), ]
# X Y Z T J
#1 3 4 5 6 1
Or iteratively (so fewer inequalities are tested):
# Named thresholds, as in lmo's answer
vals <- c(X = 2, Y = 2, Z = 4, T = 4)
# For every thresholded column, the row indices exceeding its threshold
rows_per_column <- lapply(names(vals), function(col) which(dfOne[col] > vals[col]))
# Rows common to all columns
dfOne[Reduce(intersect, rows_per_column), ]
# X Y Z T J
#1 3 4 5 6 1
I'm writing this assuming that the second DF is meant to categorize the fields in the first DF. It's way simpler if you don't need to use the second one to define the conditions:
# Keep the rows where all four conditions hold at once; with() lets the
# column names be used directly.
dfNew <- with(dfOne, dfOne[X > 2 & Y > 2 & Z > 4 & T > 4, ])
Or, using dplyr:
library(dplyr)
# filter() combines comma-separated conditions with a logical AND
dfNew <- filter(dfOne, X > 2, Y > 2, Z > 4, T > 4)
In case that's all you need, I'll save this comment while I poke at the more complicated version of the question.
I have the following input
#mydata
ID variable1 variable2
1 a,b,c,d c,a
2 g,f,h h
3 p,l,m,n,c c,l
I wish to remove the comma-separated items of variable2 from variable1 in each row, and I'd like to have the following output:
#Output
ID Output
1 b,d
2 g,f
3 p,m,n
#dput
structure(list(ID = 1:3, variable1 = structure(1:3, .Label = c("a,b,c,d",
"g,f,h", "p,l,m,n,c"), class = "factor"), variable2 = structure(c(1L,
3L, 2L), .Label = c("c,a", "c,l", "h"), class = "factor")), .Names = c("ID",
"variable1", "variable2"), class = "data.frame", row.names = c(NA,
-3L))
You can try,
# Row-wise set difference: split both comma-separated columns and keep the
# items of variable1 that do not occur in variable2. Returns a list with one
# character vector per row. (The original had a stray ")" after the first
# strsplit() call, which made the expression unparseable.)
Map(setdiff,
    strsplit(as.character(df$variable1), ","),
    strsplit(as.character(df$variable2), ","))
We can use Map after splitting each of the columns by , get the setdiff, paste them together, set the names of the list output with 'ID' column, stack it to 'data.frame' and set the names to 'ID' and 'Output' for the columns.
# Per row: items of variable1 not present in variable2, pasted with ", "
remaining <- Map(function(x, y) toString(setdiff(x, y)),
                 strsplit(as.character(df1$variable1), ","),
                 strsplit(as.character(df1$variable2), ","))
# Name the list by ID, stack it into a two-column data.frame, swap the
# column order and label the columns
setNames(stack(setNames(remaining, df1$ID))[2:1], c("ID", "Output"))
# ID Output
#1 1 b, d
#2 2 g, f
#3 3 p, m, n
Or a compact option would be
library(splitstackshape)
# Split columns 2 and 3 on "," into long format, then per ID keep the
# variable1 items that are not among the variable2 items.
cSplit(df1, splitCols = 2:3, sep = ",", direction = "long")[
  , .(Output = toString(setdiff(variable1, variable2))), by = ID]
# ID Output
#1: 1 b, d
#2: 2 g, f
#3: 3 p, m, n
Using grepl instead of setdiff
library(stringr)
# Split both comma-separated columns into lists of items
a1 <- str_split(d$variable1, ",")
a2 <- str_split(d$variable2, ",")
# For each row, drop the items of variable1 that equal an item of variable2.
# The alternation is anchored with ^(...)$ so that only whole items match;
# an unanchored pattern would also drop any item that merely contains one of
# the removed items as a substring (e.g. removing "c" would drop "abc").
# NOTE: items are still interpreted as regular expressions, so items that
# contain regex metacharacters would need escaping.
do.call("rbind", Map(function(x, y) {
  pattern <- paste0("^(", paste(y, collapse = "|"), ")$")
  paste(x[!grepl(pattern, x)], collapse = ",")
}, a1, a2))
[,1]
[1,] "b,d"
[2,] "g,f"
[3,] "p,m,n"
Using Dplyr
# For each row, split both comma-separated fields, keep the variable1 items
# that are absent from variable2, and join them back with commas.
# The pipe must end a line rather than start the next one: a line beginning
# with %>% is a syntax error in R.
mydata %>%
  rowwise() %>%
  mutate(output = paste0(setdiff(strsplit(as.character(variable1), split = ",")[[1]],
                                 strsplit(as.character(variable2), ",")[[1]]),
                         collapse = ",")) %>%
  select(ID, output)
output:
ID output
(int) (chr)
1 1 b,d
2 2 g,f
3 3 p,m,n