I have two data frames, and I want to write code that checks whether a row from data frame 1 exists in data frame 2 and, if so, replaces the row(s) in data frame 1 with the corresponding row(s) from data frame 2. Here is an example:
dataframe1:
name A B
AA 1 1
BB 1 0
CC 0 1
dataframe2:
name A B
AA 1 2
DD 1 3
EE 4 1
I want to switch rows between both dataframes, and the outcome will be:
dataframe1:
name A B
AA 1 2
BB 1 0
CC 0 1
To clarify, I want row AA in dataframe1 to be replaced by row AA from dataframe2.
This is what I tried to do:
df1[which(df1$name %in% df2$name)[1:nrow(df2)],] <- df2
And:
df1$name[match(df2$name,df1$name)] <- df2$name
Both didn't work unfortunately.
Thanks for helping!
Does this work:
df1
name A B
1 AA 1 1
2 BB 1 0
3 CC 0 1
df2
name A B
1 AA 1 2
2 DD 1 3
3 EE 4 1
df2$name %in% df1$name
[1] TRUE FALSE FALSE
# Pair rows by name rather than by position, so each df1 row receives the
# df2 row with the same name even if the shared names are ordered differently.
shared <- df1$name %in% df2$name
df1[shared, ] <- df2[match(df1$name[shared], df2$name), ]
df1
name A B
1 AA 1 2
2 BB 1 0
3 CC 0 1
The natural_join is the function you are looking for
library(rqdatatable)
dataframe1 <- data.frame(
name = c('AA', 'BB', 'CC'),
A = c(1,1,0),
B = c(1,0,1)
)
dataframe2 <- data.frame(
name = c('AA', 'DD', 'EE'),
A = c(1,1,4),
B = c(2,3,1)
)
natural_join(dataframe2, dataframe1, by = "name",
jointype = 'RIGHT')
You can make an update join:
i <- match(df1$name, df2$name)
j <- which(!is.na(i))
df1[j,] <- df2[i[j],]
df1
# name A B
#1 AA 1 2
#2 BB 1 0
#3 CC 0 1
Data:
df1 <- data.frame(name = c("AA","BB","CC"), A = c(1,1,0), B = c(1,0,1))
df2 <- data.frame(name = c("AA","DD","EE"), A = c(1,1,4), B = c(2,3,1))
A dplyr way using across, left_join and coalesce
library(dplyr, warn.conflicts = F)
df1 <- data.frame(name = c("AA","BB","CC"), A = c(1,1,0), B = c(1,0,1))
df2 <- data.frame(name = c("AA","DD","EE"), A = c(1,1,4), B = c(2,3,1))
df1 %>% left_join(df2, by = 'name') %>%
  # For every joined '.y' column take the df2 value and fall back to the
  # matching '.x' column (from df1) where df2 had no row for that name.
  # The suffix pattern '[.]y$' matches only a literal trailing ".y"; the
  # unanchored '.y' would also hit any character followed by "y".
  mutate(across(ends_with('.y'), ~coalesce(., get(sub('[.]y$', '.x', cur_column()))),
                .names = "{sub('[.]y$', '', .col)}"), .keep = 'unused')
#> name A B
#> 1 AA 1 2
#> 2 BB 1 0
#> 3 CC 0 1
Created on 2021-07-06 by the reprex package (v2.0.0)
Related
I'm trying to add a named row between two data.frames d1 and d2. My Desired_output is shown below.
I have tried a solution but it failed to get me to my desired output. Is there a solution to this?
d1 <- data.frame(b = 1:2, SE = 2:3)
d2 <- data.frame(b = 0:1, SE = 1:2)
a <- "obs"
# Solution failed:
dplyr::bind_rows(d1, !!a := rep(NA, ncol(d1)), d2)
Desired_output =
" b SE
1 1 2
2 2 3
obs NA NA
4 0 1
5 1 2"
In base R, we may also do
rbind(d1, `row.names<-`(d1[NA,][1,], a),
`row.names<-`(d2, nrow(d1) + seq_len(nrow(d2))))
-output
b SE
1 1 2
2 2 3
obs NA NA
3 0 1
4 1 2
Maybe this will help -
d1 <- data.frame(b = 1:2, SE = 2:3)
d2 <- data.frame(b = 0:1, SE = 1:2)
a <- "obs"
d3 <- d1[1, ]
d3[] <- NA
rownames(d3) <- a
rbind(d1, d3, d2)
# b SE
#1 1 2
#2 2 3
#obs NA NA
#11 0 1
#21 1 2
Here is an alternative solution, in case you want it in a pipe!
We wrap the whole procedure around add_row from the tibble package.
With bind_rows we bind both tables together and add a row before index2.
Then we have to change between rownames_to_column and vice versa.
library(tibble)
library(dplyr)
# Stack d1 and d2, insert an all-NA row before position 3, then rename that
# row from "3" to "obs" by round-tripping the row names through a column.
add_row(bind_rows(d1, d2),
        b = NA,
        SE = NA,
        .before = 3) %>%
  data.frame() %>%
  rownames_to_column("x") %>%
  mutate(x = ifelse(x == "3", "obs", x)) %>%
  column_to_rownames("x")
b SE
1 1 2
2 2 3
obs NA NA
4 0 1
5 1 2
I did some searching but could not find an obvious answer to this question, so hopefully it's not a duplicate. I have a data frame that looks like this:
X1 X2 V1 V2 V3 ... Vn
A B 0 1 2 1
B C 1 0 1 0
A C 2 1 0 1
What I want to achieve is to replace V1 to Vn values to the "dosage" of X2. So for row 1 (each row may have different values of X1 and X2),
if the value is 0, I want to replace it to AA;
if the value is 1, I want to replace it to AB;
if the value is 2, I want to replace it to BB;
The expected outcome is:
X1 X2 V1 V2 V3 ... Vn
A B AA AB BB AB
B C BC BB BC BB
A C CC AC AA AC
Here is the sample data frame:
df=data.frame(X1=c("A","B","A"),
X2=c("B","C","C"),
V1=c(0,1,2),
V2=c(1,0,1),
V3=c(2,1,0))
Thanks for the help!
This is inspired by @Matt's answer. We can use mutate_at with paste0 to achieve this task.
## Load packages
library(dplyr)
dat2 <- dat %>%
mutate_at(vars(-X1, -X2), .funs = list(
~case_when(
. == 0 ~paste0(X1, X1),
. == 1 ~paste0(X1, X2),
. == 2 ~paste0(X2, X2),
TRUE ~NA_character_
)
))
dat2
# X1 X2 V1 V2 V3 Vn
# 1 A B AA AB BB AB
# 2 B C BC BB BC BB
# 3 A C CC AC AA AC
DATA
dat <- read.table(text = "X1 X2 V1 V2 V3 Vn
A B 0 1 2 1
B C 1 0 1 0
A C 2 1 0 1 ",
stringsAsFactors = FALSE, header = TRUE)
With your actual df you can replace V1:V3 with V1:Vn.
Using your reprex, you can do:
library(dplyr)
df %>%
  mutate_at(
    vars(V1:V3),
    # Build each label from the row's own X1/X2 values instead of fixed
    # letters: dosage 0 -> X1X1, 1 -> X1X2, 2 -> X2X2; anything else is NA.
    list(~ case_when(
      . == 0 ~ paste0(X1, X1),
      . == 1 ~ paste0(X1, X2),
      . == 2 ~ paste0(X2, X2),
      TRUE ~ NA_character_
    ))
  )
It's not an elegant solution, but for the sake of completeness: just nest two for-loops
# Walk every dosage cell (columns 3 onward) and replace its code with the
# two-letter label built from that row's first two columns.
# seq_len() keeps both loops empty for a zero-row frame or a frame with no
# dosage columns, where 1:0 / 3:2 would iterate backwards.
for (i in seq_len(nrow(df))) {
  for (j in seq_len(ncol(df))[-(1:2)]) {
    dose <- df[i, j]
    # A missing dosage cannot be compared in `if`; leave the cell as NA.
    if (is.na(dose)) {
      next
    }
    if (dose == 0) {
      df[i, j] <- paste0(df[i, 1], df[i, 1])
    } else if (dose == 1) {
      df[i, j] <- paste0(df[i, 1], df[i, 2])
    } else if (dose == 2) {
      df[i, j] <- paste0(df[i, 2], df[i, 2])
    }
  }
}
Sorry for that.
Use spread and gather
library(tidyverse)
df <- tibble(X1=c("A","B","A"),
X2=c("B","C","C"),
V1=c(0,1,2),
V2=c(1,0,1),
V3=c(2,1,0))
Capture your from:to translation
transl <- tibble(DOSE = c(0,1,2),
OUTCOME = c("AA", "AB", "AC"))
Then Gather your values into long form
longTbl <- df %>%
gather(key = "TheV", value = "DOSE", na.rm = TRUE,starts_with("V")) %>%
left_join(transl, by = "DOSE") %>%
select(- DOSE)
# A tibble: 9 x 4
X1 X2 TheV OUTCOME
<chr> <chr> <chr> <chr>
1 A B V1 AA
2 B C V1 AB
3 A C V1 AC
4 A B V2 AB
5 B C V2 AA
6 A C V2 AB
7 A B V3 AC
8 B C V3 AB
9 A C V3 AA
You might be better leaving it there. But we can pivot it back with spread.
widTbl <- longTbl %>%
spread(TheV, OUTCOME )
widTbl
# A tibble: 3 x 5
X1 X2 V1 V2 V3
<chr> <chr> <chr> <chr> <chr>
1 A B AA AB AC
2 A C AC AB AA
3 B C AB AA AB
And Bob's your uncle.
I have two data frames as below and am trying to improve my code so that the Letters column in df1 is replaced with the Letters2 column from df2 wherever the IDs match.
df1 <- data.frame(ID = c(1,3,2,4,5), Letters = LETTERS[1:5], stringsAsFactors = F)
df2 <- data.frame(ID = c(1,3,4), Letters2 = "F", stringsAsFactors = F)
desired:
ID letters
1 F
2 B
3 F
4 D
5 F
It would be like doing the following by in one line:
# Left-merge df2 onto df1, take Letters2 wherever an ID matched, then drop
# the helper column. Column names are case-sensitive: Letters / Letters2.
desired <- merge(df1, df2, by = "ID", all.x = TRUE)
desired$Letters <- ifelse(is.na(desired$Letters2), desired$Letters, desired$Letters2)
desired$Letters2 <- NULL
Try this:
library(tidyverse)
df1 %>%
  left_join(df2) %>%
  # df2's column is `Letters2` (capital L): use it where the ID matched,
  # otherwise keep the original `Letters`, then drop the helper column.
  mutate(Letters = coalesce(Letters2, Letters), Letters2 = NULL)
Joining, by = "ID"
ID Letters
1 1 F
2 2 B
3 3 F
4 4 F
5 5 E
We could use the numeric 'ID' to locate the rows whose 'Letters' values should be replaced by those of 'Letters2' (which are all 'F's)
# Locate each df2 ID's row in df1 with match(): df1$ID is c(1, 3, 2, 4, 5),
# so the ID values themselves are not row positions. IDs absent from df1
# are skipped.
hit <- match(df2$ID, df1$ID)
df1$Letters[hit[!is.na(hit)]] <- df2$Letters2[!is.na(hit)]
df1
# ID Letters
#1 1 F
#2 3 F
#3 2 C
#4 4 F
#5 5 E
Or using data.table
library(data.table)
setDT(df1)[df2, Letters := Letters2, on = .(ID)]
df1
# ID Letters
#1: 1 F
#2: 3 F
#3: 2 C
#4: 4 F
#5: 5 E
Suppose I have a dataframe like this
testtbl <- data.frame(ID = c('1','2','3','4'),
A = c(1,0,1,1),
B = c(1,1,1,1),
C = c(0,0,0,1),
D = c(0,1,1,1))
> testtbl
ID A B C D
1 1 1 1 0 0
2 2 0 1 0 1
3 3 1 1 0 1
4 4 1 1 1 1
Where columns A-D are flags that can either be 1 or 0. I would like to consolidate these columns into 1 column, where the new dataframe would look something like:
> testtbl
ID flag
1 1 A,B
2 2 B,D
3 3 A,B,D
4 4 A,B,C,D
I'm a little confused about how to approach this and would appreciate any hints or help.
A solution from dplyr and tidyr.
library(dplyr)
library(tidyr)
testtbl2 <- testtbl %>%
gather(Col, Val, -ID) %>%
filter(Val == 1) %>%
select(-Val) %>%
group_by(ID) %>%
summarise(flag = toString(Col))
testtbl2
# # A tibble: 4 x 2
# ID flag
# <fctr> <chr>
# 1 1 A, B
# 2 2 B, D
# 3 3 A, B, D
# 4 4 A, B, C, D
You can also do it without any libraries with an apply:
# Example input: one ID column followed by 0/1 flag columns A-D.
testtbl <- data.frame(
  ID = c('1', '2', '3', '4'),
  A = c(1, 0, 1, 1),
  B = c(1, 1, 1, 1),
  C = c(0, 0, 0, 1),
  D = c(0, 1, 1, 1)
)
# For every row, keep the names of the flag columns equal to 1 and join them
# with commas into a single `flag` string.
test <- data.frame(
  ID = testtbl$ID,
  flag = apply(
    testtbl[, -1], 1,
    function(row) {
      is_set <- which(row == 1)
      paste0(names(row)[is_set], collapse = ',')
    }
  )
)
Base R
do.call(rbind, lapply(split(testtbl, testtbl$ID), function(x)
data.frame(ID = x[1],
flag = paste(sort(names(x)[-1][x[-1] > 0]),
collapse = ","))))
# ID flag
#1 1 A,B
#2 2 B,D
#3 3 A,B,D
#4 4 A,B,C,D
Please help. I have a data frame like the following:
df <- data.frame("G"=c(1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7),
"C"=c(1,0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,1,0,0,1,0),
"SKU"=c("a","b","c","a","c","d","a","c","d","a","b","c","a","b","c","b","c","d","a","b","c"))
df
G C SKU
1 1 a
1 0 b
1 0 c
2 0 a
2 1 c
2 0 d
3 1 a
3 0 c
3 0 d
4 0 a
4 1 b
4 0 c
5 1 a
5 0 b
5 0 c
6 0 b
6 1 c
6 0 d
7 0 a
7 1 b
7 0 c
I want to find unique "blocks" in this data frame. For example, here we have three blocks: (a,b,c) , (a,c,d) and (b,c,d). I want to create table with these unique blocks, and to sum variable "C" in all "G" for each SKU in certain block. And finally get data frame like this:
New_G SKU New_C
1 1 a 2
2 1 b 2
3 1 c 0
4 2 a 1
5 2 c 1
6 2 d 0
7 3 b 0
8 3 c 1
9 3 d 0
As I said, there are three unique blocks here; New_G is the identifier of each block, and New_C is the sum of "C" for each SKU within that block. (For example, see the first row: SKU = "a", New_C = 2 means that in the old data frame SKU "a", while in block (a,b,c), has variable "C" = 1 two times.) (Another example: see the fourth row. Again SKU = "a", but New_C = 1 means that in the old data frame SKU "a", while in block (a,c,d), has variable "C" = 1 once.)
If my question is not clear, please let me know.
You can create an index with toString and use a simple sum by group from there. The challenge is getting the index of unique groups:
# Label every G with its comma-separated SKU list and number the distinct lists.
ind <- df %>% group_by(G) %>% summarise(temp = toString(SKU)) %>% mutate(fac = as.numeric(as.factor(temp)))
# Look up each row's block number through its G value, so this works for any
# group size and row order (not only sorted groups of exactly three rows).
df$ind <- ind$fac[match(df$G, ind$G)]
df %>% group_by(ind, SKU) %>% summarise(New_C = sum(C))
# Source: local data frame [9 x 3]
# Groups: ind [?]
#
# ind SKU New_C
# (dbl) (fctr) (dbl)
# 1 1 a 2
# 2 1 b 2
# 3 1 c 0
# 4 2 a 1
# 5 2 c 1
# 6 2 d 0
# 7 3 b 0
# 8 3 c 1
# 9 3 d 0
Edit
This is possibly quicker:
df %>% group_by(G) %>%
mutate(temp=toString(SKU)) %>%
group_by(temp, SKU) %>%
summarise(New_C = sum(C))
Using dplyr:
library(dplyr)
df %>%
group_by(G) %>%
summarize(bin = paste(SKU, collapse=',')) %>%
left_join(df, by=c('G' = 'G')) %>%
group_by(bin, SKU) %>%
summarize(New_C = sum(C))
Output:
bin SKU New_C
(chr) (fctr) (dbl)
1 a,b,c a 2
2 a,b,c b 2
3 a,b,c c 0
4 a,c,d a 1
5 a,c,d c 1
6 a,c,d d 0
7 b,c,d b 0
8 b,c,d c 1
9 b,c,d d 0
Here is a solution in base R.
# One concatenated SKU string per distinct G, in order of first appearance.
Grp <- vapply(unique(df$G), function(x) paste(df$SKU[which(df$G == x)], collapse = ""), "abc", USE.NAMES = FALSE)
# Key every row as "<SKU> <block>". The block is found by matching the row's
# G against unique(df$G); indexing Grp with the raw G value is only right
# when G is exactly 1, 2, ..., k in order of appearance.
ID <- vapply(seq_len(nrow(df)), function(x) paste(df$SKU[x], Grp[match(df$G[x], unique(df$G))], collapse = ""), "a abc", USE.NAMES = FALSE)
UniG <- unique(Grp)
# Repeat each block number once per SKU; nchar() assumes single-character SKUs.
New_G <- do.call(c, lapply(seq_along(UniG), function(x) rep(x, nchar(UniG[x]))))
# Sum C over the rows that share a key; SKU is recovered from the key's first token.
Newdf <- data.frame(New_G, t(sapply(unique(ID), function(x) list(SKU = strsplit(x, split = " ")[[1]][1], New_C = sum(df$C[which(ID == x)])), USE.NAMES = FALSE)))
> Newdf
New_G SKU New_C
1 1 a 2
2 1 b 2
3 1 c 0
4 2 a 1
5 2 c 1
6 2 d 0
7 3 b 0
8 3 c 1
9 3 d 0
The dplyr solutions provided by Pierre Lafortune and Edward R. Mazurek are much faster. The BuildRandomDF below builds a data frame very similar to the one the OP posted.
library(gtools)
# Build a reproducible random test frame shaped like the question's data.
#
# n: number of groups to generate.
# Returns a data frame with columns G (group number 1..n, repeated once per
# SKU), C (random 0/1) and SKU (a random permutation of the first 3, 4 or 5
# letters for each group).
# Note: calls set.seed(117), so the session's RNG state is reset on each call.
# Uses permutations() from gtools.
BuildRandomDF <- function(n) {
  set.seed(117)
  # Size (3, 4 or 5 SKUs) of every group.
  samp1 <- sample(3:5, n, replace = TRUE)
  Len5 <- sum(samp1 == 5)
  Len4 <- sum(samp1 == 4)
  Len3 <- sum(samp1 == 3)
  perm5 <- permutations(5, 5, letters[1:5])
  perm4 <- permutations(4, 4, letters[1:4])
  perm3 <- permutations(3, 3, letters[1:3])
  # Draw order (5, then 4, then 3) is kept so the random stream is unchanged.
  sampPerm5 <- sample(nrow(perm5), Len5, replace = TRUE)
  sampPerm4 <- sample(nrow(perm4), Len4, replace = TRUE)
  sampPerm3 <- sample(nrow(perm3), Len3, replace = TRUE)
  G <- rep(seq_len(n), times = samp1)
  # Running count of groups of each size: position x holds the index of the
  # pre-drawn permutation that group x consumes. This replaces counters
  # that were incremented with `<<-` inside the loop.
  pos3 <- cumsum(samp1 == 3)
  pos4 <- cumsum(samp1 == 4)
  pos5 <- cumsum(samp1 == 5)
  # seq_len() keeps the loop empty for n = 0; as.character() turns the
  # resulting NULL into character(0) so the SKU column still exists.
  SKU <- as.character(unlist(lapply(seq_len(n), function(x) {
    if (samp1[x] == 3) {
      perm3[sampPerm3[pos3[x]], ]
    } else if (samp1[x] == 4) {
      perm4[sampPerm4[pos4[x]], ]
    } else {
      perm5[sampPerm5[pos5[x]], ]
    }
  })))
  C <- sample(0:1, length(SKU), replace = TRUE)
  data.frame(G, C, SKU)
}
Below are the functions:
library(dplyr)
# Sum C per SKU within each distinct block of SKUs.
#
# df: data frame with columns G, C and SKU.
# Each G is tagged with the toString() of its SKUs (`temp`); the result has
# one row per (temp, SKU) pair with New_C = sum of C.
DplyrTest <- function(df) {
  per_group <- group_by(df, G)
  tagged <- mutate(per_group, temp = toString(SKU))
  per_block_sku <- group_by(tagged, temp, SKU)
  summarise(per_block_sku, New_C = sum(C))
}
# Sum C per SKU within each distinct block of SKUs, via a join.
#
# df: data frame with columns G, C and SKU.
# First collapses each G's SKUs into a comma-separated `bin`, joins that label
# back onto the rows of df by G, then totals C for every (bin, SKU) pair.
DplyrCheck2 <- function(df) {
  bins <- summarize(group_by(df, G), bin = paste(SKU, collapse = ','))
  labelled <- left_join(bins, df, by = c('G' = 'G'))
  summarize(group_by(labelled, bin, SKU), New_C = sum(C))
}
# Base-R version: label each G by the concatenation of its SKUs, then sum C
# for every (SKU, block) pair.
#
# df: data frame with columns G (group id), C (numeric) and SKU.
# Returns a data frame with New_G (index of each distinct block, in order of
# first appearance) plus list-columns SKU and New_C, one row per pair.
# New_G is sized with nchar(), so SKUs are assumed to be single characters.
BaseTest <- function(df) {
  groups <- unique(df$G)
  # One concatenated SKU string per distinct G, in order of first appearance.
  Grp <- vapply(groups, function(x) paste(df$SKU[which(df$G == x)], collapse = ""), "abc", USE.NAMES = FALSE)
  # Key every row as "<SKU> <block>". The block is looked up by matching the
  # row's G against `groups`; indexing Grp with the raw G value is only right
  # when G is exactly 1, 2, ..., k in order of appearance.
  ID <- vapply(seq_len(nrow(df)), function(x) paste(df$SKU[x], Grp[match(df$G[x], groups)], collapse = ""), "a abc", USE.NAMES = FALSE)
  UniG <- unique(Grp)
  # Repeat each block number once per SKU in that block.
  New_G <- do.call(c, lapply(seq_along(UniG), function(x) rep(x, nchar(UniG[x]))))
  # Sum C over the rows sharing a key; SKU is recovered from the key's first token.
  Newdf <- data.frame(New_G, t(sapply(unique(ID), function(x) list(SKU = strsplit(x, split = " ")[[1]][1], New_C = sum(df$C[which(ID == x)])), USE.NAMES = FALSE)))
  Newdf
}
Below are the timings:
df <- BuildRandomDF(10^4)
# The first dplyr function defined above is DplyrTest(); no DplyrCheck() exists.
system.time(df1 <- DplyrTest(df))
user system elapsed
0.43 0.00 0.43
system.time(df2 <- DplyrCheck2(df))
user system elapsed
0.39 0.00 0.39
system.time(df3 <- BaseTest(df))
user system elapsed
5.15 0.00 5.19
all(sort(unlist(df3$New_C))==sort(df1$New_C))
[1] TRUE
all(sort(df1$New_C)==sort(df2$New_C))
[1] TRUE