R - Merge and Replace Column If ID Found on Another Data Frame - r

I have two data frames as below and am trying to improve my code so that the Letters column in df1 is replaced with the Letters2 column from df2 wherever the IDs match.
# Example data: df1 holds the original letters, df2 the replacement letters keyed by ID.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable variables.
df1 <- data.frame(ID = c(1, 3, 2, 4, 5), Letters = LETTERS[1:5], stringsAsFactors = FALSE)
df2 <- data.frame(ID = c(1, 3, 4), Letters2 = "F", stringsAsFactors = FALSE)
desired:
ID Letters
1 F
2 C
3 F
4 F
5 E
It would be like doing the following, but in one line:
# Left join: keep every row of df1; IDs without a match in df2 get NA in Letters2.
desired <- merge(df1, df2, by = "ID", all.x = TRUE)
# Column names are case-sensitive: the columns are Letters/Letters2, not letters/letters2.
# Take the replacement letter where one exists, otherwise keep the original.
desired$Letters <- ifelse(is.na(desired$Letters2), desired$Letters, desired$Letters2)
desired$Letters2 <- NULL

Try this:
library(tidyverse)
# Keep every row of df1 (in its original order); unmatched IDs get NA in Letters2.
df1 %>%
  left_join(df2, by = "ID") %>%
  # coalesce() takes the replacement where present, otherwise the original letter;
  # assigning NULL drops the helper column.
  mutate(Letters = coalesce(Letters2, Letters), Letters2 = NULL)
#   ID Letters
# 1  1       F
# 2  3       F
# 3  2       C
# 4  4       F
# 5  5       E

We could use the 'ID' values to locate the rows of 'Letters' that should take the values of 'Letters2' (which are all 'F's)
# Look up each df1 ID in df2 (NA where absent). Using df2$ID directly as a row
# position would only be correct if df1's IDs were exactly 1..n in row order.
idx <- match(df1$ID, df2$ID)
hit <- !is.na(idx)
# Overwrite only the matched rows with the corresponding replacement letter.
df1$Letters[hit] <- df2$Letters2[idx[hit]]
df1
# ID Letters
#1 1 F
#2 3 F
#3 2 C
#4 4 F
#5 5 E
Or using data.table
library(data.table)
# Convert df1 to a data.table by reference, then run an update join: rows of df1
# whose ID appears in df2 get Letters overwritten with df2's Letters2.
setDT(df1)
df1[df2, on = "ID", Letters := Letters2]
df1
# ID Letters
#1: 1 F
#2: 3 F
#3: 2 C
#4: 4 F
#5: 5 E

Related

Using lapply with data in two lists in R

I have two lists and I want to use lapply to get a new list
The data is
library(dplyr)
# Two named lists of data frames; same-named elements share the key column `x`.
list.A <- list(
  df1 = data.frame(x = 1:5, y = letters[1:5], z = rep(1, 5)),
  df2 = data.frame(x = 10:15, y = letters[5:10], z = rep(10, 6))
)
list.B <- list(
  df1 = data.frame(x = 1:6, var2 = letters[10:15], var3 = rep(7, 6)),
  df2 = data.frame(x = c(10, 12), var2 = letters[1:2], var3 = rep(5, 2))
)
I want the result to be as following
# Join each data frame in list.A with its counterpart in list.B on `x`.
dat.1 <- left_join(list.A[["df1"]], list.B[["df1"]], by = "x")
dat.2 <- left_join(list.A[["df2"]], list.B[["df2"]], by = "x")
new.list <- list(df1 = dat.1, df2 = dat.2)
But when I use lapply the results are weird and not as I wish them to be
# Nested lapply: for every data frame `a` in list.A the inner lapply walks ALL of
# list.B, so each element of new.list is itself a list holding one join per
# element of list.B, rather than one join per matching pair.
new.list <- lapply(list.A, function(a){lapply(list.B, function(b){
df <-left_join(a, b, by=("x"))
})
})
Any help, please. Do I need a loop, or would lapply work?
my actual lists have so many data frames
We can use map2 from purrr, which loops over the corresponding elements of both lists and does the left_join by the 'x' column
library(dplyr)
library(purrr)
# Walk list.A and list.B in parallel and left-join each pair of data frames on `x`.
map2(list.A, list.B, function(a, b) left_join(a, b, by = "x"))
-output
#$df1
# x y z var2 var3
#1 1 a 1 j 7
#2 2 b 1 k 7
#3 3 c 1 l 7
#4 4 d 1 m 7
#5 5 e 1 n 7
#$df2
# x y z var2 var3
#1 10 e 10 a 5
#2 11 f 10 <NA> NA
#3 12 g 10 b 5
#4 13 h 10 <NA> NA
#5 14 i 10 <NA> NA
#6 15 j 10 <NA> NA
Or Map (from base R)
# Base R equivalent: merge each pair on `x`, keeping every row of the list.A element.
Map(function(a, b) merge(a, b, by = "x", all.x = TRUE), list.A, list.B)

r - append one table to another if they share the same value in a column

# Two example tables sharing columns a, b and c; df2 additionally carries d and e.
df1 <- data.frame(a = 1:5, b = 6:10, c = rep("df1", 5))
df2 <- data.frame(
  a = c(1, 3, 5, 7, 9),
  b = 16:20,
  c = rep("df2", 5),
  d = LETTERS[1:5],
  e = LETTERS[6:10]
)
I would like to create a new table that does following:
stack one table on top of the other only if the value in column a matches (i.e. 1,3,5 only)
show only columns a, b, and c (ignore columns d and e)
in total there should be 6 rows and 3 columns, with rows 1-3 from df1 (a=1,3,5), and rows 4-6 from df2 (a=1,3,5)
base R
# Values of `a` that occur in both tables.
common <- intersect(df1$a, df2$a)
# Stack the matching rows of df1 on top of the matching rows of df2, keeping
# only columns a through c. The second subset must read from df2; reading from
# df1 again would simply duplicate df1's rows.
rbind(
  subset(df1, a %in% common, select = a:c),
  subset(df2, a %in% common, select = a:c)
)
# a b c
# 1 1 6 df1
# 3 3 8 df1
# 5 5 10 df1
# 11 1 16 df2
# 2 3 17 df2
# 31 5 18 df2
dplyr
library(dplyr)
# Keep only rows whose `a` occurs in the other table, stack df1's rows on top of
# df2's, then keep just columns a, b and c.
select(
  bind_rows(
    semi_join(df1, df2, by = "a"),
    semi_join(df2, df1, by = "a")
  ),
  a, b, c
)
# a b c
# 1 1 6 df1
# 2 3 8 df1
# 3 5 10 df1
# 4 1 16 df2
# 5 3 17 df2
# 6 5 18 df2
Use semi_join() from dplyr package.
df1 <- data.frame(a = 1:5, b = 6:10, c = rep("df1", 5))
df2 <- data.frame(a = c(1, 3, 5, 7, 9), b = 16:20, c = rep("df2", 5),
                  d = LETTERS[1:5], e = LETTERS[6:10])
library(dplyr)
# Filter each table to the rows whose `a` exists in the other one, keep the
# first three columns of each, and stack the two results.
new_df <- rbind(
  semi_join(df1, df2, by = "a")[, 1:3],
  semi_join(df2, df1, by = "a")[, 1:3]
)
new_df
Semi-Join: Returns all rows from df1 where there are matching
values in df2, keeping just columns from df1. Its a filtering join.
Output:
> new_df
a b c
1 1 6 df1
2 3 8 df1
3 5 10 df1
4 1 16 df2
5 3 17 df2
6 5 18 df2
Here is a base R option using merge and split.default :
# Inner-join on `a`; the columns that clash between the two tables come back
# with a suffix after a dot (the sub() call below strips everything from the dot on).
df3 <- merge(df1, df2, by = 'a')
# Group df3's columns by their suffix-free name, unlist each group into a single
# vector, bind those vectors into one data frame and keep only columns a through c.
result <- subset(do.call(cbind.data.frame,
sapply(split.default(df3, sub('\\..*', '', names(df3))),
unlist, use.names = FALSE)), select = a:c)
result
# a b c
#1 1 6 df1
#2 3 8 df1
#3 5 10 df1
#4 1 16 df2
#5 3 17 df2
#6 5 18 df2

Merge two dataframes and create multiple columns in R

Suppose that we have two data frames as shown below:
# df1: one match per row (two teams and the winner); df2: lookup of Country -> Index.
df1 <- data.frame(
  Team1 = c("A", "B", "C"),
  Team2 = c("D", "E", "F"),
  Winner = c("A", "E", "F")
)
df2 <- data.frame(Country = LETTERS[1:6], Index = as.numeric(1:6))
What I want is to add three columns to df1: Team1_index, Team2_index, and Winner_index.
Team1 Team2 Winner Team1_index Team2_index Winner_index
A D A 1 4 1
B E E 2 5 5
C F F 3 6 6
I tried many ways but failed. Tips and advice!
If you just have a small number of columns, you can use the match function as in the example:
# For each team column, find the position of every name in df2$Country and pull
# the Index stored at that position.
df1$Team1_index <- with(df2, Index[match(df1$Team1, Country)])
df1$Team2_index <- with(df2, Index[match(df1$Team2, Country)])
df1$Winner_index <- with(df2, Index[match(df1$Winner, Country)])
df1
If you have more columns, you may look for more systematic solutions, but if it's really just three cases, this should do:
library("tidyverse")
df1 <- data.frame(Team1 = c("A","B","C"), Team2 = c("D","E","F"), Winner = c("A","E","F"))
df2 <- data.frame(Country = c("A","B","C","D","E","F"), Index = c(1,2,3,4,5,6))
# One join per team column: rename Country to the column being looked up, join
# on it, then give the freshly added Index column a column-specific name.
df1 %>%
  left_join(rename(df2, Team1 = Country), by = "Team1") %>%
  rename(Team1_Index = Index) %>%
  left_join(rename(df2, Team2 = Country), by = "Team2") %>%
  rename(Team2_Index = Index) %>%
  left_join(rename(df2, Winner = Country), by = "Winner") %>%
  rename(Winner_Index = Index)
#> Warning: Column `Team1` joining factors with different levels, coercing to
#> character vector
#> Warning: Column `Team2` joining factors with different levels, coercing to
#> character vector
#> Warning: Column `Winner` joining factors with different levels, coercing to
#> character vector
#> Team1 Team2 Winner Team1_Index Team2_Index Winner_Index
#> 1 A D A 1 4 1
#> 2 B E E 2 5 5
#> 3 C F F 3 6 6
You can safely ignore the warnings.
To get new columns as factors :
# Recode every column as a factor whose levels are the countries and whose
# labels are the matching Index values.
df1[paste0(colnames(df1), "_index")] <-
  lapply(df1, factor, levels = df2$Country, labels = df2$Index)
# Team1 Team2 Winner Team1_index Team2_index Winner_index
# 1 A D A 1 4 1
# 2 B E E 2 5 5
# 3 C F F 3 6 6
To get new columns as numeric :
# Same recoding, but convert the factor labels back to numbers (via character,
# so the label text is used rather than the internal level code).
df1[paste0(colnames(df1), "_index")] <- lapply(df1, function(col) {
  recoded <- factor(col, levels = df2$Country, labels = df2$Index)
  as.numeric(as.character(recoded))
})
# Team1 Team2 Winner Team1_index Team2_index Winner_index
# 1 A D A 1 4 1
# 2 B E E 2 5 5
# 3 C F F 3 6 6
Note that for this specific case (index from 1 incremented by 1), this shorter version works:
# Shortcut: the internal level code of each value is its position in df2$Country.
df1[paste0(colnames(df1), "_index")] <- lapply(df1, function(col) {
  as.numeric(factor(col, levels = df2$Country))
})
I have an almost-complete solution with data.table, using melt and dcast to reshape the data
library(data.table)
df1 <- data.table(Team1 = c("A","B","C"), Team2 = c("D","E","F"), Winner = c("A","E","F"))
df2 <- data.table(Country = c("A","B","C","D","E","F"), Index = c(1,2,3,4,5,6))
# NOTE(review): `id.vars` is left empty and the result is not assigned; this
# looks like a leftover call -- confirm whether it can be dropped.
melt(data = df1 , id.vars = )
# Reshape the first two columns (Team1, Team2) to long form, then attach each
# team's Index by matching the team name against df2$Country.
plouf <- merge(df2,melt(df1,measure = 1:2), by.x = "Country", by.y = "value")
# Adds winneridx from the Index values of the rows whose Country equals Winner.
plouf[,winneridx := Index[Country == Winner]]
# Back to wide form: one column per original team column, filled with Index.
dcast(plouf,Country+winneridx~variable,value.var = "Index")
Country winneridx Team1 Team2
1: A 1 1 NA
2: B 5 2 NA
3: C 6 3 NA
4: D 1 NA 4
5: E 5 NA 5
6: F 6 NA 6
This is basically the same as giocomai's answer, just uses purrr to help eliminate duplication:
library(rlang)
library(dplyr)

# Join df1 with the df2 lookup on a single team column.
#   df1     data frame holding the team columns
#   df2     lookup data frame with columns Country and Index
#   colName name (string) of the df1 column to look up
# Returns df1 plus one new column named "<colName>_Index".
getIndexCols <- function(df1, df2, colName) {
  idxColName <- sym(paste0(colName, "_Index"))
  # Rename the lookup so its key carries the same name as the df1 column.
  lookup <- df2 %>% rename(!!sym(colName) := Country, !!idxColName := Index)
  # Explicit key: no "Joining, by = ..." message and no accidental extra keys.
  left_join(df1, lookup, by = colName)
}

# Build one joined table per column, then fold them together on the original
# df1 columns. purrr is not attached above, so both helpers are namespaced.
names(df1) %>%
  purrr::map(~ getIndexCols(df1, df2, .)) %>%
  purrr::reduce(~ left_join(.x, .y, by = names(df1)))
You can use chartr. This will take into consideration both the country column and the index column:
# Character matrix copy of df1 with "_index" appended to every column name.
df3=as.matrix(setNames(df1,paste0(names(df1),"_index")))
# chartr(old, new, x): the Country values pasted into one string act as `old`,
# the Index values pasted into one string act as `new`.
# NOTE(review): this presumably only works while every Country and every Index
# is a single character -- confirm before reusing on other data.
cbind(df1,chartr(paste0(df2$Country,collapse=""),paste0(df2$Index,collapse=""),df3))
Team1 Team2 Winner Team1_index Team2_index Winner_index
1 A D A 1 4 1
2 B E E 2 5 5
3 C F F 3 6 6
you can also do:
# Same translation, built programmatically: each df2 column is pasted into a
# single string, and those strings plus df3 become the arguments of chartr().
# NOTE(review): relies on df3 from the previous snippet and on df2's column
# order being (Country, Index) -- confirm.
cbind(df1,do.call(chartr,c(as.list(sapply(unname(df2),paste,collapse="")),list(df3))))
Team1 Team2 Winner Team1_index Team2_index Winner_index
1 A D A 1 4 1
2 B E E 2 5 5
3 C F F 3 6 6
Here is another option for you that uses match and cbind.
# Matrix copy of df1 whose column names carry an "_index" suffix.
df3 <- `colnames<-`(as.matrix(df1), paste0(colnames(df1), "_index"))
# Replace every entry by its position in df2$Country; `[]` keeps the dimensions.
df3[] <- match(df3, df2$Country)
cbind(df1, df3)
# Team1 Team2 Winner Team1_index Team2_index Winner_index
#1 A D A 1 4 1
#2 B E E 2 5 5
#3 C F F 3 6 6
df3 is created as a matrix, i.e. a vector with dimensions attribute, such that we can replace its entries with the result of match (a vector) right away and don't need to repeat the code for every column.
Or in one go
# One-step variant: match() on the matrix form of df1 yields positions in
# df2$Country, which are written into the new "_index" columns.
df1[paste0(colnames(df1), "_index")] <- match(as.matrix(df1), df2$Country)
Note however, that this ignores the index column of df2.
Thanks to @Moody_Mudskipper we could also write this more generally as
# General form: per column, locate each value in df2$Country and return the
# Index stored at that position.
df1[paste0(colnames(df1), "_index")] <- lapply(df1, function(col) {
  pos <- match(col, df2$Country)
  df2$Index[pos]
})

Count character values by group in a data.frame

I have got a data.frame which contains two columns: ID and Letter. I need to summarize the Letter observations by ID.
Here an example:
# Example data: one row per (ID, Letter) observation. The quoted text must be
# closed before `header = TRUE`, otherwise the string literal never terminates.
df <- read.table(text = 'ID Letter
1 A
1 A
1 B
1 A
1 C
1 D
1 B
2 A
2 B
2 B
2 B
2 D
2 F
3 B
3 A
3 A
3 C
3 D', header = TRUE)
My output should be 3 data.frames as follows:
df_1
A 3
B 2
C 1
D 1
df_2
A 1
B 3
D 1
F 1
df_3
A 2
B 1
C 1
D 1
It is just the count of the letters within each ID group. I think I could use a combination of the functions table and aggregate, but how?
Thanks to @akrun, please see below how I managed to do the trick:
# Build one data.frame of letter counts per ID.
library(dplyr)
lst <- lapply(split(df, df$ID), function(grp) {
  # Count each letter within the group, then drop the constant ID column.
  counts <- grp %>%
    count(ID, Letter) %>%
    ungroup() %>%
    select(-ID)
  # Return a plain data.frame.
  as.data.frame(counts)
})
This will also work (with base R):
# Per ID: tabulate the letters and keep only those that actually occur.
lapply(split(df, df$ID), function(grp) {
  freq <- as.data.frame(table(grp$Letter))
  freq[freq$Freq != 0, ]
})

how to subset in r for this particular condition?

df1 and df2 have columns a,b. I want to subset data from df1 such that each entry in df1$a along with df1$b is in df2$a along with df2$b.
df1
a b c
1 m df1
2 f df1
3 f df1
4 m df1
5 f df1
6 m df1
df2
a b c
1 m df2
3 f df2
4 f df2
5 m df2
6 f df2
7 m df2
desired output
df
a b c
1 m df1
3 f df1
i am using :
# Tests the two columns independently: a row is kept when its `a` occurs
# anywhere in df2$a AND its `b` occurs anywhere in df2$b, not when the (a, b)
# pair occurs together in one row of df2.
df <- subset(df1,(df1$a%in%df2$a & df1$b%in%df2$b))
but this is giving results similar to
df <-subset(df1,df1$a%in%df2$a)
You can use package dplyr:
library(dplyr)
# Rows that appear in both data frames (all columns compared).
df1 %>% intersect(df2)
# a b
#1 1 m
#2 3 f
Edit for the new data.frames with c column:
you can use function semi_join (also from dplyr):
# Keep the rows of df1 whose (a, b) pair also exists in df2; only df1's columns are returned.
df1 %>% semi_join(df2, by = c("a", "b"))
# a b c
#1 1 m df1
#2 3 f df1
Other option, in base R:
you can paste your a and b variables to subset your data.frame:
# Build a combined "a b" key per row in each table and keep the df1 rows whose
# key also occurs in df2.
df1[with(df1, paste(a, b)) %in% with(df2, paste(a, b)), ]
# a b
#1 1 m
#3 3 f
and with the new data.frames:
# a b c
# 1 1 m df1
# 3 3 f df1
Or you could do
# Stack both tables; a row flagged by duplicated() has an identical row above it.
Res <- rbind(df1, df2)
subset(Res, duplicated(Res))
# a b
# 7 1 m
# 8 3 f
Edit1: Per the edit, here's a similar data.table solution
library(data.table)
Res <- rbind(df1, df2)
# Convert by reference, then flag duplicates on (a, b) only. Scanning from the
# last row upwards marks the earlier (df1) copy of each pair.
setDT(Res)
Res[duplicated(Res, by = c("a", "b"), fromLast = TRUE)]
# a b c
# 1: 1 m df1
# 2: 3 f df1
Edit2: I see that @CathG opened a join battlefront, so here's how we do it with data.table
# Convert both tables to keyed data.tables (key = a, b), then do a keyed join
# that drops the rows of df2 without a match in df1.
setDT(df1, key = c("a", "b"))
setDT(df2, key = c("a", "b"))
df1[df2, nomatch = 0]
# a b c i.c
# 1: 1 m df1 df2
# 2: 3 f df1 df2

Resources