dplyr join with three data frame - r

I have 3 data frames as like this
df1 <- structure(list(Vehicle = c("Car1", "Car2", "Car8"), Year = c(20L,
21L, 20L), type = c("A", "A", "A")), class = "data.frame", row.names = c(NA, -3L))
df2 <- structure(list(Vehicle = c("Car1", "Car2", "Car7"), Year = c(20L,
21L, 90L), type = c("M", "M", "M")), class = "data.frame", row.names = c(NA, -3L))
df3 <- structure(list(Vehicle = c("Car1", "Car2", "Car9"), Year = c(20L,
21L, 92L), type = c("I", "I", "I")), class = "data.frame", row.names = c(NA, -3L))
And I need to make a new table as follows
Vehicle Year type
Car1 20 A/M/I
Car2 21 A/M/I
Car7 90 M
Car8 20 A
Car9 92 I
for this purpose I used this code using dplyr as like this, but it is not working with 3 data frames:
dplyr::full_join(df1, df2, df3, by = c('Vehicle', 'Year')) %>%
tidyr::unite(type, type.x, type.y, sep = '/', na.rm = TRUE)

Try this approach. Instead of merging it looks like you want to combine all dataframes and then aggregate. Here the code using dplyr:
library(dplyr)
#Code
newdf <- bind_rows(df1,df2,df3) %>%
group_by(Vehicle,Year) %>%
summarise(type=paste0(type,collapse='|'))
Output:
# A tibble: 5 x 3
# Groups: Vehicle [5]
Vehicle Year type
<chr> <int> <chr>
1 Car1 20 A|M|I
2 Car2 21 A|M|I
3 Car7 90 M
4 Car8 20 A
5 Car9 92 I

Generally, to merge >2 data.frame's/tibble's you'd use either base R's Reduce or purrr::reduce; for example using the latter:
list(df1, df2, df3) %>%
purrr::reduce(dplyr::full_join, by = c("Vehicle", "Year")) %>%
tidyr::unite(type, dplyr::starts_with("type"), sep = "/", na.rm = TRUE)
# Vehicle Year type
#1 Car1 20 A/M/I
#2 Car2 21 A/M/I
#3 Car8 20 A
#4 Car7 90 M
#5 Car9 92 I

Using base R
aggregate(type ~ Vehicle + Year, rbind(df1, df2, df3) ,
FUN = paste, collapse="|")
-output
# Vehicle Year type
#1 Car1 20 A|M|I
#2 Car8 20 A
#3 Car2 21 A|M|I
#4 Car7 90 M
#5 Car9 92 I

Related

How to merge data frame by picking up selected values based on some logical criteria?

I have 3 data frames with a similar structure, and I am trying to fill a 4th data frame with values from the first 3 data frames, chosen on the basis of a logical condition.
My data frame 1
`Account id Value $ RMSE
1 500 10
2 7000 15
3 1900 20
My data frame 2
`Account id Value $ RMSE
1 400 5
2 8000 18
3 1700 18
My data frame 3
`Account id Value $ RMSE
1 500 10
2 2000 25
3 5000 0.2
My desired result is (Value picked up from data frame which has lowest corresponding RMSE)
`Account id Value $
1 400
2 7000
3 5000
Request your help on how to merge.
In the case of your issue you have to bind all your dataframes by row. After that you can use tidyverse functions in order to filter by group defined by account id. Here the code with a tidyverse approach:
library(tidyverse)
#Code
ndf <- do.call(bind_rows,list(df1,df2,df3)) %>%
group_by(Account.id) %>%
filter(RMSE==min(RMSE)) %>% select(Account.id,Value) %>%
arrange(Account.id)
Output:
# A tibble: 3 x 2
# Groups: Account.id [3]
Account.id Value
<int> <int>
1 1 400
2 2 7000
3 3 5000
Some data used:
#Data 1
df1 <- structure(list(Account.id = 1:3, Value = c(500L, 7000L, 1900L
), RMSE = c(10L, 15L, 20L)), class = "data.frame", row.names = c(NA,
-3L))
#Data 2
df2 <- structure(list(Account.id = 1:3, Value = c(400L, 8000L, 1700L
), RMSE = c(5L, 18L, 18L)), class = "data.frame", row.names = c(NA,
-3L))
#Data 3
df3 <- structure(list(Account.id = 1:3, Value = c(500L, 2000L, 5000L
), RMSE = c(10, 25, 0.2)), class = "data.frame", row.names = c(NA,
-3L))
An option with data.table
library(data.table)
rbindlist(list(df1, df2, df3))[, .(Value = Value[which.min(RMSE)]), .(Account.id)]
# Account.id Value
#1: 1 400
#2: 2 7000
#3: 3 5000
Or with tidyverse using slice_min after binding the datasets together with bind_rows
library(dplyr)
bind_rows(df1, df2, df3) %>%
group_by(Account.id) %>%
slice_min(RMSE) %>%
select(-RMSE)
# A tibble: 3 x 2
# Groups: Account.id [3]
# Account.id Value
# <int> <int>
#1 1 400
#2 2 7000
#3 3 5000
df1 <- structure(list(Account.id = 1:3, Value = c(500L, 7000L, 1900L
), RMSE = c(10L, 15L, 20L)), class = "data.frame", row.names = c(NA,
-3L))
df2 <- structure(list(Account.id = 1:3, Value = c(400L, 8000L, 1700L
), RMSE = c(5L, 18L, 18L)), class = "data.frame", row.names = c(NA,
-3L))
df3 <- structure(list(Account.id = 1:3, Value = c(500L, 2000L, 5000L
), RMSE = c(10, 25, 0.2)), class = "data.frame", row.names = c(NA,
-3L))
A base R option is using merge + aggregate
# Stack the three frames, find the smallest RMSE per account, then merge the
# minima back onto the stacked data to recover the matching Value.
# list() (not tibble's lst()) keeps this a pure base R solution.
df <- do.call(rbind, list(df1, df2, df3))
merge(
  df,
  # merge() joins on the columns the two inputs share: Account.id and RMSE.
  aggregate(RMSE ~ Account.id, df, min)
)[c("Account.id", "Value")]
which gives
Account.id Value
1 1 400
2 2 7000
3 3 5000

Combine two matrix and mark common

I have two matrix as like this
Vehicle1 Year type
Car1 20 A
Car2 21 A
Car8 20 A
Second one
Vehicle2 Year type
Car1 20 M
Car2 21 M
Car7 90 M
I just need to combine the matrix based on the first column(Vehicle) and need to mark common as A/M as like this
Vehicle Year type
Car1 20 A/M
Car2 21 A/M
Car7 90 M
Car8 20 A
I used merge function for this but it only printing the common one
You can join the two dataframe and combine the type columns :
dplyr::full_join(df1, df2, by = c('Vehicle1' = 'Vehicle2', 'Year')) %>%
tidyr::unite(type, type.x, type.y, sep = '/', na.rm = TRUE)
# Vehicle1 Year type
#1 Car1 20 A/M
#2 Car2 21 A/M
#3 Car8 20 A
#4 Car7 90 M
data
df1 <- structure(list(Vehicle1 = c("Car1", "Car2", "Car8"), Year = c(20L,
21L, 20L), type = c("A", "A", "A")), class = "data.frame", row.names = c(NA, -3L))
df2 <- structure(list(Vehicle2 = c("Car1", "Car2", "Car7"), Year = c(20L,
21L, 90L), type = c("M", "M", "M")), class = "data.frame", row.names = c(NA, -3L))
Another dplyr solution.
library(dplyr)
df2 %>%
rename(Vehicle1 = Vehicle2) %>%
bind_rows(df1, .) %>%
group_by(Vehicle1, Year) %>%
summarise(type = paste(type, collapse = "/")) %>%
ungroup()
# # A tibble: 4 x 3
# Vehicle1 Year type
# <chr> <int> <chr>
# 1 Car1 20 A/M
# 2 Car2 21 A/M
# 3 Car7 90 M
# 4 Car8 20 A
You can also do this easily in base R.
# Full outer join on the vehicle name; the result has the columns
# Vehicle1, Year.x, type.x, Year.y, type.y.
rr <- merge(m1, m2, all = TRUE, by.x = "Vehicle1", by.y = "Vehicle2")
# Stack the two Year columns (2, 4) and the two type columns (3, 5) into long
# form, drop rows containing NA, and restore the original column names.
rr <- setNames(
  na.omit(reshape(rr, idvar = "Vehicle1", varying = list(c(2, 4), c(3, 5)),
                  direction = "long")),
  c("Vehicle1", "t", names(m1)[-1])
)
# A vehicle that shows up more than once was present in both inputs.
is_dupe <- duplicated(rr$Vehicle1)
rr[rr$Vehicle1 %in% rr$Vehicle1[is_dupe], 4] <- "A/M"
# Keep one row per vehicle and drop the reshape time index (column 2).
# A logical mask is used because rr[-integer(0), ] would return no rows
# when the two inputs share no vehicle.
res <- rr[!is_dupe, -2]
res
# Vehicle1 Year type
# Car1.1 Car1 20 A/M
# Car2.1 Car2 21 A/M
# Car8.1 Car8 20 A
# Car7.2 Car7 90 M
Data:
m1 <- structure(list(Vehicle1 = c("Car1", "Car2", "Car8"), Year = c(20L,
21L, 20L), type = c("A", "A", "A")), class = "data.frame", row.names = c(NA,
-3L))
m2 <- structure(list(Vehicle2 = c("Car1", "Car2", "Car7"), Year = c(20L,
21L, 90L), type = c("M", "M", "M")), class = "data.frame", row.names = c(NA,
-3L))
Here is a base R option using merge
# Full outer join on vehicle and year; the type columns come back as
# type.x / type.y, with NA where a vehicle is missing from one side.
tmp <- merge(df1, df2,
             by.x = c("Vehicle1", "Year"),
             by.y = c("Vehicle2", "Year"),
             all = TRUE)
type_cols <- grep("type", names(tmp))
dfout <- cbind(
  tmp[c("Vehicle1", "Year")],
  # Join every non-missing type of a row with "/". Unlike a scalar ifelse(),
  # this keeps all remaining types when only some of them are NA.
  type = apply(
    tmp[type_cols],
    1,
    function(x) paste(x[!is.na(x)], collapse = "/")
  )
)
such that
> dfout
Vehicle1 Year type
1 Car1 20 A/M
2 Car2 21 A/M
3 Car7 90 M
4 Car8 20 A
Data
> dput(df1)
structure(list(Vehicle1 = c("Car1", "Car2", "Car8"), Year = c(20L,
21L, 20L), type = c("A", "A", "A")), class = "data.frame", row.names = c(NA,
-3L))
> dput(df2)
structure(list(Vehicle2 = c("Car1", "Car2", "Car7"), Year = c(20L,
21L, 90L), type = c("M", "M", "M")), class = "data.frame", row.names = c(NA,
-3L))

join variable with variable in which many data contains in row in R

I want perform join.
df1=structure(list(id = 1:3, group_id = c(10L, 20L, 40L)), class = "data.frame", row.names = c(NA,
-3L))
df2 has a different structure: its group_id field contains many groups in one value, for example {10,100,400}.
so dput()
df2=structure(list(id = 1:3, group_id = structure(c(1L, 3L, 2L), .Label = c("{`10`,100,`40`}",
"{3,`40`,600,100}", "{4}"), class = "factor")), class = "data.frame", row.names = c(NA,
-3L))
df2 has group_id 10 and 40,but they are in braces together with other groups.
How get desired joined output
id group_id
1 10
1 40
3 40
You can clean group_id in df2 using gsub, bring each id in separate rows and filter.
library(dplyr)
df2 %>%
mutate(group_id = gsub('[{}`]', '', group_id)) %>%
tidyr::separate_rows(group_id) %>%
filter(group_id %in% df1$group_id)
# id group_id
#1 1 10
#2 1 40
#3 3 40
Here's a data.table alternative:
library(data.table)
# df2 is a plain data.frame here; convert a copy so the data.table
# `[i, j, by]` syntax applies without modifying df2 itself.
as.data.table(df2)[
  , strsplit(gsub("[{}`]", "", group_id), ","), by = id
][V1 %in% df1$group_id]
# id V1
#1: 1 10
#2: 1 40
#3: 3 40
here is an option with base R using regmatches/regexpr
subset(setNames(stack(setNames(regmatches(df2$group_id, gregexpr("\\d+", df2$group_id)),
df2$id))[2:1], c('id', 'group_id')), group_id %in% df1$group_id)
# id group_id
#1 1 10
#3 1 40
#6 3 40

Merging two df One to Many within List - R

To start I will ignore the use of lists and show what I want using two df's.
I have df1
ID v1 Join_ID
1 100 1
2 110 2
3 150 3
And df2
Join_ID Type v2
1 a 80
1 b 90
2 a 70
2 b 60
3 a 50
3 b 40
I want the df.join to be:
ID v1 Join_ID a_v2 b_v2
1 100 1 80 90
2 110 2 70 60
3 150 3 50 40
I have tried:
df.merged <- merge(df1, df2, by="Join_ID")
df.wide <- dcast(melt(df.merged, id.vars=c("ID", "type")), ID~variable+type)
But this repeats all the variables in df1 for each type: v1_a v1_b
On top of this I have two lists
list.1
df1_a
df1_b
df1_c
list.2
df2_a
df2_b
df2_c
And I want the df1_a in list 1 to join with the df2_a in list 2
We can do this by mapping through the corresponding list elements and then doing the join
library(tidyverse)
map2(list.1, list.2, ~
.y %>%
mutate(Type = paste0(Type, "_v2")) %>%
spread(Type, v2) %>%
inner_join(.x, by = 'Join_ID'))
data
df1 <- structure(list(ID = 1:3, v1 = c(100L, 110L, 150L), Join_ID = 1:3),
.Names = c("ID",
"v1", "Join_ID"), class = "data.frame", row.names = c(NA, -3L
))
df2 <- structure(list(Join_ID = c(1L, 1L, 2L, 2L, 3L, 3L), Type = c("a",
"b", "a", "b", "a", "b"), v2 = c(80L, 90L, 70L, 60L, 50L, 40L
)), .Names = c("Join_ID", "Type", "v2"), class = "data.frame", row.names = c(NA,
-6L))
list.1 <- list(df1_a = df1, df1_b = df1, df1_c = df1)
list.2 <- list(df2_a = df2, df2_b = df2, df2_c = df2)
Some replies to your request :
1. the reshaping of df2
2. the join with different column names
library(reshape2)
# Sample inputs: df1 is keyed by id, df2 has one row per Join_ID/Type pair.
df1 <- data.frame(id = c(1, 2, 3), v1 = c(100, 110, 150))
df2 <- data.frame(
  Join_ID = c(1, 1, 2, 2, 3, 3),
  Type = c("a", "b", "a", "b", "a", "b"),
  v2 = c(80, 90, 70, 60, 50, 40)
)
# 1. Reshape df2 to one row per Join_ID with one column per Type.
#    value.var is named explicitly so dcast does not have to guess it.
cast_df2 <- dcast(df2, Join_ID ~ Type, value.var = "v2")
# 2. Join on differently named key columns. full_join() comes from dplyr,
#    whose argument is `suffix` (`suffixes` belongs to base merge()).
mergedData <- dplyr::full_join(
  df1, cast_df2,
  by = c("id" = "Join_ID"),
  suffix = c("_df1", "_df2")
)

Manipulating all split data sets

I'm drawing a blank-- I have 51 sets of split data from a data frame that I had, and I want to take the mean of the height of each set.
print(dataset)
$`1`
ID Species Plant Height
1 A 1 42.7
2 A 1 32.5
$`2`
ID Species Plant Height
3 A 2 43.5
4 A 2 54.3
5 A 2 45.7
...
...
...
$`51`
ID Species Plant Height
134 A 51 52.5
135 A 51 61.2
I know how to run each individually, but with 51 split sections, it would take me ages.
I thought that
mean(dataset[,4])
might work, but it says that I have the wrong number of dimensions. I get now why that is incorrect, but I am no closer to figuring out how to average all of the heights.
The dataset is a list. We could use lapply/sapply/vapply etc to loop through the list elements and get the mean of the 'Height' column. Using vapply, we can specify the class and length of the output (numeric(1)). This will be useful for debugging.
vapply(dataset, function(x) mean(x[,4], na.rm=TRUE), numeric(1))
# 1 2 51
#37.60000 47.83333 56.85000
Or another option (if we have the same column names/number of columns for the data.frames in the list) would be to use `rbindlist` from `data.table` with the option `idcol=TRUE` to generate a single `data.table`. The '.id' column shows the names of the `list` elements. We group by '.id' and get the `mean` of the `Height`.
library(data.table)
rbindlist(dataset, idcol=TRUE)[, list(Mean=mean(Height, na.rm=TRUE)), by = .id]
# .id Mean
#1: 1 37.60000
#2: 2 47.83333
#3: 51 56.85000
Or a similar option as above is unnest from library(tidyr) to return a single dataset with the '.id' column, grouped by '.id', we summarise to get the mean of 'Height'.
library(dplyr)
# Stack the list into one data frame; `.id` records each element's name.
# (unnest(dataset, .id) on a bare list relied on the pre-1.0 tidyr interface;
# bind_rows() with `.id` is the current equivalent.)
bind_rows(dataset, .id = ".id") %>%
  group_by(.id) %>%
  summarise(Mean = mean(Height, na.rm = TRUE))
# .id Mean
#1 1 37.60000
#2 2 47.83333
#3 51 56.85000
The syntax for plyr is
# Stack the list into one data frame with the element names in `.id`
# (replaces the pre-1.0 tidyr call unnest(dataset, .id)).
df1 <- dplyr::bind_rows(dataset, .id = ".id")
# Namespace-qualified so plyr does not have to be attached on top of dplyr.
plyr::ddply(df1, ".id", plyr::summarise, Mean = mean(Height, na.rm = TRUE))
# .id Mean
#1 1 37.60000
#2 2 47.83333
#3 51 56.85000
data
dataset <- structure(list(`1` = structure(list(ID = 1:2, Species = c("A",
"A"), Plant = c(1L, 1L), Height = c(42.7, 32.5)), .Names = c("ID",
"Species", "Plant", "Height"), class = "data.frame", row.names = c(NA,
-2L)), `2` = structure(list(ID = 3:5, Species = c("A", "A", "A"
), Plant = c(2L, 2L, 2L), Height = c(43.5, 54.3, 45.7)), .Names = c("ID",
"Species", "Plant", "Height"), class = "data.frame", row.names = c(NA,
-3L)), `51` = structure(list(ID = 134:135, Species = c("A", "A"
), Plant = c(51L, 51L), Height = c(52.5, 61.2)), .Names = c("ID",
"Species", "Plant", "Height"), class = "data.frame", row.names = c(NA,
-2L))), .Names = c("1", "2", "51"))
This also works, though it uses dplyr.
library(dplyr)
# Tag each data frame with its position in the list, stack them, and average
# Height per position. Note `section` is the list index (1, 2, 3, ...), not
# the element name.
seq_along(dataset) %>%
  lapply(function(i) {
    dataset[[i]] %>%
      mutate(section = i)
  }) %>%
  bind_rows() %>%
  group_by(section) %>%
  summarize(mean_height = mean(Height))

Resources