I have a data set like this....
ID Brand
--- --------
1 Cokacola
2 Pepsi
3 merge with 1
4 merge with 2
5 merge with 1
6 Fanta
And I want to write a R function which merge the rows and introduce new variable according to ID just like following...
ID Brand merge
---- -------- --------
1 Cokacola 1,3,5
2 Pepsi 2,4
6 Fanta 6
Your data:
dat <- data.frame(
id = 1:6,
brand = c('Cokacola', 'Pepsi', 'merge with 1', 'merge with 2', 'merge with 1', 'Fanta'))
Inelegant-but-functional code:
repeats <- grepl('^merge with', dat$brand)
groups <- ifelse(repeats, gsub('merge with ', '', dat$brand), dat$id)
merge <- sapply(unique(groups), function(x) paste(dat$id[groups==x], collapse=','))
dat <- dat[!repeats,]
dat$merge <- merge
dat
## id brand merge
## 1 1 Cokacola 1,3,5
## 2 2 Pepsi 2,4
## 6 6 Fanta 6
There are most certainly ways to make this more elegant, depending on the consistency and makeup of the data.
You could try
library(reshape2)
indx <- !grepl('merge', df$Brand)
df1 <- df[indx,]
val <- as.numeric(sub('[^0-9]+', '', df[!indx, 'Brand']))
ml <- melt(tapply(which(!indx), val, FUN=toString))
df2 <- merge(df1, ml, by.x='ID', by.y='Var1', all=TRUE)
df2$merge <- with(df2, ifelse(!is.na(value),
paste(ID, value, sep=', '), ID))
df2[-3]
# ID Brand merge
#1 1 Cokacola 1, 3, 5
#2 2 Pepsi 2, 4
#3 6 Fanta 6
Related
I often have two data frames that I wish to join, where I expect all values to join. If not all values are present in both data frames, I want it to return an error.
Here is a MWE:
library(dplyr, warn.conflicts = FALSE)
df1 <- data.frame(
id = c(1:5),
value1 = rep(1, 5)
)
print(df1)
#> id value1
#> 1 1 1
#> 2 2 1
#> 3 3 1
#> 4 4 1
#> 5 5 1
df2 <- data.frame(
id = c(1:4),
value2 = rep(2, 4)
)
print(df2)
#> id value2
#> 1 1 2
#> 2 2 2
#> 3 3 2
#> 4 4 2
df3 <- inner_join(
df1,
df2,
by = "id")
print(df3)
#> id value1 value2
#> 1 1 1 2
#> 2 2 1 2
#> 3 3 1 2
#> 4 4 1 2
# Check if all values have joined
stopifnot(
nrow(df3) == max(nrow(df1), nrow(df2))
)
#> Error: nrow(df3) == max(nrow(df1), nrow(df2)) is not TRUE
Created on 2021-03-31 by the reprex package (v1.0.0)
This works, but I do not like the stopifnot(). It feels cumbersome, and particularly if I wish to overwrite df2, then I need to create a temp value df2_previous_row_num = nrow(df2) and then do stopifnot(nrow(df2) == df2_previous_row_num).
Also the nrow() test only works if all values in id are unique. There are other methods, e.g. stopifnot(c(df1$id %in% df3$id, df2$id %in% df3$id)) but again these are ugly.
Really what I am looking for is a parameter that makes the join fail if some values do not join. Something like, inner_join(df1, df2, fail_if_not_all_present = TRUE).
I am not attached to the tidyverse - if there is a base R or data.table way of doing this then I would consider those.
Does anyone know anything?
You can try writing a custom inner join function.
custom_inner_join <- function(data1,data2,by, fail_if_not_all_present = FALSE) {
if(fail_if_not_all_present) {
vals1 <- do.call(paste, data1[cols])
vals2 <- do.call(paste, data2[cols])
if(all(vals1 %in% vals2) && all(vals2 %in% vals1)) {
merge(data1, data2, by)
} else stop('Not all key values are present')
} else {
merge(data1, data2, by)
}
}
custom_inner_join(df1, df2, 'id')
# id value1 value2
#1 1 1 2
#2 2 1 2
#3 3 1 2
#4 4 1 2
custom_inner_join(df1, df2, 'id', fail_if_not_all_present = TRUE)
Error in custom_inner_join(df1, df2, "id", fail_if_not_all_present = TRUE) :
Not all key values are present
I have a data frame with a certain number of rows.
Would like to drop all rows after a specific row number or after a date.
Any suggestions?
Could not find anything on the web that works for me for the moment...
Here's a way how you can do this:
df <- df[1:2, ] ## one way of selecting rows from first row to row number you want in a data frame
# a b c date
#1 1 2 3 2017-01-01
#2 1 2 3 2017-01-02
df <- df[-(3:nrow(df)), ] ## another way of filtering rows from starting from row which you don't want to total number of rows in a data frame
# a b c date
#1 1 2 3 2017-01-01
#2 1 2 3 2017-01-02
df <- df[df$date < "2017-01-03", ] ## subset based on a date value
# a b c date
#1 1 2 3 2017-01-01
#2 1 2 3 2017-01-02
data
df = data.frame(a = c(1,1,4,4), b = c(2,2,5,5), c = c(3,3,6,6),
date = seq(from = as.Date("2017-01-01"), to = as.Date("2017-01-04"), by = 'day')) ## creating a dummy data frame
We can use head
n <- 5
df2 <- head(df1, n)
df2
# date col2
#1 2019-01-01 -0.5458808
#2 2019-02-01 0.5365853
#3 2019-03-01 0.4196231
#4 2019-04-01 -0.5836272
#5 2019-05-01 0.8474600
Or create a logical vector
df1[seq_len(nrow(df1)) <= n, ]
Or another option is slice
library(dplyr)
df1 %>%
slice(seq_len(n))
Or with data.table
library(data.table)
setDT(df1)[seq_len(n)]
If it is based on a date value
date1 <- as.Date("2019-05-01")
subset(df1, date <= date1)
data
set.seed(24)
df1 <- data.frame(date = seq(as.Date("2019-01-01"), length.out = 10,
by = "month"), col2 = rnorm(10))
I have a list of identically structured lists as follows:
test1 <- list(first = data.frame(col1 = c(1,2), col2 = c(3,4)),
second = data.frame(COL1 = c(100,200), COL2 = c(300, 400)))
test2 <- list(first = data.frame(col1 = c(5,6), col2 = c(7,8)),
second = data.frame(COL1 = c(500,600), COL2 = c(700,800)))
orig.list <- list(test1, test2)
I want to:
Bind the rows the first element of each nested list together, bind the rows 2nd element of each nested list together, etc.
Recombine the resulting elements into a single list with an identical structure to the first list.
I can easily do this element by element via:
firsts <- orig.list %>% purr::map(1) %>% dplyr::bind_rows()
seconds <- orig.list %>% purr::map(2) %>% dplyr::bind_rows()
new.list <- list(first = firsts, second = seconds)
However, for n list elements this requires that I:
know the number of elements in each list,
know the names and orders of the elements so I can recreate the new list with the correct names and order,
copy and past the same line of code over and over again.
I'm looking for how to apply purrr:map (or some other tidyverse function) more generically to combine all elements of a list of lists, preserving the element names and order.
Under the simplest cases as you've shown with your data, you can use pmap to walk through the list in parallel and bind_rows to combine individual data frames:
library(tidyverse)
pmap(orig.list, bind_rows)
#$first
# col1 col2
#1 1 3
#2 2 4
#3 5 7
#4 6 8
#$second
# COL1 COL2
#1 100 300
#2 200 400
#3 500 700
#4 600 800
identical(pmap(orig.list, bind_rows), new.list)
# [1] TRUE
To make this a little bit more generic, i.e. handles cases where the number of elements and order of names in each sublist can vary, you can use:
map(map_df(orig.list, ~ as.data.frame(map(.x, ~ unname(nest(.))))), bind_rows)
i.e. you nest each sub list as a data frame, and let bind_rows to check the names for you.
Test Cases:
With test1 the same, switch the order of the elements in test2:
test2 <- list(second = data.frame(COL1 = c(500,600), COL2 = c(700,800)),
first = data.frame(col1 = c(5,6), col2 = c(7,8)))
orig.list1 <- list(test1, test2)
map(map_df(orig.list1, ~ as.data.frame(map(.x, ~ unname(nest(.))))), bind_rows)
gives:
#$first
# col1 col2
#1 1 3
#2 2 4
#3 5 7
#4 6 8
#$second
# COL1 COL2
#1 100 300
#2 200 400
#3 500 700
#4 600 800
Now drop one element from test2:
test2 <- list(first = data.frame(col1 = c(5,6), col2 = c(7,8)))
orig.list2 <- list(test1, test2)
map(map_df(orig.list2, ~ as.data.frame(map(.x, ~ unname(nest(.))))), bind_rows)
gives:
#$first
# col1 col2
#1 1 3
#2 2 4
#3 5 7
#4 6 8
#$second
# COL1 COL2
#1 100 300
#2 200 400
You want purrr::transpose :
library(purrr)
library(dplyr)
transpose(orig.list) %>% map(bind_rows)
# $first
# col1 col2
# 1 1 3
# 2 2 4
# 3 5 7
# 4 6 8
#
# $second
# COL1 COL2
# 1 100 300
# 2 200 400
# 3 500 700
# 4 600 800
I have 3 dataframes with unequal rows
df1-
T1 T2 T3
1 Joe TTT
2 PP YYY
3 JJ QQQ
5 UU OOO
6 OO GGG
df2
X1 X2
1 09/20/2017
2 08/02/2015
3 05/02/2000
8 06/03/1999
df3
L1 L2
1 New
6 Notsure
9 Also
The final dataframe should be like a left join of all 3 only retaining rows of df1. The matching rows are T1, X1 and L1 but with different header names. The number of rows are different in each dataframe. I couldn't find a solution for this situation. On SO, what i found was available for 2 dataframes or 3 dataframes with equal rows or same column name
T1 T2 T3 X2 L2
1 Joe TTT 09/20/2017 New
2 PP YYY 08/02/2015 NA
3 JJ QQQ 05/02/2000 NA
5 UU OOO NA NA
6 OO GGG NA NotSure
I am comparatively new in R, and couldn't find a R code for this
The idea is to put your data frames in a list, change the name of the first column, and use Reduce to merge, i.e.
Reduce(function(...) merge(..., by = 'Var1', all.x = TRUE),
lapply( mget(ls(pattern = 'df[0-9]+')), function(i) {names(i)[1] <- 'Var1'; i}))
which gives,
Var1 T2 T3 X2 L2
1 1 Joe TTT 09/20/2017 New
2 2 PP YYY 08/02/2015 Old
3 3 JJ QQQ 05/02/2000 <NA>
4 5 UU OOO <NA> <NA>
5 6 OO GGG <NA> Notsure
using tidyverse functions, you can try:
df1 %>%
left_join(df2, by = c("T1" = "X1")) %>%
left_join(df3, by = c("T1" = "L1"))
which gives:
T1 T2 T3 X2 L2
1 1 Joe TTT 09/20/2017 New
2 2 PP YYY 08/02/2015 <NA>
3 3 JJ QQQ 05/02/2000 <NA>
4 5 UU OOO <NA> <NA>
5 6 OO GGG <NA> Notsure
1) sqldf
library(sqldf)
sqldf("select df1.*, X2, L2
from df1
left join df2 on T1 = X1
left join df3 on T1 = L1")
1a) Although slightly longer this variation can make it easier later when reviewing the code by making it explicit as to which source each column came from. If the data frame names were long you might want to use aliases, e.g. from df1 as a, but here we don't bother since they are short.
sqldf("select df1.*, df2.X2, df3.L2
from df1
left join df2 on df1.T1 = df2.X1
left join df3 on df1.T1 = df3.L1")
2) merge Using repeated merge. No packages used.
Merge <- function(x, y) merge(x, y, by = 1, all.x = TRUE)
Merge(Merge(df1, df2), df3)
2a) This could also be written using a magrittr pipeline like this:
library(magrittr)
df1 %>% Merge(df2) %>% Merge(df3)
2b) Using Reduce we can do the repeated merges like this:
Reduce(Merge, list(df1, df2, df3))
Note: The inputs in reproducible form are:
Lines1 <- "
T1 T2 T3
1 Joe TTT
2 PP YYY
3 JJ QQQ
5 UU OOO
6 OO GGG"
Lines2 <- "
X1 X2
1 09/20/2017
2 08/02/2015
3 05/02/2000
8 06/03/1999"
Lines3 <- "
L1 L2
1 New
6 Notsure
9 Also"
df1 <- read.table(text = Lines1, header = TRUE)
df2 <- read.table(text = Lines2, header = TRUE)
df3 <- read.table(text = Lines3, header = TRUE)
With left_join() it would be something like this
df1 = data.frame(X = c("a", "b", "c"), var1 = c(1,2, 3))
df2 = data.frame(V = c("a", "b", "c"), var2 =c(5,NA, NA) )
df3 = data.frame(Y = c("a", "b", "c"), var3 =c("name", NA, "age") )
# rename
df2 = df2 %>% rename(X = V)
df3 = df3 %>% rename(X = Y)
df = left_join(df1, df2, by = "X") %>%
left_join(., df3, by = "X")
> df
X var1 var2 var3
1 a 1 5 name
2 b 2 NA <NA>
3 c 3 NA age
Define a list dats with two dataframes, df1 and df2
dats <- list( df1 = data.frame(a=sample(1:3), b = sample(11:13)),
df2 = data.frame(a=sample(1:3), b = sample(11:13)))
> dats
$df1
a b
1 2 12
2 3 11
3 1 13
$df2
a b
1 3 13
2 2 11
3 1 12
I would like to drop variable a in each data frame. Next I would like to add a variable with the id of each dataframe from an external dataframe, like:
ids <- data.frame(id=c("id1","id2"),df=c("df1","df2"))
> ids
id df
1 id1 df1
2 id2 df2
To drop unnecessary vars I tried this without luck:
> dats <- lapply(dats, function(x) assign(x, x[,c("b")]))
> Error in assign(x, x[, c("b")]) : invalid first argument
Not sure how to add the id either.
I also tried, perhaps more appropriately:
> temp <- lapply(dats, function(x) subset(x[1], select=x[[1]]$b))
Error in x[[1]]$b : $ operator is invalid for atomic vectors
What I find confusing is that str(out[1]) returns a list, str(out[[1]]) returns a dataframe. I think that may have something to do with it.
Or try this: Extract your ids into a named vector that maps the data-frame name to the id:
df2id <- ids$id
names(df2id) <- ids$df
> df2id
df1 df2
id1 id2
Levels: id1 id2
Then use mapply to both (a) drop the a column from each data-frame, and (b) add the id column:
> mapply( function(d,x) cbind( subset(d, select = -a),
+ id = x),
+ dats, df2id[ names(dats) ] ,
+ SIMPLIFY=FALSE)
$df1
b id
1 12 id1
2 11 id1
3 13 id1
$df2
b id
1 12 id2
2 11 id2
3 13 id2
Note that we are passing df2id[ names(dats) ] to the mapply -- this ensures that the data-frames in df2id are "aligned" with the data-frames in dats.
Is this OK?
dats <- list( df1 = data.frame(a=sample(1:3), b = sample(11:13)),
df2 = data.frame(a=sample(1:3), b = sample(11:13)))
ids <- data.frame(id=c("id1","id2"),df=c("df1","df2"))
# remove variable a
dats2 <- lapply(dats, function(x) x[,!names(x) == "a"])
# add id
for(i in 1:length(dats2)) {
dats2[[i]] <- merge(dats2[[i]], ids$id[ids$df == names(dats2)[i]])
}
dats2
$df1
x y
1 11 id1
2 12 id1
3 13 id1
$df2
x y
1 11 id2
2 12 id2
3 13 id2