I have 2 dataframes in R (df1, df2).
A C D
1 1 1
2 2 2
df2 as
A B C
1 1 1
2 2 2
How can I merge these 2 dataframes to produce the following output?
A B C D
2 1 2 1
4 2 4 2
Columns are sorted and column values are added. Both DFs have same number of rows. Thank you in advance.
Code to create DF:
df1 <- data.frame("A" = 1:2, "C" = 1:2, "D" = 1:2)
df2 <- data.frame("A" = 1:2, "B" = 1:2, "C" = 1:2)
nm1 = names(df1)
nm2 = names(df2)
nm = intersect(nm1, nm2)
if (length(nm) == 0){ # if no column names in common
cbind(df1, df2)
} else { # if column names in common
cbind(df1[!nm1 %in% nm2], # columns only in df1
df1[nm] + df2[nm], # add columns common to both
df2[!nm2 %in% nm1]) # columns only in df2
}
# D A C B
#1 1 2 2 1
#2 2 4 4 2
You can try:
library(tidyverse)
list(df2, df1) %>%
map(rownames_to_column) %>%
bind_rows %>%
group_by(rowname) %>%
summarise_all(sum, na.rm = TRUE)
# A tibble: 2 x 5
rowname A B C D
<chr> <int> <int> <int> <int>
1 1 2 1 2 1
2 2 4 2 4 2
By using left_join() from dplyr you won't lose the column
library(tidyverse)
dat1 <- tibble(a = 1:10,
b = 1:10,
c = 1:10)
dat2 <- tibble(c = 1:10,
d = 1:10,
e = 1:10)
left_join(dat1, dat2, by = "c")
#> # A tibble: 10 x 5
#> a b c d e
#> <int> <int> <int> <int> <int>
#> 1 1 1 1 1 1
#> 2 2 2 2 2 2
#> 3 3 3 3 3 3
#> 4 4 4 4 4 4
#> 5 5 5 5 5 5
#> 6 6 6 6 6 6
#> 7 7 7 7 7 7
#> 8 8 8 8 8 8
#> 9 9 9 9 9 9
#> 10 10 10 10 10 10
Created on 2019-01-16 by the reprex package (v0.2.1)
allnames <- sort(unique(c(names(df1), names(df2))))
df3 <- data.frame(matrix(0, nrow = nrow(df1), ncol = length(allnames)))
names(df3) <- allnames
df3[,allnames %in% names(df1)] <- df3[,allnames %in% names(df1)] + df1
df3[,allnames %in% names(df2)] <- df3[,allnames %in% names(df2)] + df2
df3
A B C D
1 2 1 2 1
2 4 2 4 2
Here is a fun base R method with Reduce.
Reduce(cbind,
list(Reduce("+", list(df1[intersect(names(df1), names(df2))],
df2[intersect(names(df1), names(df2))])), # sum results
df1[setdiff(names(df1), names(df2))], # in df1, not df2
df2[setdiff(names(df2), names(df1))])) # in df2, not df1
This returns
A C D B
1 2 2 1 1
2 4 4 2 2
This assumes that both df1 and df2 have columns that are not present in the other. If this is not true, you'd have to adjust the list.
Note also that you could replace Reduce with do.call in both places and you'd get the same result.
Related
I am having different files that have a variable that is named differently but has the same string character “type_category” e.g., type_category_lifestyle_characterstics, type_category_uniqueness etc. The idea is to go through these files and rename such variables to type_category. Below are examples of data frames
df1 <- data.frame(id = c(1,2,3), type_category_lifestyle_characterstics = c(5,6,7), rating = c(1,3,4))
df2 <- data.frame(id = c(9,5,3), type_category_uniqueness = c(4,6,1), rating = c(2,7,4))
Thanks in advance
We can get the datasets in a list
library(dplyr)
library(purrr)
out <- map(mget(ls(pattern = '^df\\d+$')), ~ .x %>%
rename_with(~ "type_category",
starts_with("type_category")))
-output
out
$df1
id type_category rating
1 1 5 1
2 2 6 3
3 3 7 4
$df2
id type_category rating
1 9 4 2
2 5 6 7
3 3 1 4
We could use setNames with lapply:
my_list <- list(df1, df2)
colnames <- c("id","type_category","rating")
lapply(my_list, setNames, colnames)
output:
[[1]]
id type_category rating
1 1 5 1
2 2 6 3
3 3 7 4
[[2]]
id type_category rating
1 9 4 2
2 5 6 7
3 3 1 4
Base R
Once you got them in a list, you can use lapply to change the variable names in all of them
df1 <- data.frame(id = c(1,2,3), type_category_lifestyle_characterstics = c(5,6,7), rating = c(1,3,4))
df2 <- data.frame(id = c(9,5,3), type_category_uniqueness = c(4,6,1), rating = c(2,7,4))
lapply(list(df1, df2),
function(df){
nms <- names(df)
nms[grepl(pattern = "type_category",
x = nms,
ignore.case = TRUE)] <- "type_category"
names(df) <- nms
return(df)
})
#> [[1]]
#> id type_category rating
#> 1 1 5 1
#> 2 2 6 3
#> 3 3 7 4
#>
#> [[2]]
#> id type_category rating
#> 1 9 4 2
#> 2 5 6 7
#> 3 3 1 4
Just note that you would need to assign the result back to a list.
data.table
Since you tagged data.table, this allows you to change the names in place and no extra assignment is necessary
library(data.table)
dt1 <- data.table::data.table(id = c(1,2,3), type_category_lifestyle_characterstics = c(5,6,7), rating = c(1,3,4))
dt2 <- data.table::data.table(id = c(9,5,3), type_category_uniqueness = c(4,6,1), rating = c(2,7,4))
invisible(
lapply(list(dt1, dt2),
function(dt){
nms_old <- names(data.table::copy(dt))
nms_new <- data.table::copy(nms_old)
nms_new[grepl(pattern = "type_category",
x = nms_old,
ignore.case = TRUE)] <- "type_category"
data.table::setnames(dt, old = nms_old, new = nms_new)
return(NULL)
})
)
dt1
#> id type_category rating
#> 1: 1 5 1
#> 2: 2 6 3
#> 3: 3 7 4
dt2
#> id type_category rating
#> 1: 9 4 2
#> 2: 5 6 7
#> 3: 3 1 4
In the example below how can I calculate the row mean when column A is NA? The row mean would replace the NA in column A. Using base R, I can use this:
foo <- tibble(A = c(3,5,NA,6,NA,7,NA),
B = c(4,5,4,5,6,4,NA),
C = c(6,5,2,8,8,5,NA))
foo
tmp <- rowMeans(foo[,-1],na.rm = TRUE)
foo$A[is.na(foo$A)] <- tmp[is.na(foo$A)]
foo$A[is.nan(foo$A)] <- NA
Curious how I might do this with dplyR?
You can use ifelse :
library(dplyr)
foo %>%
mutate(A = ifelse(is.na(A), rowMeans(., na.rm = TRUE), A),
A = replace(A, is.nan(A), NA))
# A B C
# <dbl> <dbl> <dbl>
#1 3 4 6
#2 5 5 5
#3 3 4 2
#4 6 5 8
#5 7 6 8
#6 7 4 5
#7 NA NA NA
Here is a solution that not only replace NA in column A, but for all columns in the data frame.
library(dplyr)
foo2 <- foo %>%
mutate(RowMean = rowMeans(., na.rm = TRUE)) %>%
mutate(across(-RowMean, .fns =
function(x) ifelse(is.na(x) & !is.nan(RowMean), RowMean, x))) %>%
select(-RowMean)
Use coalesce:
foo %>%
mutate(m = rowMeans(across(), na.rm = T),
A = if_else(is.na(A) & !is.na(m), m, A)) %>%
select(-m)
# # A tibble: 7 x 3
# A B C
# <dbl> <dbl> <dbl>
# 1 3 4 6
# 2 5 5 5
# 3 3 4 2
# 4 6 5 8
# 5 7 6 8
# 6 7 4 5
# 7 NA NA NA
Consider the following named vector vec and tibble df:
vec <- c("1" = "a", "2" = "b", "3" = "c")
df <- tibble(col = rep(1:3, c(4, 2, 5)))
df
# # A tibble: 11 x 1
# col
# <int>
# 1 1
# 2 1
# 3 1
# 4 1
# 5 2
# 6 2
# 7 3
# 8 3
# 9 3
# 10 3
# 11 3
I would like to replace the values in the col column with the corresponding named values in vec.
I'm looking for a tidyverse approach, that doesn't involve converting vec as a tibble.
I tried the following, without success:
df %>%
mutate(col = map(
vec,
~ str_replace(col, names(.x), .x)
))
Expected output:
# A tibble: 11 x 1
col
<chr>
1 a
2 a
3 a
4 a
5 b
6 b
7 c
8 c
9 c
10 c
11 c
You could use col :
df$col1 <- vec[as.character(df$col)]
Or in mutate :
library(dplyr)
df %>% mutate(col1 = vec[as.character(col)])
# col col1
# <int> <chr>
# 1 1 a
# 2 1 a
# 3 1 a
# 4 1 a
# 5 2 b
# 6 2 b
# 7 3 c
# 8 3 c
# 9 3 c
#10 3 c
#11 3 c
We can also use data.table
library(data.table)
setDT(df)[, col1 := vec[as.character(col)]]
I want to remove columns that have only a unique value.
First, I try it for a single column and it works:
data %/%
select_if(length(unique(data$policy_id)) > 1)
then I try it for multiple columns as below:
data %/%
select_if(length(unique(data[, c("policy_date", "policy_id"])) > 1)
but it does not work. I think it is a conceptual mistake due to my lack of experience.
thanks in advance
You can use select(where()).
Suppose I have a data frame like this:
df <- data.frame(A = LETTERS[1:5], B = 1:5, C = 2)
df
#> A B C
#> 1 A 1 2
#> 2 B 2 2
#> 3 C 3 2
#> 4 D 4 2
#> 5 E 5 2
Then I can do:
df %>% select(where(~ n_distinct(.) > 1))
#> A B
#> 1 A 1
#> 2 B 2
#> 3 C 3
#> 4 D 4
#> 5 E 5
Some base R options:
Using lengths + unique + sapply
subset(df,select = lengths(sapply(df,unique))>1)
Using Filter + length + unique
Filter(function(x) length(unique(x))>1,df)
Does this work:
> df <- data.frame(col1 = 1:10,
+ col2 = rep(10,10),
+ col3 = round(rnorm(10,1)))
> df
col1 col2 col3
1 1 10 1
2 2 10 0
3 3 10 1
4 4 10 1
5 5 10 1
6 6 10 0
7 7 10 2
8 8 10 1
9 9 10 1
10 10 10 1
> df %>% select_if(~length(unique(.)) > 1)
col1 col3
1 1 1
2 2 0
3 3 1
4 4 1
5 5 1
6 6 0
7 7 2
8 8 1
9 9 1
10 10 1
>
Another option would be to use purrr:
df %>% purrr::keep(~all(n_distinct(.) > 1))
df %>% purrr::keep(~all(length(unique(.)) > 1))
df %>% purrr::discard(~!all(n_distinct(.) > 1))
df %>% purrr::discard(~!all(length(unique(.)) > 1))
Mixing table with apply generates the same output.
df[, apply(df, 2, function(i) length(table(i)) > 1)]
df <- data.frame(A = LETTERS[1:5], B = 1:5, C = 2)
An option with base R
df[sapply(df, function(x) length(unique(x))) > 1]
data
df <- data.frame(A = LETTERS[1:5], B = 1:5, C = 2)
Here is my issue:
df1 <- data.frame(x = 1:5, y = 2:6, z = 3:7)
rownames(df1) <- LETTERS[1:5]
df1
x y z
A 1 2 3
B 2 3 4
C 3 4 5
D 4 5 6
E 5 6 7
df2 <- data.frame(x = 1:5, y = 2:6, z = 3:7)
rownames(df2) <- LETTERS[3:7]
df2
x y z
C 1 2 3
D 2 3 4
E 3 4 5
F 4 5 6
G 5 6 7
what I wanted is:
x y z
A 1 2 3
B 2 3 4
C 4 6 8
D 6 8 10
E 8 10 12
F 4 5 6
G 5 6 7
where duplicated rows were added up by same variable.
A solution with base R:
# create a new variable from the rownames
df1$rn <- rownames(df1)
df2$rn <- rownames(df2)
# bind the two dataframes together by row and aggregate
res <- aggregate(cbind(x,y,z) ~ rn, rbind(df1,df2), sum)
# or (thx to #alistaire for reminding me):
res <- aggregate(. ~ rn, rbind(df1,df2), sum)
# assign the rownames again
rownames(res) <- res$rn
# get rid of the 'rn' column
res <- res[, -1]
which gives:
> res
x y z
A 1 2 3
B 2 3 4
C 4 6 8
D 6 8 10
E 8 10 12
F 4 5 6
G 5 6 7
With dplyr,
library(dplyr)
# add rownames as a column in each data.frame and bind rows
bind_rows(df1 %>% add_rownames(),
df2 %>% add_rownames()) %>%
# evaluate following calls for each value in the rowname column
group_by(rowname) %>%
# add all non-grouping variables
summarise_all(sum)
## # A tibble: 7 x 4
## rowname x y z
## <chr> <int> <int> <int>
## 1 A 1 2 3
## 2 B 2 3 4
## 3 C 4 6 8
## 4 D 6 8 10
## 5 E 8 10 12
## 6 F 4 5 6
## 7 G 5 6 7
could also vectorize the operation turning the dfs to matrices:
result_df <- as.data.frame(as.matrix(df1) + as.matrix(df2))
This might need some teaking to get the rownames logic working on a longer example:
dfr <-rbind(df1,df2)
do.call(rbind, lapply( split(dfr, sapply(rownames(dfr),substr,1,1)), colSums))
x y z
A 1 2 3
B 2 3 4
C 4 6 8
D 6 8 10
E 8 10 12
F 4 5 6
G 5 6 7
If the rownames could all be assumed to be alpha characters a gsub solution should be easy.
An alternative is to melt the data and cast it. At first we set the row names to the last column of both data frames thanks to #Jaap
df1$rn <- rownames(df1)
df2$rn <- rownames(df2)
Then we melt the data based on the name
melt(list(df1, df2), id.vars = "rn")
Then we use dcast with mget function which is used to retrieve multiple variables at once.
mydf<- dcast(melt(mget(ls(pattern = "df\\d+")), id.vars = "rn"),
rn ~ variable, value.var = "value", fun.aggregate = sum)
rownames(mydf) <- mydf$rn
# get rid of the 'rn' column
mydf <- mydf[, -1]
> mydf
# x y z
#A 1 2 3
#B 2 3 4
#C 4 6 8
#D 6 8 10
#E 8 10 12
#F 4 5 6
#G 5 6 7