Add 2 dataframe with dfifferent lengths in R - r

I have the above 2 dataframes in R,
df1 = [a,2 df2 = [a,10
b,3] c,2]
I want to add those 2 df, so the output can be
df = [a, 12,
b, 3,
c, 2]
Any advice would be much appreciated, thanks!

We can rbind the two datasets and do a group by sum
aggregate(col2 ~ col1, rbind(df1, df2), sum)
-output
# col1 col2
#1 a 12
#2 b 3
#3 c 2
Or in dplyr
library(dplyr)
bind_rows(df1, df2) %>%
group_by(col1) %>%
summarise(col2 = sum(col2), .groups = 'drop')
-output
# A tibble: 3 x 2
# col1 col2
# <chr> <dbl>
#1 a 12
#2 b 3
#3 c 2
data
df2 <- data.frame(col1 = c('a', 'c'), col2 = c(10, 2))
df1 <- data.frame(col1 = c('a', 'b'), col2 = c(2, 3))

Related

Merge/combine Row based on unique value in R

My data :
I want the output this way :
I already played around with aggregate, merge and group_by functions but the output does not come out the way I want.
One way, using tidyr::separate:
d %>%
group_by(V1) %>%
summarise(V2 = toString(V2)) %>%
separate(V2, into = c("V2", "V1"))
# A tibble: 2 x 2
V2 V1
<chr> <chr>
1 A C
2 B D
You could do:
library(tidyverse)
df <- data.frame(V1 = c(1,2,1,2),
V2 = LETTERS[1:4])
df %>%
mutate(id = rep(1:2, each = 2)) %>%
pivot_wider(names_from = V1,
names_prefix = 'V',
values_from = V2) %>%
select(-id)
Which gives:
# A tibble: 2 x 2
V1 V2
<chr> <chr>
1 A B
2 C D
How about -
library(dplyr)
df <- tibble(V1 = rep(1:2, 2), V2 = LETTERS[1:4])
df %>%
left_join(df, by = "V1") %>%
filter(V2.x != V2.y & V2.x %in% c("A", "B")) %>%
select(V1 = V2.y, V2 = V2.x)
# A tibble: 2 x 2
V1 V2
<chr> <chr>
1 C A
2 D B

Replacing NA with 0 in columns that contain a substring in the column name

Let's say I had a data frame and I wanted to replace NA's with 0's only in columns that contained a specific string in the column name.
Let's call the dataframe df, the columns are numeric type, and the string "keyword".
i.e., a column name would be, "Column1keyword", etc.
How can I do this?
I've tried this and it didn't work:
df %>%
mutate(across(where(~ colnames.is.numeric(.x) && 'keyword' %in% colnames.x), replace_na, 0))
Update
If it is to select columns having 'keyword' as substring in the column names, use contains to select across those columns
library(dplyr)
library(tidyr)
df1 <- df1 %>%
mutate(across(contains('keyword'), replace_na, 0))
-output
df1
# A tibble: 5 × 4
col1 col2_keyword col3 col4
<int> <chr> <chr> <dbl>
1 1 a a 1
2 2 b b 3
3 3 0 c NA
4 4 c d 5
5 5 d <NA> 6
Assuming that the OP mentioned to replace NA only in columns that have a specific element 'keyword', use where with a logical expression to select the columns that have the 'keyword', loop across those columns and use replace_na to replace the NA to 0
df <- df %>%
mutate(across(where(~ is.character(.x) && 'keyword' %in% .x), replace_na, 0))
-output
df
# A tibble: 5 × 4
col1 col2 col3 col4
<int> <chr> <chr> <dbl>
1 1 a a 1
2 2 b b 3
3 3 keyword c NA
4 4 0 d 5
5 5 c <NA> 6
data
df <- tibble(col1 = 1:5, col2 = c("a", "b", "keyword", NA, 'c'),
col3 = c('a', 'b', 'c', 'd', NA), col4 = c(1, 3, NA, 5, 6))
df1 <- tibble(col1 = 1:5, col2_keyword = c("a", "b", NA, 'c', 'd'),
col3 =c('a', 'b', 'c', 'd', NA), col4 = c(1, 3, NA, 5, 6))

Join of column values for specific row values

I'd like to join (left_join) a tibble (df2) to another one (df1) only where the value of col2 in df1 is NA. I am currently using a code that is not very elegant. Any advice on how to shorten the code would be greatly appreciated!
library(tidyverse)
# df1 contains NAs that need to be replaced by values from df2, for relevant col1 values
df1 <- tibble(col1 = c("a", "b", "c", "d"), col2 = c(1, 2, NA, NA), col3 = c(10, 20, 30, 40))
df2 <- tibble(col1 = c("a", "b", "c", "d"), col2 = c(5, 6, 7, 8), col3 = c(50, 60, 70, 80))
# my current approach
df3 <- df1 %>%
filter(!is.na(col2))
df4 <- df1 %>%
filter(is.na(col2)) %>%
select(col1)%>%
left_join(df2)
# output tibble that is expected
df_final <- df3 %>%
bind_rows(df4)
Here's a small dplyr answer that works for me, although it might get slow if you have tons of rows:
df1 %>%
filter(is.na(col2)) %>%
select(col1) %>%
left_join(df2, by = "col1") %>%
bind_rows(df1, .) %>%
filter(!is.na(col2))
We can use data.table methods
library(data.table)
setDT(df1)[setDT(df2), col2 := fcoalesce(col2, i.col2), on = .(col1)]
-output
> df1
col1 col2 col3
1: a 1 10
2: b 2 20
3: c 7 30
4: d 8 40
Or an option with tidyverse
library(dplyr)
library(stringr)
df1 %>%
left_join(df2, by = c("col1")) %>%
transmute(col1, across(ends_with(".x"),
~ coalesce(., get(str_replace(cur_column(), ".x", ".y"))),
.names = "{str_remove(.col, '.x')}"))
-output
# A tibble: 4 x 3
col1 col2 col3
<chr> <dbl> <dbl>
1 a 1 10
2 b 2 20
3 c 7 30
4 d 8 40

Stack columns in a data frame

Can someone help me stack the following data frame so that the as are on top of each other and also the 1s and 2s, preferably using a pipe and form a 3x4 dataframe
df <- rbind(data.frame(X1 = 'a', X2 = 1, X3 = 2, X4 = 'a', X5 = 1, X6 = 2), data.frame(X1 = 'a', X2 = 1, X3 = 2, X4 = 'a', X5 = 1, X6 = 2))
Thank you
Here is a data.table solution...
library(data.table)
cols <- 3
# Split df to chuncks of 3 (=ncol) columns
L <- split.default(df, f = cols:(ncol(df) + 2) %/% cols)
# Rowbind, ignore columns names
data.table::rbindlist(L, use.names = FALSE)
# X1 X2 X3
# 1: a 1 2
# 2: a 1 2
# 3: a 1 2
# 4: a 1 2
Using tidyverse -
library(dplyr)
library(tidyr)
df %>%
mutate(across(.fns = as.character)) %>%
pivot_longer(cols = everything()) %>%
mutate(id = paste0('col', rep(1:3, length.out = n()))) %>%
group_by(id) %>%
mutate(name = row_number()) %>%
pivot_wider(names_from = id, values_from = value) %>%
select(-name)
# col1 col2 col3
# <chr> <chr> <chr>
#1 a 1 2
#2 a 1 2
#3 a 1 2
#4 a 1 2

Merge data frames and divide rows by group

I would like to divide the values from df1 over the values from df2. In this reproducible example, I am able to sum these values. What about the division? Thanks in advance!
df1 <- data.frame(country = c("a", "b", "c"), year1 = c(1, 2, 3), year2 = c(1, 2, 3))
df2 <- data.frame(country = c("a", "b", "d"), year1 = c(1, 2, NA), year2 = c(1, 2, 3))
df3 <- bind_rows(df1, df2) %>%
mutate_if(is.numeric, tidyr::replace_na, 0) %>%
group_by(country) %>%
summarise_all(., sum, na.rm = TRUE) %>%
na_if(., 0)
Expected result is:
# A tibble: 4 x 3
country year1 year2
<chr> <dbl> <dbl>
1 a 1 1
2 b 1 1
3 c NA NA
4 d NA NA
As there are groups with 2 rows and some with 1, use an if/else condition within summarise/across to divide the first element by the last if there are two elements or else return NA
library(dplyr) # version 1.0.4
library(tidyr)
bind_rows(df1, df2) %>%
mutate(across(where(is.numeric), replace_na, 0)) %>%
group_by(country) %>%
summarise(across(everything(), ~ if(n() == 2) first(.)/last(.)
else NA_real_))
-output
# A tibble: 4 x 3
# country year1 year2
#* <chr> <dbl> <dbl>
#1 a 1 1
#2 b 1 1
#3 c NA NA
#4 d NA NA
Here is a base R option using merge + split.default
df <- merge(df1, df2, by = "country", all = TRUE)
cbind(
df[1],
list2DF(lapply(
split.default(df[-1], gsub("\\.(x|y)", "", names(df)[-1])),
function(v) do.call("/", v)
))
)
which gives
country year1 year2
1 a 1 1
2 b 1 1
3 c NA NA
4 d NA NA

Resources