Combine multiple dataframe of different columns [duplicate] - r

This question already has answers here:
Combine two data frames by rows (rbind) when they have different sets of columns
(14 answers)
Closed 2 years ago.
Can we combine the rows of multiple data frames that have different columns? Example below:
> asd1 <- data.frame(a = c("a","b"), b = c("fd", "fg"))
> asd1
a b
1 a fd
2 b fg
> asd2 <- data.frame(a = c("a","b"), e = c("fd", "fg"), c = c("gfd","asd"))
> asd2
a e c
1 a fd gfd
2 b fg asd
Newdf <- rbind(asd1, asd2)
Error in rbind(deparse.level, ...) :
numbers of columns of arguments do not match
Right now this raises an error because the data frames have different columns.
Expected output
newdf
data a b e c
asd1 a fd NA NA
asd1 b fg NA NA
asd2 a NA fd gfd
asd2 b NA fg asd
Is the above output possible?

I would suggest using bind_rows() from dplyr:
library(dplyr)
#Data 1
asd1 <- data.frame(a = c("a","b"), b = c("fd", "fg"))
#Data 2
asd2 <- data.frame(a = c("a","b"), e = c("fd", "fg"), c = c("gfd","asd"))
#Bind
df <- bind_rows(asd1,asd2)
Output:
a b e c
1 a fd <NA> <NA>
2 b fg <NA> <NA>
3 a <NA> fd gfd
4 b <NA> fg asd

library(dplyr)
bind_rows(asd1, asd2, .id = "data")
# data a b e c
# 1 1 a fd <NA> <NA>
# 2 1 b fg <NA> <NA>
# 3 2 a <NA> fd gfd
# 4 2 b <NA> fg asd

Related

make a join of all the data frames inside a list in R [duplicate]

This question already has answers here:
Simultaneously merge multiple data.frames in a list
(9 answers)
Closed 6 months ago.
I have a list of 3 data frames. The 3 data frames have a variable in common and I would like to perform a full outer join of all three. I know I can iterate over the elements of the list, but is there another way of doing this?
I guess you can try Reduce to merge all the data.frames iteratively, e.g.,
Reduce(function(x, y) merge(x, y, all = TRUE), list(df1, df2, df3))
If you only have 3 data.frames, I'd recommend joining them manually. Here's a minimal reproducible example:
# 3 data.frames
df1 <- data.frame(a=c(1:3), b=letters[1:3])
df2 <- data.frame(c=c(2,3,4), d=c(letters[1:3]))
df3 <- data.frame(e=c(5:7), f=c(letters[1:3]))
df1
# a b
# 1 1 a
# 2 2 b
# 3 3 c
df2
# c d
# 1 2 a
# 2 3 b
# 3 4 c
df3
# e f
# 1 5 a
# 2 6 b
# 3 7 c
Now full_join them:
library(tidyverse)
df1 %>%
full_join(df2, by = c("a"="c")) %>%
full_join(df3, by = c("a"="e"))
# a b d f
# 1 1 a <NA> <NA>
# 2 2 b a <NA>
# 3 3 c b <NA>
# 4 4 <NA> c <NA>
# 5 5 <NA> <NA> a
# 6 6 <NA> <NA> b
# 7 7 <NA> <NA> c
Note: since you mention the data.frames are inside a list, here's how you could access them:
df_list <- list(df1, df2, df3)
df_list[[1]] %>%
full_join(df_list[[2]], by = c("a"="c")) %>%
full_join(df_list[[3]], by = c("a"="e"))
# gives same result as above
You can also use purrr's reduce for this, e.g.
library(purrr)
library(dplyr)
purrr::reduce(df_list, full_join, by = "a")
Data:
df1 <- data.frame(a=c(1:3), b=letters[1:3])
df2 <- data.frame(a=c(2,3,4), d=c(letters[1:3]))
df3 <- data.frame(a=c(5:7), f=c(letters[1:3]))
df_list <- list(df1, df2, df3)
Output:
a b d f
1 1 a <NA> <NA>
2 2 b a <NA>
3 3 c b <NA>
4 4 <NA> c <NA>
5 5 <NA> <NA> a
6 6 <NA> <NA> b
7 7 <NA> <NA> c

Split variable from comma into an ordered dataframe

I have a dataframe like this, where the values are separated by comma.
# Events
# A,B,C
# C,D
# B,A
# D,B,A,E
# A,E,B
I would like to obtain the following data frame:
# Event1 Event2 Event3 Event4 Event5
# A B C NA NA
# NA NA C D NA
# A B NA NA NA
# A B NA D E
# A B NA NA E
I have tried cSplit, but I don't get the desired data frame. Is this possible?
NOTE: In the input, the values do not appear in the same position as their corresponding Event column in the second data frame.
1) Here is a base R solution. Split each row, giving the list s, and create cols, which contains the possible values. Then iterate over s and convert the result to a data frame.
Note that this does not hard code the column names and continues to work even if some column names are substrings of other column names.
s <- strsplit(DF$Events, ",")
cols <- unique(sort(unlist(s)))
data.frame(Event = t(sapply(s, function(x) ifelse(cols %in% x, cols, NA))))
giving:
Event.1 Event.2 Event.3 Event.4 Event.5
1 A B C <NA> <NA>
2 <NA> <NA> C D <NA>
3 A B <NA> <NA> <NA>
4 A B <NA> D E
5 A B <NA> <NA> E
2) This base R solution uses strsplit as above, then names the components (since stack requires a named list) and invokes stack. We then expand the result into wide form using tapply, convert it to a data frame, and fix up the names.
s <- strsplit(DF$Events, ",")
names(s) <- seq_along(s)
stk <- stack(s)
mat <- t(tapply(stk$values, stk, c))
colnames(mat) <- NULL
data.frame(Event = mat)
giving:
Event.1 Event.2 Event.3 Event.4 Event.5
1 A B C <NA> <NA>
2 <NA> <NA> C D <NA>
3 A B <NA> <NA> <NA>
4 A B <NA> D E
5 A B <NA> <NA> E
This could also be represented as an R 4.2+ pipeline:
DF |>
with(setNames(Events, seq_along(Events))) |>
strsplit(",") |>
stack() |>
with(tapply(values, data.frame(ind, values), c)) |>
`colnames<-`(NULL) |>
data.frame(Event = _)
Note
The input in reproducible form:
Lines <- "Events
A,B,C
C,D
B,A
D,B,A,E
A,E,B"
DF <- read.table(text = Lines, header = TRUE, strip.white = TRUE)
Another approach using tidyverse:
library(dplyr)
library(purrr)
library(stringr)
Events = c("A,B,C", 'C,D', "B,A", "D,B,A,E", "A,E,B")
letters <- Events %>% str_split(",") %>% unlist() %>% unique()
df <- data.frame(Events)
df %>%
map2_dfc(.y = letters, ~ ifelse(str_detect(.x, .y), .y, NA)) %>%
set_names(nm = paste0("Events", 1:length(letters)))
#> # A tibble: 5 × 5
#> Events1 Events2 Events3 Events4 Events5
#> <chr> <chr> <chr> <chr> <chr>
#> 1 A B C <NA> <NA>
#> 2 <NA> <NA> C D <NA>
#> 3 A B <NA> <NA> <NA>
#> 4 A B <NA> D E
#> 5 A B <NA> <NA> E
Created on 2022-07-11 by the reprex package (v2.0.1)
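This tidyverse solution is easily the most economical in terms of the amount of code used. Note, however, that it keeps the values in their original order instead of aligning each value to a fixed column, so its output differs from the desired data frame: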
library(tidyverse)
data.frame(Events) %>%
# split the strings by the comma:
mutate(Events = str_split(Events, ",")) %>%
# unnest splitted values wider into columns:
unnest_wider(Events, names_sep = "")
# A tibble: 5 × 4
Events1 Events2 Events3 Events4
<chr> <chr> <chr> <chr>
1 A B C NA
2 C D NA NA
3 B A NA NA
4 D B A E
5 A E B NA
Data:
Events = c("A,B,C", 'C,D', "B,A", "D,B,A,E", "A,E,B")
We can try the following base R code
> d <- t(table(stack(setNames(strsplit(df$Events, ","), 1:nrow(df)))))
> as.data.frame.matrix(`dim<-`(colnames(d)[ifelse(d > 0, d * col(d), NA)], dim(d)))
V1 V2 V3 V4 V5
1 A B C <NA> <NA>
2 <NA> <NA> C D <NA>
3 A B <NA> <NA> <NA>
4 A B <NA> D E
5 A B <NA> <NA> E

Conditionally copy contents of one column to another [duplicate]

This question already has answers here:
How to join (merge) data frames (inner, outer, left, right)
(13 answers)
Closed 2 years ago.
I want to add extra columns depending on the values of code that are listed in VAR:
DF <- data.frame(id = c(1:5), code = c("A","B","C","D","E"), sub = c("A1","B1","C1","D1","E1"))
id code sub
1 1 A A1
2 2 B B1
3 3 C C1
4 4 D D1
5 5 E E1
VAR <- c("A","B")
How the result should look:
id code sub AB ABsub
1 1 A A1 A A1
2 2 B B1 B B1
3 3 C C1 <NA> <NA>
4 4 D D1 <NA> <NA>
5 5 E E1 <NA> <NA>
Or using dplyr:
library(dplyr)
DF<-data.frame(id=c(1:5),code=c("A","B","C","D","E"),sub=c("A1","B1","C1","D1","E1"), stringsAsFactors = FALSE)
VAR<-c("A","B")
DF <- DF %>%
mutate(AB = ifelse(code %in% {{VAR}}, code, NA_character_)) %>%
mutate(ABsub = ifelse(code == AB, sub, NA_character_))
with:
> DF
id code sub AB ABsub
1 1 A A1 A A1
2 2 B B1 B B1
3 3 C C1 <NA> <NA>
4 4 D D1 <NA> <NA>
5 5 E E1 <NA> <NA>
This also works if VAR equals c("A", "B", "C"), but we do not know whether that is what you are after.
A simple base R option using merge + subset
merge(DF,subset(DF,code %in% VAR),by = "id",all = TRUE)
such that
> merge(DF,subset(DF,code %in% VAR),by = "id",all = TRUE)
id code.x sub.x code.y sub.y
1 1 A A1 A A1
2 2 B B1 B B1
3 3 C C1 <NA> <NA>
4 4 D D1 <NA> <NA>
5 5 E E1 <NA> <NA>
A dplyr solution with across():
library(dplyr)
DF %>%
mutate(across(-id, ~ replace(.x, !(code %in% VAR), NA), .names = "AB{col}"))
# id code sub ABcode ABsub
# 1 1 A A1 A A1
# 2 2 B B1 B B1
# 3 3 C C1 <NA> <NA>
# 4 4 D D1 <NA> <NA>
# 5 5 E E1 <NA> <NA>
or with left_join():
DF %>%
filter(code %in% VAR) %>%
left_join(DF, ., by = "id", suffix = c("", "AB"))
# id code sub codeAB subAB
# 1 1 A A1 A A1
# 2 2 B B1 B B1
# 3 3 C C1 <NA> <NA>
# 4 4 D D1 <NA> <NA>
# 5 5 E E1 <NA> <NA>
Note: If you have multiple columns in your real data, you don't need to type
mutate(Col1 = ifelse(...), Col2 = ifelse(...), etc.)
one by one.
Here's a solution
ABsub <- ifelse(DF$code %in% VAR, DF$code, NA)
cbind(DF, ABsub)

How to add rows from one data frame to another, but only for a selected column

How can I add the rows of a certain column in one data frame to a column of another data frame? See the example below.
> DF1
A B C
1 3 axe aa
2 6 base bb
3 9 lol cc
> DF2
D E
1 x ss
2 y dd
3 z vv
I want to append the rows of column E of DF2 to column C of DF1. The other columns should be NA in the appended rows.
> DF3
A B C
1 3 axe aa
2 6 base bb
3 9 lol cc
4 NA NA ss
5 NA NA dd
6 NA NA vv
You can rename E to C and then bind the rows. I prefer bind_rows, which fills the columns missing from DF2 with NA:
> library(dplyr)
> names(DF2)[2] <- "C"
> DF1 <- bind_rows(DF1, select(DF2, C))
> DF1
A B C
1 3 axe aa
2 6 base bb
3 9 lol cc
4 NA <NA> ss
5 NA <NA> dd
6 NA <NA> vv
Another approach:
> DF1 %>%
+ bind_rows(DF2) %>%
+ mutate(C = ifelse(is.na(C), E, C)) %>%
+ select(A:C)
A B C
1 3 axe aa
2 6 base bb
3 9 lol cc
4 NA <NA> ss
5 NA <NA> dd
6 NA <NA> vv
Use rbind from base R:
DF3 <- rbind(DF1, data.frame(A = NA, B = NA, C = DF2$E))

data.table shift right all cell values by number of na within each row [R]

How do I shift the cells in a data table TO THE RIGHT by the number of NA in each row in R?
Example Data:
data <- data.table(c1=c("a","e","h","j"),
c2=c("b","f","i",NA),
c3=c("c","g",NA,NA),
c4=c("d",NA,NA,NA), stringsAsFactors = F)
c1 c2 c3 c4
1 a b c d
2 e f g <NA>
3 h i <NA> <NA>
4 j <NA> <NA> <NA>
Desired Data from example:
data.desired <- data.table(
c1=c("a",NA,NA,NA),
c2=c("b","e",NA,NA),
c3=c("c","f","h",NA),
c4=c("d","g","i","j"), stringsAsFactors = F)
c1 c2 c3 c4
1 a b c d
2 <NA> e f g
3 <NA> <NA> h i
4 <NA> <NA> <NA> j
Here's one attempt using matrix indexing and a counter of NA values by row:
#convert back to a data.frame to take advantage of matrix indexing
setDF(data)
arr <- which(!is.na(data), arr.ind=TRUE)
arr[,"col"] <- arr[,"col"] + rowSums(is.na(data))[arr[,"row"]]
out <- data
out[] <- NA
out[arr] <- data[!is.na(data)]
out
# c1 c2 c3 c4
#1 a b c d
#2 <NA> e f g
#3 <NA> <NA> h i
#4 <NA> <NA> <NA> j
#convert to data.table if necessary
setDT(out)
This option is pretty quick and from a brief test churns through 4 columns / 2 million rows in about 3-4 seconds.
We can use
data.table(t(apply(data, 1, function(x){ c(rep(NA, sum(is.na(x))), x[!is.na(x)])})))
# V1 V2 V3 V4
# 1: a b c d
# 2: <NA> e f g
# 3: <NA> <NA> h i
# 4: <NA> <NA> <NA> j

Resources