Create a null column with given name, if missing from one dataset - r

I have 5 data sets, each containing some columns. The data sets share column names, but not every column is present in every data set. Whenever a column name that appears in at least one data set is missing from another data set, I want to create a column of all zeros with that name in the data set that lacks it, so that all the data sets end up with the same number of columns (and the same column names).

Put the dataframes in a list, get all the unique column names present across the dataframes combined, and add each column that is absent from a dataframe, filled with 0.
# Union of the column names found across every data frame in list_df.
# lapply() always returns a list, whereas sapply() would return a matrix
# whenever all frames happen to have the same number of columns.
all_names <- unique(unlist(lapply(list_df, names)))
# For each data frame, add every column it lacks, filled with zeros.
lst1 <- lapply(list_df, function(x) {
  missing_cols <- setdiff(all_names, names(x))
  x[missing_cols] <- 0
  x
})
lst1
#[[1]]
# a b c
#1 1 6 0
#2 2 7 0
#3 3 8 0
#4 4 9 0
#5 5 10 0
#[[2]]
# a c b
#1 1 6 0
#2 2 7 0
#3 3 8 0
#4 4 9 0
#5 5 10 0
#[[3]]
# a c b
#1 1 6 11
#2 2 7 12
#3 3 8 13
#4 4 9 14
#5 5 10 15
If you need separate dataframes you can use lst1[[1]], lst1[[2]] individually again.
data
# Sample inputs: every frame has column `a`; df1 lacks `c` and df2 lacks `b`.
df1 <- data.frame(a = seq_len(5), b = seq.int(6L, 10L))
df2 <- data.frame(a = seq_len(5), c = seq.int(6L, 10L))
df3 <- data.frame(
  a = seq_len(5),
  c = seq.int(6L, 10L),
  b = seq.int(11L, 15L)
)
list_df <- list(df1, df2, df3)

We can use a for loop to do this
# Every column name that occurs in any element of lst1.
un1 <- Reduce(union, lapply(lst1, colnames))
# Walk the list and zero-fill whichever of those columns an element lacks.
for (i in seq_along(lst1)) {
  lst1[[i]][setdiff(un1, colnames(lst1[[i]]))] <- 0
}
data
# Reproducible input: a list of three 5-row data frames. The first already has
# a zero-filled `c` column and the second a zero-filled `b` column.
lst1 <- list(structure(list(a = 1:5, b = 6:10, c = c(0, 0, 0, 0, 0)),
row.names = c(NA,
-5L), class = "data.frame"), structure(list(a = 1:5, c = 6:10,
b = c(0, 0, 0, 0, 0)),
row.names = c(NA, -5L), class = "data.frame"),
structure(list(a = 1:5, c = 6:10, b = 11:15),
class = "data.frame", row.names = c(NA,
-5L)))

I would use dplyr's bind_rows, which automatically fills missing values with NA. If you include .id = "df_id" a column will be added connecting each row to the original dataframe:
library(dplyr)
# Stack the three data frames; .id = "df_id" adds a column tying each row to its source frame.
bind_rows(df1, df2, df3, .id = "df_id")
#### OUTPUT ####
df_id x y z
1 1 1 2 NA
2 2 3 NA 4
3 3 NA 5 6
If you want 0s instead of NAs, just run df[is.na(df)] <- 0. If you want a more informative df_id column, you can pass in a named list:
# With a named list, df_id holds the list names (df1, df2, df3) instead of positions.
bind_rows(list(df1 = df1, df2 = df2, df3 = df3), .id = "df_id")
#### OUTPUT ####
df_id x y z
1 df1 1 2 NA
2 df2 3 NA 4
3 df3 NA 5 6
If you want your dataframes separate then simply split by df_id, which generates a list of dataframes:
# Stack with a source-id column, then break the result back into one data frame per id.
df <- bind_rows(df1, df2, df3, .id = "df_id")
split(df, df$df_id)
#### OUTPUT ####
$`1`
df_id x y z
1 1 1 2 NA
$`2`
df_id x y z
2 2 3 NA 4
$`3`
df_id x y z
3 3 NA 5 6
Data:
# Single-row example frames; each one is missing exactly one of x, y, z.
df1 <- data.frame(x = 1, y = 2)
df2 <- data.frame(x = 3, z = 4)
df3 <- data.frame(y = 5, z = 6)

In addition to the previous answers, you can use the bind_rows function in order to quickly combine all your data frames, which will take care of differences in column names:
library(dplyr)
# Three example frames with only partially overlapping columns.
x <- data.frame(a = 1:3, b = 4:6)
y <- data.frame(a = 4:7)
z <- data.frame(c = 8:10)
# Stack them row-wise into a single frame.
xyz <- bind_rows(x, y, z)
# Show the stacked frame with every NA swapped for 0 (xyz itself is unchanged).
replace(xyz, is.na(xyz), 0)

Related

how to use tidyverse's mutate function to create multiple columns by a self defined function

I have a
# Example input; the `b` values match the a/b values tabulated in the question.
df <- data.frame(a = c(1, 2, 3), b = c(4, 5, 6))
I want to add two columns of the distance from mean of a:
a
b
diff_a
diff_b
1
4
-1
2
2
5
0
3
3
6
1
4
I don't want to write columns separately in mutate, as it will calculate mean multiple times(mean is example here, I actually have a functions takes a lot time). I want to use one function like
# Centre both vectors on the mean of `a`, computing that mean only once.
# Returns an unnamed list: (a - mean(a), b - mean(a)).
calculates <- function(a, b) {
  e_a <- mean(a)
  # return is a function in R, so `return list(...)` does not parse;
  # the last expression is the return value.
  list(a - e_a, b - e_a)
}
We may need
library(dplyr)
df %>%
  mutate(
    # mean of `a`, stored once as a temporary column
    Meana = mean(a),
    # subtract it from a and b, writing the results to diff_a and diff_b
    across(a:b, ~ .x - Meana, .names = "diff_{.col}"),
    # drop the temporary column again
    Meana = NULL
  )
-output
a b diff_a diff_b
1 1 4 -1 2
2 2 5 0 3
3 3 6 1 4
data
# Reproducible input: 3-row data frame with numeric columns a (1..3) and b (4..6).
df <- structure(list(a = c(1, 2, 3), b = c(4, 5, 6)),
class = "data.frame", row.names = c(NA,
-3L))
You may return a named list from the function and use cbind to add new columns to the dataframe.
df <- data.frame(a = c(1, 2, 3), b = c(4, 5, 6))
# Offsets of `a` and `b` from the mean of `a`, returned as a named list
# (diff_a, diff_b); the mean is evaluated a single time.
calculates <- function(a, b) {
  centre <- mean(a)
  lapply(list(diff_a = a, diff_b = b), function(v) v - centre)
}
# The list names become the names of the appended columns.
cbind(df, calculates(df$a, df$b))
# a b diff_a diff_b
#1 1 4 -1 2
#2 2 5 0 3
#3 3 6 1 4

Split df column of integers into individual digits in R

I have a df where one variable is an integer. I'd like to split this column into its individual digits. See my example below
Group Number
A 456
B 3
C 18
To
Group Number Digit1 Digit2 Digit3
A 456 4 5 6
B 3 3 NA NA
C 18 1 8 NA
We can use read.fwf from base R. Find the max number of character (nchar) in 'Number' column (mx). Read the 'Number' column after converting to character (as.character), specify the 'widths' as 1 by replicating 1 with mx and assign the output to new 'Digit' columns in the data
# Width (in characters) of the longest number.
mx <- max(nchar(df1$Number))
# Feed the numbers to read.fwf as fixed-width text, one character per field.
# read.fwf only closes connections it opened itself, and a textConnection is
# already open when created, so keep a handle and close it explicitly.
con <- textConnection(as.character(df1$Number))
df1[paste0("Digit", seq_len(mx))] <- read.fwf(con, widths = rep(1, mx))
close(con)
-output
df1
# Group Number Digit1 Digit2 Digit3
#1 A 456 4 5 6
#2 B 3 3 NA NA
#3 C 18 1 8 NA
data
# Reproducible input: character column Group and integer column Number.
df1 <- structure(list(Group = c("A", "B", "C"), Number = c(456L, 3L,
18L)), class = "data.frame", row.names = c(NA, -3L))
Another base R option (I think @akrun's approach using read.fwf is much simpler)
# Append one DigitN column per character position of df$Number to df.
cbind(
df,
with(
df,
# convert the character matrix built below to suitable column types
type.convert(
`colnames<-`(do.call(
rbind,
lapply(
# split every number into its individual characters
strsplit(as.character(Number), ""),
# pad each character vector with NA up to the width of the longest number
`length<-`, max(nchar(Number))
)
), paste0("Digit", seq(max(nchar(Number))))),
as.is = TRUE
)
)
)
which gives
Group Number Digit1 Digit2 Digit3
1 A 456 4 5 6
2 B 3 3 NA NA
3 C 18 1 8 NA
Using splitstackshape::cSplit
# Split Number on the empty separator into Number_1, Number_2, ...; the output below still shows the original Number column.
splitstackshape::cSplit(df, 'Number', sep = '', stripWhite = FALSE, drop = FALSE)
# Group Number Number_1 Number_2 Number_3
#1: A 456 4 5 6
#2: B 3 3 NA NA
#3: C 18 1 8 NA
Updated
I realized I could use the max function to get the character count limit in each row, so that I could include it in my map2 function and save some lines of code, thanks to an accident that led to an inspiration by dear @ThomasIsCoding.
library(dplyr)
library(tidyr)
library(purrr)
library(stringr)
df %>%
  rowwise() %>%
  # for each row, pull out the character at every position of Number
  mutate(
    map2_dfc(Number, seq_len(max(nchar(Number))), ~ str_sub(.x, .y, .y))
  ) %>%
  unnest(cols = !c(Group, Number)) %>%
  # rename the new columns: the "..." in their names becomes "Digit"
  rename_with(
    ~ str_replace(., "\\.\\.\\.", "Digit"),
    .cols = !c(Group, Number)
  ) %>%
  # as.numeric() has no na.rm argument, so nothing extra is forwarded
  # through across()
  mutate(across(!c(Group, Number), as.numeric))
# A tibble: 3 x 5
Group Number Digit1 Digit2 Digit3
<chr> <dbl> <dbl> <dbl> <dbl>
1 A 456 4 5 6
2 B 3 3 NA NA
3 C 18 1 8 NA
Data
# Example input built row by row: Group label and numeric Number.
df <- tribble(
~Group, ~Number,
"A", 456,
"B", 3,
"C", 18
)
Two base r methods:
# Width of the longest number; shorter numbers are NA-padded to this length.
no_cols <- max(nchar(as.character(df1$Number)))

# Using `strsplit()`:
cbind(
  df1,
  setNames(
    data.frame(do.call(
      rbind,
      lapply(strsplit(as.character(df1$Number), ""), `length<-`, no_cols)
    )),
    paste0("Digit", seq_len(no_cols))
  )
)

# Using `regmatches()` and `gregexpr()`:
cbind(
  df1,
  setNames(
    data.frame(do.call(
      rbind,
      lapply(
        regmatches(df1$Number, gregexpr("\\d", df1$Number)),
        `length<-`, no_cols
      )
    )),
    paste0("Digit", seq_len(no_cols))
  )
)

R: How to insert a row in Dataframe starting at a certain column?

I have the following data frame:
df <- tibble(x = 1:3, y = 3:1, z = 4:6, a = 6:4, b = 7:9)
I now need to extract the values from the second row, third to fifth column with this command:
newrow <- df[2,3:5]
I now want to insert a new row after the second row. The problem is that I need the new row to start at column 2. If I use the following code, the row will be added at the same column positions as I extracted it from:
df%>% add_row(newrow, .before = 3)
Hope anybody can help with this, any help is much appreciated.
Your newrow dataframe has the column names of columns 3:5 (z, a, b). Therefore add_row() matches newrow to these columns.
You need to rename the columns of newrow with the first three column names.
# add_row() matches by column name, so relabel newrow with the leading column
# names of df before inserting it ahead of row 3.
# NOTE(review): the question asks for the row to start at column 2; if that is
# the goal, index with seq_len(ncol(newrow)) + 1 instead. Confirm intent.
df %>%
  add_row(
    setNames(newrow, names(df)[seq_len(ncol(newrow))]),
    .before = 3
  )
I'm not sure exactly what your desired outcome is, but does this achieve what you want?
library(tibble)
library(dplyr)
# Copy the values at (whatrow, whatcolumns) into a new row placed ahead of
# row `beforerow`, shifted one column to the left.
df <- tibble::tibble(x = 1:3, y = 3:1, z = 4:6, a = 6:4, b = 7:9)
whatrow <- 2
whatcolumns <- 3:5
beforerow <- 3
newdf <-
slice(df, whatrow) %>%
select(all_of(whatcolumns)) %>%
# relabel with the names one position to the left (columns 2:4 here)
setNames(., names(df)[whatcolumns - 1]) %>%
add_row(df, ., .before = beforerow)
newdf
#> # A tibble: 4 x 5
#> x y z a b
#> <int> <int> <int> <int> <int>
#> 1 1 3 4 6 7
#> 2 2 2 5 5 8
#> 3 NA 5 5 8 NA
#> 4 3 1 6 4 9

Renaming the headers of every data frame in a list

I have a list containing a number of data frames, all with the same number of columns.
E.g, for a list df_list with two data frames, df1 and df2:
>df_list
df1
a b c
1 1 1
2 2 2
3 3 3
df2
a b c
3 2 1
3 2 1
3 2 1
I want to rename the headers of every data frame to new_headings <- c("A", "B", "C").
I constructed a for loop:
for (i in 1:length(list)) {
names(list[[i]]) <- new_headings
}
However, this doesn't work. The headings remain as they were. If I do it individually instead of in a loop, it works fine, however, e.g., names(list[[1]]) <- new_headings changes the headings appropriately.
My actual list is very long with many data frames. Can anyone explain why this isn't working or what other approach I can use? Thank you.
We can use Map with setNames
# Apply setNames to every data frame; wrapping new_headings in list() hands the whole vector to each call.
df_listNew <- Map(setNames, df_list, list(new_headings))
Or using lapply
# Returns a new list of renamed data frames; df_list itself is not modified.
lapply(df_list, setNames, new_headings)
#$df1
# A B C
#1 1 1 1
#2 2 2 2
#3 3 3 3
#$df2
# A B C
#1 3 2 1
#2 3 2 1
#3 3 2 1
data
# Reproducible input: named list of two 3-row data frames, each with columns a, b, c.
df_list <- list(df1 = structure(list(a = 1:3, b = 1:3, c = 1:3),
class = "data.frame", row.names = c(NA,
-3L)), df2 = structure(list(a = c(3, 3, 3), b = c(2, 2, 2), c = c(1,
1, 1)), class = "data.frame", row.names = c(NA, -3L)))
You can use two for loops
# Build two example data frames with columns a, b, c.
a <- c(1, 2, 3)
b <- c(1, 2, 3)
c <- c(1, 2, 3)
df1 <- as.data.frame(cbind(a, b, c))
a <- c(3, 2, 1)
b <- c(3, 2, 1)
c <- c(3, 2, 1)
df2 <- as.data.frame(cbind(a, b, c))
df_list <- list(df1, df2)
new_headings <- c("A", "B", "C")
# Outer loop: each data frame; inner loop: each of its columns.
# seq_along() yields an empty sequence for empty input, whereas
# 1:length(x) would iterate over c(1, 0).
for (i in seq_along(df_list)) {
  for (j in seq_along(df_list[[i]])) {
    colnames(df_list[[i]])[j] <- new_headings[j]
  }
}
df_list

Add (not merge!) two data frames with unequal rows and columns

I want to efficiently sum the entries of two data frames, though the data frames are not guaranteed to have the same dimensions or column names. Merge isn't really what I'm after here. Instead I want to create an output object with all of the row and column names that belong to either of the added data frames. In each position of that output, I want to use the following logic for the computed value:
If a row/column pairing belongs to both input data frames I want the output to include their sum
If a row/column pairing belongs to just one input data frame I want to include that value in the output
If a row/column pairing does not belong to any input matrix I want to have 0 in that position in the output.
As an example, consider the following input data frames:
# Inputs with partially overlapping row names (a) and column names (x).
df1 <- data.frame(
  x = c(1, 2, 3),
  y = c(4, 5, 6),
  row.names = c("a", "b", "c")
)
df2 <- data.frame(
  x = c(7, 8),
  z = c(9, 10),
  w = c(2, 3),
  row.names = c("a", "d")
)
> df1
x y
a 1 4
b 2 5
c 3 6
> df2
x z w
a 7 9 2
d 8 10 3
I want the final result to be
> df2
x y z w
a 8 4 9 2
b 2 5 0 0
c 3 6 0 0
d 8 0 10 3
What I've done so far -
bind_rows / bind_cols in dplyr can throw the following:
"Error: incompatible number of rows (3, expecting 2)"
I have duplicated column names, so 'merge' isn't working for my purposes either - returns an empty df for some reason.
Seems like you could merge on the rownames, then take care of the sums and conversion of NA to zero with some additional munging:
library(dplyr)
# Join the two frames on their row names, zero-fill the gaps, then add up the
# one column (x) that both inputs share. Uses current dplyr/tibble verbs in
# place of the deprecated add_rownames(), mutate_each() and funs().
df.new <- df1 %>%
  tibble::rownames_to_column("rowname") %>%
  full_join(tibble::rownames_to_column(df2, "rowname"), by = "rowname") %>%
  # NA marks a row/column pairing that was absent from one input
  mutate(across(everything(), ~ replace(.x, which(is.na(.x)), 0))) %>%
  # the shared column arrives as x.x (from df1) and x.y (from df2)
  mutate(x = x.x + x.y) %>%
  select(rowname, x, y, z, w)
Or, with @DavidArenburg's much more elegant and extensible solution:
# Full join on every shared column (rowname and x), then collapse the rows of
# each rowname by summing all remaining columns, ignoring NA. Uses current
# dplyr/tibble verbs in place of the deprecated add_rownames(),
# summarise_each() and funs().
df.new <- df1 %>%
  tibble::rownames_to_column("rowname") %>%
  full_join(tibble::rownames_to_column(df2, "rowname")) %>%
  group_by(rowname) %>%
  summarise(across(everything(), ~ sum(.x, na.rm = TRUE)))
df.new
rowname x y z w
1 a 8 4 9 2
2 b 2 5 0 0
3 c 3 6 0 0
4 d 8 0 10 3
This seems like some type of a simple merge on common column names (+ row names) and then a simple aggregation, this is how I would tackle this
library(data.table)
# Full outer join of the two tables on their shared columns, then collapse the
# rows of each `rn` by summing every remaining column with NA ignored.
merge(setDT(df1, keep.rownames = TRUE), # Convert to data.table + keep row names
setDT(df2, keep.rownames = TRUE), # Convert to data.table + keep row names
by = intersect(names(df1), names(df2)), # merge on common column names
all = TRUE)[, lapply(.SD, sum, na.rm = TRUE), by = rn] # Sum all columns by group
# rn x y z w
# 1: a 8 4 9 2
# 2: b 2 5 0 0
# 3: c 3 6 0 0
# 4: d 8 0 10 3
Here is a pretty straightforward base R solution
# Carry the row names along as an ordinary column so they take part in merge().
df1[["rn"]] <- rownames(df1)
df2[["rn"]] <- rownames(df2)
# Outer join on all shared columns, then add up the rows belonging to each rn.
res <- merge(df1, df2, all = TRUE)
rowsum(res[names(res) != "rn"], res[["rn"]], na.rm = TRUE)
# x y z w
# a 8 4 9 2
# b 2 5 0 0
# c 3 6 0 0
# d 8 0 10 3
First, I would grab the names of all the rows and columns of the new entity:
# Row and column labels of the result: everything seen in either input.
# The outer parentheses print each value as it is assigned.
(all.rows <- union(row.names(df1), row.names(df2)))
# [1] "a" "b" "c" "d"
(all.cols <- union(names(df1), names(df2)))
# [1] "x" "y" "z" "w"
Then I would construct an output matrix with those rows and column names (with matrix data initialized to all 0s), adding df1 and df2 to the relevant parts of that matrix.
# Zero matrix spanning every row/column label seen in either input.
out <- matrix(
  0,
  nrow = length(all.rows),
  ncol = length(all.cols),
  dimnames = list(all.rows, all.cols)
)
# Drop df1's values into their cells, then add df2's values on top.
out[rownames(df1), colnames(df1)] <- unlist(df1)
out[rownames(df2), colnames(df2)] <-
  out[rownames(df2), colnames(df2)] + unlist(df2)
out
# x y z w
# a 8 4 9 2
# b 2 5 0 0
# c 3 6 0 0
# d 8 0 10 3
Using xtabs on melted / stacked data frames:
# Long format: one row per (row name, column, value) for both inputs.
out <- do.call(
  rbind,
  lapply(list(df1, df2), function(d) cbind(rn = rownames(d), stack(d)))
)
# Cross-tabulate, summing `values` for every row-name / column pairing.
as.data.frame.matrix(xtabs(values ~ rn + ind, data = out))
# x y w z
#a 8 4 2 9
#b 2 5 0 0
#c 3 6 0 0
#d 8 0 3 10
I’m not convinced the accepted (or alternative merge) method is the best. It will give incorrect results if you have common rows, they’ll get joined and not summed.
This can be shown trivially by changing df2 to:
# Variant of df2 that also has a `y` column, so row "a" overlaps df1 on x and y.
df2 <- data.frame(
  x = c(1, 2),
  y = c(4, 5),
  z = c(9, 10),
  w = c(2, 3),
  row.names = c("a", "d")
)
expected results:
rn x y z w
1: a 2 8 9 2
2: b 2 5 0 0
3: c 3 6 0 0
4: d 2 5 10 3
actual results
# Same merge-then-sum as before. With the modified df2, row "a" has identical
# x and y values in both tables, so joining on all shared columns collapses
# the two rows into one before the sum runs.
merge(setDT(df1, keep.rownames = TRUE),
setDT(df2, keep.rownames = TRUE),
by = intersect(names(df1), names(df2)),
all = TRUE)[, lapply(.SD, sum, na.rm = TRUE), by = rn]
rn x y z w
1: a 1 4 9 2
2: b 2 5 0 0
3: c 3 6 0 0
4: d 2 5 10 3
You need to combine both the outer join with an inner join (or left/right joins, merge all=T/all=F). Or alternatively using plyr’s rbind.fill :
base R solution (after stacking with plyr's rbind.fill)
# Stack the two frames, filling columns missing from either with NA, then sum
# the rows that share an `rn`. rbind.fill is namespace-qualified because plyr
# is never attached with library().
# NOTE(review): assumes df1 and df2 already carry their row names in an `rn`
# column. Confirm this before running.
res <- plyr::rbind.fill(df1, df2)
rowsum(res[setdiff(names(res), "rn")], res[, "rn"], na.rm = TRUE)
data table solution
# Convert both inputs to data.table (row names kept as column `rn`), stack them
# with NA for missing columns, then sum every column within each `rn`.
# rbind.fill is namespace-qualified because plyr is never attached with
# library().
as.data.table(plyr::rbind.fill(
  setDT(df1, keep.rownames = TRUE),
  setDT(df2, keep.rownames = TRUE)
))[, lapply(.SD, sum, na.rm = TRUE), by = rn]
I prefer the rbind.fill method as you can "merge" > 2 data frames using the same syntax.

Resources