r collapsing data from multiple columns into one - r

I know there are many questions on this topic so I apologize if this is a duplicate question. I'm trying to collapse multiple columns in a data set into one column:
Assuming this is the structure of the dataset I am working with,
df <- data.frame(
cbind(
variable_1 = c('Var1', NA, NA,'Var1'),
variable_2 = c('Var2', 'No', NA, NA),
variable_3 = c(NA, NA, 'Var3', NA),
variable_4 = c(NA, 'Var4', NA, NA),
variable_5 = c(NA, 'No', 'Var5', NA),
variable_6 = c(NA, NA, 'Var6', NA)
))
variable_1 variable_2 variable_3 variable_4 variable_5 variable_6
Var1 Var2 NA NA NA NA
NA No NA Var4 No NA
NA NA Var3 NA Var5 Var6
Var1 NA NA NA NA NA
What I am expecting is a one column variable_7 like this
variable_1 variable_2 variable_3 variable_4 variable_5 variable_6 variable_7
Var1 Var2 NA NA NA NA Var1, Var2
NA No NA Var4 No NA Var4
NA NA Var3 NA Var5 Var6 Var3, Var5, Var6
Var1 NA NA NA NA NA Var1

# Build variable_7 row by row: discard every entry that is NA or the literal
# "No", then join what is left with ", ".
df$variable_7 <- apply(
  df,
  1,
  function(row_values) {
    drop_entry <- is.na(row_values) | row_values == "No"
    paste(row_values[!drop_entry], collapse = ", ")
  }
)
df
# variable_1 variable_2 variable_3 variable_4 variable_5 variable_6
#1 Var1 Var2 <NA> <NA> <NA> <NA>
#2 <NA> No <NA> Var4 No <NA>
#3 <NA> <NA> Var3 <NA> Var5 Var6
#4 Var1 <NA> <NA> <NA> <NA> <NA>
# variable_7
#1 Var1, Var2
#2 Var4
#3 Var3, Var5, Var6
#4 Var1
Explanation: Use apply and paste(..., collapse = ", ") to concatenate all row entries (except NAs and "No"s) and store in new column variable_7.
Sample data
df <- data.frame(
cbind(
variable_1 = c('Var1', NA, NA,'Var1'),
variable_2 = c('Var2', 'No', NA, NA),
variable_3 = c(NA, NA, 'Var3', NA),
variable_4 = c(NA, 'Var4', NA, NA),
variable_5 = c(NA, 'No', 'Var5', NA),
variable_6 = c(NA, NA, 'Var6', NA)
))

I gather that if there are n rows then the objective is to create an n-vector of comma-separated character strings of those values in each row that contain the characters Var. (If you intended some other criterion for separating the desired and undesired values then change the grep accordingly.)
# For each row, pick the entries containing "Var" and join them with ", ".
apply(df, 1, function(row_values) {
  hits <- grep("Var", row_values, value = TRUE)
  paste(hits, collapse = ", ")
})
## [1] "Var1, Var2" "Var4" "Var3, Var5, Var6" "Var1"

A solution using dplyr. df4 is the final output. Please see how I created the data frame df. The cbind is not required, and it would be great to add stringsAsFactors = FALSE to prevent the creation of factor columns.
library(dplyr)
library(tidyr)
# Tag every row with an ID so the collapsed strings can be joined back on.
# row_number() is safe for zero-row input, unlike 1:n().
df2 <- df %>% mutate(ID = row_number())

# Reshape to long form (dropping NAs), discard the "No" placeholders, and
# collapse the remaining values of each row into one comma-separated string.
# pivot_longer() replaces the superseded gather().
df3 <- df2 %>%
  pivot_longer(
    -ID,
    names_to = "Variable",
    values_to = "Value",
    values_drop_na = TRUE
  ) %>%
  filter(!Value %in% "No") %>%
  group_by(ID) %>%
  summarise(variable_7 = toString(Value))

# Left join so rows with nothing left to collapse are kept; they get NA in
# variable_7. The helper ID column is dropped afterwards.
df4 <- df2 %>%
  left_join(df3, by = "ID") %>%
  select(-ID)
df4
# variable_1 variable_2 variable_3 variable_4 variable_5 variable_6 variable_7
# 1 Var1 Var2 <NA> <NA> <NA> <NA> Var1, Var2
# 2 <NA> No <NA> Var4 No <NA> Var4
# 3 <NA> <NA> Var3 <NA> Var5 Var6 Var3, Var5, Var6
# 4 Var1 <NA> <NA> <NA> <NA> <NA> Var1
DATA
df <- data.frame(
variable_1 = c('Var1', NA, NA,'Var1'),
variable_2 = c('Var2', 'No', NA, NA),
variable_3 = c(NA, NA, 'Var3', NA),
variable_4 = c(NA, 'Var4', NA, NA),
variable_5 = c(NA, 'No', 'Var5', NA),
variable_6 = c(NA, NA, 'Var6', NA),
stringsAsFactors = FALSE
)

Using a data.table 'reshap'-ing approach rather than a loop/apply
library(data.table)
# Convert in place and add a row identifier to join on.
setDT(df)
df[, id := .I]

# Long form: one row per (id, variable, value).
long <- melt(df, id.vars = "id")

# Per id, keep only the values containing "Var" and join them with ",".
collapsed <- long[
  grepl("Var", value),
  .(variable_7 = paste(value, collapse = ",")),
  by = .(id)
]

# Inner join back onto the wide table and restore the original row order.
df[collapsed, on = "id", nomatch = 0][order(id)]
# variable_1 variable_2 variable_3 variable_4 variable_5 variable_6 id variable_7
# 1: Var1 Var2 NA NA NA NA 1 Var1,Var2
# 2: NA No NA Var4 No NA 2 Var4
# 3: NA NA Var3 NA Var5 Var6 3 Var3,Var5,Var6
# 4: Var1 NA NA NA NA NA 4 Var1

Related

Merge column if duplicates in rows between columns

I have a dataframe such as :
COL1 COL2 COL3 COL4 COL5 COL6 COL7
1 Sp1-2 Sp1-2 Sp3_2-54 Sp3-2 Sp3-2 Sp3-2 SP9-43
2 Sp5-1 Sp5-2 Sp2-4 Sp9-2 Sp10-3 SP9-90 NA
3 Sp_7-3 Sp_7-3 NA SP6-56 Sp2-7 SP3-3 NA
And I would simply like to merge columns when at least two elements are duplicated.
For example, Sp1-2 and Sp_7-3 appear in both COL1 and COL2, so I merge those two columns, placing a pipe "|" between the non-duplicated elements:
COL1|COL2 COL3 COL4|COL5|COL6 COL7
1 Sp1-2 Sp3_2-54 Sp3-2 SP9-43
2 Sp5-1|Sp5-2 Sp2-4 Sp9-2|Sp10-3|SP9-90 NA
3 Sp_7-3 NA SP6-56|Sp2-7|SP3-3 NA
Here is the dput format :
structure(list(COL1 = c("Sp1-2", "Sp5-1", "Sp_7-3"), COL2 = c("Sp1-2",
"Sp5-2", "Sp_7-3"), COL3 = c("Sp3_2-54", "Sp2-4", NA), COL4 = c("Sp3-2",
"Sp9-2", "SP6-56"), COL5 = c("Sp3-2", "Sp10-3", "Sp2-7"), COL6 = c("Sp3-2",
"SP9-90", "SP3-3"), COL7 = c("SP9-43", NA, NA)), class = "data.frame", row.names = c(NA,
-3L))
Another example :
G136 G348 G465
1 NA NA NA
2 NA NA NA
3 SP4-140 SP4-140 NA
4 SP2-8 NA NA
5 SP3-59 NA NA
6 SP1_contig.682-8 NA SP1_contig.682-8
expected output:
G136|G348|G465
1 NA
2 NA
3 SP4-140
4 SP2-8
5 SP3-59
6 SP1_contig.682-8
The dput format:
dat<- structure(list(G136 = c(NA, NA, "SP4-140", "SP2-8", "SP3-59", "SP1_contig.682-8", NA, NA, NA), G348 = c(NA, NA, "SP4-140", NA, NA, NA, NA, NA, NA), G465 = c(NA, NA, NA, NA, NA, "SP1_contig.682-8", NA, NA, NA)), row.names = c(NA, -9L), class = c("tbl_df", "tbl", "data.frame"))
This is probably best handled by reshaping your data first; it is then straightforward to use various groupings to achieve your desired result:
library(tidyr)
library(dplyr)
# Merge columns that share a value in some row; the merged column is named by
# joining the original column names with "|".
# NOTE(review): rowid_to_column() is presumably tibble's; only tidyr and dplyr
# are attached in the library() calls above, so confirm it is available.
dat %>%
# Add a rowid column so each original row can be identified after reshaping.
rowid_to_column() %>%
# Long form: one row per (rowid, name, value); missing values are discarded.
pivot_longer(-rowid) %>%
filter(!is.na(value)) %>%
# Within one row, columns holding the same value get their names joined by "|".
group_by(rowid, value) %>%
mutate(new_name = paste(name, collapse = "|")) %>%
# Split the joined names apart again, one row per name ...
separate_rows(new_name, sep = "\\|") %>%
# ... then, per original column, join all distinct names collected for it.
group_by(name) %>%
mutate(new_name = paste(unique(new_name), collapse = "|")) %>%
# For each value keep only the rows carrying the longest merged name.
group_by(value) %>%
filter(nchar(new_name) == max(nchar(new_name))) %>%
ungroup() %>%
select(-name) %>%
# Back to wide form, one column per merged name; distinct values landing in
# the same cell are joined by "|".
pivot_wider(names_from = new_name, values_from = value, values_fn = ~ paste(unique(.x), collapse = "|")) %>%
# Re-create every rowid from 1 up to the largest surviving rowid, so rows
# removed by the NA filter reappear as all-NA rows. Rowids beyond the largest
# surviving one are not restored.
complete(rowid = full_seq(c(1, rowid), 1))
# A tibble: 3 × 5
rowid `COL1|COL2` COL3 `COL4|COL5|COL6` COL7
<dbl> <chr> <chr> <chr> <chr>
1 1 Sp1-2 Sp3_2-54 Sp3-2 SP9-43
2 2 Sp5-1|Sp5-2 Sp2-4 Sp9-2|Sp10-3|SP9-90 NA
3 3 Sp_7-3 NA SP6-56|Sp2-7|SP3-3 NA
And using the data in your second example gives:
# A tibble: 6 × 2
rowid `G136|G348|G465`
<dbl> <chr>
1 1 NA
2 2 NA
3 3 SP4-140
4 4 SP2-8
5 5 SP3-59
6 6 SP1_contig.682-8
It's really messy...but you may try
library(igraph)
library(stringdist)
library(data.table)
# Exploratory look at the first row; not needed for the merge itself.
table(df[1,])

# Collect every pair of columns (i, j) that hold an identical string in at
# least one row. A Levenshtein distance of 0 means the two strings are equal;
# NA distances (from NA cells) are ignored.
n_col <- ncol(df)
pairs <- list()
for (i in seq_len(max(n_col - 1, 0))) {
  for (j in (i + 1):n_col) {
    # df[[i]] extracts a plain vector for both data.frames and tibbles.
    same <- stringdist(df[[i]], df[[j]], method = "lv") == 0
    if (any(same, na.rm = TRUE)) {
      pairs[[length(pairs) + 1]] <- c(i, j)
    }
  }
}
d <- do.call(rbind, pairs)

# Treat the pairs as edges of an undirected graph: each connected component
# is one set of columns to merge. Vertex names are the column indices stored
# as character strings.
dd <- data.table(d)
net <- graph_from_data_frame(d = dd, directed = FALSE)
key <- split(names(V(net)), components(net)$membership)

# Merge the values of one row: distinct non-missing entries joined by "|",
# or NA when every entry is missing (instead of pasting the text "NA").
merge_values <- function(x) {
  x <- unique(x[!is.na(x)])
  if (length(x) == 0) NA_character_ else paste0(x, collapse = "|")
}

# One merged column per component, named after its member columns.
df_dummy <- do.call(cbind, lapply(key, function(k) {
  apply(df[, as.numeric(k), drop = FALSE], 1, merge_values)
}))
colnames(df_dummy) <- vapply(
  key,
  function(k) paste0(colnames(df)[as.numeric(k)], collapse = "|"),
  character(1)
)
df_dummy

# Append the columns that were not merged; drop = FALSE keeps a single
# leftover column as a named column instead of collapsing it to a vector.
res <- cbind(df_dummy, df[, -as.numeric(unlist(key)), drop = FALSE])
res <- res[, sort(colnames(res)), drop = FALSE]
res
COL1|COL2 COL3 COL4|COL5|COL6 COL7
1 Sp1-2 Sp3_2-54 Sp3-2 SP9-43
2 Sp5-1|Sp5-2 Sp2-4 Sp9-2|Sp10-3|SP9-90 <NA>
3 Sp_7-3 <NA> SP6-56|Sp2-7|SP3-3 <NA>

Count and merge NA rows in R

I have the following dataframe:
var1 <- c("a", "b", "c", "d", "e")
var2 <- c(5, 10, NA, NA, NA)
df <- data.frame (var1, var2)
df
# A tibble: 5 × 2
var1 var2
<chr> <dbl>
1 a 5
2 b 10
3 c NA
4 d NA
5 e NA
I would like to count and merge the NA rows. Expected output:
# A tibble: 3 × 2
var1 var2
<chr> <dbl>
1 a 5
2 b 10
3 x 3
I have tried aggregate(data=df,var2~.,na.rm = FALSE, FUN = sum) but it only returns the results for a and b.
Thank you in advance
With dplyr
# Relabel rows with a missing var2 as "x", replace their var2 with the number
# of such rows, then drop the resulting duplicate rows.
df %>%
  mutate(var1 = ifelse(is.na(var2), "x", var1)) %>%
  mutate(var2 = ifelse(var1 == "x", sum(var1 == "x" & is.na(var2)), var2)) %>%
  unique()
var1 var2
1 a 1
2 b 2
3 c 3
4 x 2
Data
df <- structure(list(var1 = c("a", "b", "c", "d", "e"), var2 = c(1,
2, 3, NA, NA)), class = "data.frame", row.names = c(NA, -5L))
Try this with Base R
# Keep the fully observed rows, then append one summary row labelled "x"
# whose var2 is the number of missing var2 values.
s <- df[complete.cases(df), ]
# Use list(), not c(): c("x", n) would coerce the count to character and
# silently turn the whole var2 column into strings.
s[nrow(s) + 1, ] <- list("x", sum(is.na(df$var2)))
s
output
var1 var2
1 a 5
2 b 10
3 x 3
Using aggregate:
# Relabel rows with a missing var2 as "x".
df$var1 <- ifelse(is.na(df$var2), "x", df$var1)

# Per var1 group: count the rows when every var2 is missing, otherwise sum.
# na.action = NULL stops aggregate() from dropping the NA rows beforehand.
aggregate(
  var2 ~ .,
  data = df,
  FUN = function(x) {
    if (all(is.na(x))) length(x) else sum(x)
  },
  na.action = NULL
)
#> var1 var2
#> 1 a 5
#> 2 b 10
#> 3 x 3
Another approach with dplyr
# Rows with a missing var2 are relabelled "x" and given weight 1, so the
# weighted count for "x" is the number of missing rows; other groups keep
# their var2 total.
df %>%
  mutate(
    is_missing = is.na(var2),
    var1 = ifelse(is_missing, "x", var1),
    var2 = ifelse(is_missing, 1, var2)
  ) %>%
  count(var1, wt = var2, name = "var2")

R: Combine rows based on equal values in several columns

I have a dataframe that takes a similar form to the toy dataframe below. I would like to merge the rows if var1, var2, and var3 are all equal values, creating a combination of data in the merged rows. For rows 4 - 6, where there are different values in the rows, I was wondering if there is a way to put them in the same column with a separator in between.
df <- data.frame(var1 = c("1635", "1635", "1729", "1847", "1847", "1847"),
var2 = c("Aa", "Aa", "Bb", "Cc", "Cc", "Cc"),
var3 = c("28", "28", "85", "27", "27", "27"),
var4 = c("apple", NA, "orange", "pear", NA, NA),
var5 = c(NA, "tree", NA, NA, "ground", "desk")
)
So the output would look something like this:
in base R you would do:
# Group by var1, var2 and var3; in every other column join the distinct
# non-missing values with ", ". na.action = identity keeps rows containing NA.
aggregate(
  . ~ var1 + var2 + var3,
  data = df,
  FUN = function(values) {
    distinct_values <- unique(values[!is.na(values)])
    paste(distinct_values, collapse = ", ")
  },
  na.action = identity
)
var1 var2 var3 var4 var5
1 1847 Cc 27 pear ground, desk
2 1635 Aa 28 apple tree
3 1729 Bb 85 orange
in tidyverse:
library(tidyverse)
# Per (var1, var2, var3) group, join the distinct non-missing values of var4
# and var5 with ", "; a group with no values yields an empty string.
df %>%
  group_by(var1, var2, var3) %>%
  summarise(
    across(
      var4:var5,
      function(col) paste(unique(col[!is.na(col)]), collapse = ", ")
    ),
    .groups = "drop"
  )
# Groups: var1, var2 [3]
var1 var2 var3 var4 var5
<chr> <chr> <chr> <chr> <chr>
1 1635 Aa 28 apple "tree"
2 1729 Bb 85 orange ""
3 1847 Cc 27 pear "ground, desk"
With dplyr, you can group_by the three columns, then use summarize to concatenate the strings if they are not NA.
library(dplyr)
# Per (var1, var2, var3) group, join the non-missing values of var4 and var5
# with ","; a group whose values are all missing yields NA.
df %>%
  group_by(var1, var2, var3) %>%
  summarize(
    across(
      var4:var5,
      # Scalar condition, so use if/else rather than ifelse(). NA_character_
      # keeps the column character even when every group is all-NA.
      ~ if (all(is.na(.x))) NA_character_ else paste0(na.omit(.x), collapse = ",")
    ),
    .groups = "drop"
  )
# A tibble: 3 × 5
var1 var2 var3 var4 var5
<chr> <chr> <chr> <chr> <chr>
1 1635 Aa 28 apple tree
2 1729 Bb 85 orange NA
3 1847 Cc 27 pear ground,desk
With data.table:
# Convert in place, collapse each non-grouping column per (var1, var2, var3)
# group, then convert back to a plain data.frame.
setDT(df)
df[
  ,
  lapply(.SD, function(col) {
    present <- col[!is.na(col)]
    # All-missing groups stay NA instead of becoming an empty string.
    if (length(present) == 0) NA_character_ else paste(present, collapse = "; ")
  }),
  by = c("var1", "var2", "var3")
]
setDF(df)
# var1 var2 var3 var4 var5
# <char> <char> <char> <char> <char>
# 1: 1635 Aa 28 apple tree
# 2: 1729 Bb 85 orange <NA>
# 3: 1847 Cc 27 pear ground; desk

Join similar observations within a data.frame with R

I want to mix several observations in a data.frame using as a reference one constantly repeated variable.
Example:
id var1 var2 var3
a 1 na na
a na 2 na
a na na 3
b 1 na
b na 2 na
b na na na
c na na 3
c na 2 na
c 1 na na
Expected result:
id var1 var2 var3
a 1 2 3
b 1 2 na
c 1 2 3
A possible solution (replacing "na" by NA with na_if):
library(tidyverse)
# Turn the placeholder string "na" into a real NA, then keep the first
# (sorted) non-missing value of var1..var3 within each id.
df %>%
  # na_if() works on vectors; calling it on the whole data frame errors in
  # dplyr >= 1.1.0, so apply it to each character column instead.
  mutate(across(where(is.character), ~ na_if(.x, "na"))) %>%
  group_by(id) %>%
  # sort() drops NA, so [1] is the smallest non-missing value (NA if none).
  summarize(across(var1:var3, ~ sort(.x)[1]))
#> # A tibble: 3 × 4
#> id var1 var2 var3
#> <chr> <chr> <chr> <chr>
#> 1 a 1 2 3
#> 2 b 1 2 <NA>
#> 3 c 1 2 3
Assumptions:
"na" above is really R's native NA (not a string);
b's first row, var2 should be NA instead of an empty string ""
perhaps from the above, var1:var3 should be numbers
either you will never have a group where there is more than one non-NA in a group/column, or you don't care about anything other than the first and want the remaining discarded
library(dplyr)
# Per id, keep the first non-missing value of every other column (NA when a
# column has none in that group).
dat %>%
  group_by(id) %>%
  summarise(across(everything(), function(col) col[!is.na(col)][1]))
# # A tibble: 3 x 4
# id var1 var2 var3
# <chr> <int> <int> <int>
# 1 a 1 2 3
# 2 b 1 2 NA
# 3 c 1 2 3
Data
dat <- structure(list(id = c("a", "a", "a", "b", "b", "b", "c", "c", "c"), var1 = c(1L, NA, NA, 1L, NA, NA, NA, NA, 1L), var2 = c(NA, 2L, NA, NA, 2L, NA, NA, 2L, NA), var3 = c(NA, NA, 3L, NA, NA, NA, 3L, NA, NA)), class = "data.frame", row.names = c(NA, -9L))
Assuming that your data has NA, you can use the following base R option using the Data from @r2evans (thanks!):
# Per id, average each column ignoring NA; na.action = NULL keeps rows that
# contain NA, and an all-NA group yields NaN.
aggregate(
  . ~ id,
  data = dat,
  FUN = function(values) mean(values, na.rm = TRUE),
  na.action = NULL
)
Output:
id var1 var2 var3
1 a 1 2 3
2 b 1 2 NaN
3 c 1 2 3

Fill in missing data that is same across columns

I need to fill in some missing data from a merge that is the same in all columns. After the merge, all the values are NA, but I would like a quick way to fill them in since their values are the same.
Example:
df <- structure(list(date = structure(c(-25932, -25931, -25930, -25929,
-25928), class = "Date"), year = c(1899, 1899, 1899, 1899, 1899
), month = c(1, 1, 1, 1, 1), day = c(1, 2, 3, 4, 5), test1 = c(NA,
NA, "VAR1", NA, NA), test2 = c(NA, NA, "VAR2", NA, NA), test3 = c(NA,
NA, "VAR3", NA, NA)), .Names = c("date", "year", "month", "day",
"test1", "test2", "test3"), row.names = c(NA, 5L), class = "data.frame")
# Tedious way, but works
df$test1 <- "VAR1"
# Desired output
date year month day test1 test2 test3
1 1899-01-01 1899 1 1 VAR1 VAR2 VAR3
2 1899-01-02 1899 1 2 VAR1 VAR2 VAR3
3 1899-01-03 1899 1 3 VAR1 VAR2 VAR3
4 1899-01-04 1899 1 4 VAR1 VAR2 VAR3
5 1899-01-05 1899 1 5 VAR1 VAR2 VAR3
You can try something like the following:
df
# date year month day test1 test2 test3
# 1 1899-01-01 1899 1 1 <NA> <NA> <NA>
# 2 1899-01-02 1899 1 2 <NA> <NA> <NA>
# 3 1899-01-03 1899 1 3 VAR1 VAR2 VAR3
# 4 1899-01-04 1899 1 4 <NA> <NA> <NA>
# 5 1899-01-05 1899 1 5 <NA> <NA> <NA>
# Overwrite each "test" column with its first non-missing value; the single
# value is recycled down the whole column.
test_cols <- grep("test", names(df))
df[test_cols] <- lapply(df[test_cols], function(column) {
  first_present <- which(!is.na(column))[1]
  column[first_present]
})
df
# date year month day test1 test2 test3
# 1 1899-01-01 1899 1 1 VAR1 VAR2 VAR3
# 2 1899-01-02 1899 1 2 VAR1 VAR2 VAR3
# 3 1899-01-03 1899 1 3 VAR1 VAR2 VAR3
# 4 1899-01-04 1899 1 4 VAR1 VAR2 VAR3
# 5 1899-01-05 1899 1 5 VAR1 VAR2 VAR3
We can also use data.table. Convert the 'data.frame' to 'data.table' (setDT(df)). Based on the index of 'test' columns ('nm1'), we loop with for and set the NA elements by the non-NA elements in each column.
library(data.table)
# Positions of the columns whose names start with "test".
nm1 <- grep('^test', names(df))
setDT(df)
for (j in nm1) {
  col <- df[[j]]
  # Fill the NA cells by reference with the first non-NA value. Passing a
  # single value avoids the length-mismatch error set() raises when a column
  # holds more than one non-NA entry, and an all-NA column is left unchanged.
  set(df, i = which(is.na(col)), j = j, value = col[!is.na(col)][1L])
}
df
# date year month day test1 test2 test3
#1: 1899-01-01 1899 1 1 VAR1 VAR2 VAR3
#2: 1899-01-02 1899 1 2 VAR1 VAR2 VAR3
#3: 1899-01-03 1899 1 3 VAR1 VAR2 VAR3
#4: 1899-01-04 1899 1 4 VAR1 VAR2 VAR3
#5: 1899-01-05 1899 1 5 VAR1 VAR2 VAR3

Resources