Merge column if duplicates in rows between columns - r

I have a dataframe such as :
COL1 COL2 COL3 COL4 COL5 COL6 COL7
1 Sp1-2 Sp1-2 Sp3_2-54 Sp3-2 Sp3-2 Sp3-2 SP9-43
2 Sp5-1 Sp5-2 Sp2-4 Sp9-2 Sp10-3 SP9-90 NA
3 Sp_7-3 Sp_7-3 NA SP6-56 Sp2-7 SP3-3 NA
And I would simply like to merge columns when at least two elements are duplicated.
for example, in COL1 and COL2, Sp1-2 & Sp_7-3 are duplicated in both columns, then I merge it that way by adding a pipe "|" between non-duplicated elements:
COL1|COL2 COL3 COL4|COL5|COL6 COL7
1 Sp1-2 Sp3_2-54 Sp3-2 SP9-43
2 Sp5-1|Sp5-2 Sp2-4 Sp9-2|Sp10-3|SP9-90 NA
3 Sp_7-3 NA SP6-56|Sp2-7|SP3-3 NA
Here is the dput format :
structure(list(COL1 = c("Sp1-2", "Sp5-1", "Sp_7-3"), COL2 = c("Sp1-2",
"Sp5-2", "Sp_7-3"), COL3 = c("Sp3_2-54", "Sp2-4", NA), COL4 = c("Sp3-2",
"Sp9-2", "SP6-56"), COL5 = c("Sp3-2", "Sp10-3", "Sp2-7"), COL6 = c("Sp3-2",
"SP9-90", "SP3-3"), COL7 = c("SP9-43", NA, NA)), class = "data.frame", row.names = c(NA,
-3L))
Another example :
G136 G348 G465
1 NA NA NA
2 NA NA NA
3 SP4-140 SP4-140 NA
4 SP2-8 NA NA
5 SP3-59 NA NA
6 SP1_contig.682-8 NA SP1_contig.682-8
expected output:
G136|G348|G465
1 NA
2 NA
3 SP4-140
4 SP2-8
5 SP3-59
6 SP1_contig.682-8
The dput format:
dat<- structure(list(G136 = c(NA, NA, "SP4-140", "SP2-8", "SP3-59", "SP1_contig.682-8", NA, NA, NA), G348 = c(NA, NA, "SP4-140", NA, NA, NA, NA, NA, NA), G465 = c(NA, NA, NA, NA, NA, "SP1_contig.682-8", NA, NA, NA)), row.names = c(NA, -9L), class = c("tbl_df", "tbl", "data.frame"))

This is probably best handled by reshaping your data first; then it's straightforward to use various groupings to achieve your desired result:
library(tidyr)
library(dplyr)
library(tibble) # rowid_to_column() is a tibble function; attach it explicitly

# Merge columns that hold the same value in the same row, joining the
# differing values of the merged columns with "|".
dat %>%
  # Remember each row's position so the long data can be widened again.
  rowid_to_column() %>%
  pivot_longer(-rowid) %>%
  filter(!is.na(value)) %>%
  # Columns carrying the same value within one row belong together.
  group_by(rowid, value) %>%
  mutate(new_name = paste(name, collapse = "|")) %>%
  separate_rows(new_name, sep = "\\|") %>%
  # For every original column, gather all columns it was ever paired with.
  group_by(name) %>%
  mutate(new_name = paste(unique(new_name), collapse = "|")) %>%
  # For every value, keep only the rows with the longest merged name.
  group_by(value) %>%
  filter(nchar(new_name) == max(nchar(new_name))) %>%
  ungroup() %>%
  select(-name) %>%
  pivot_wider(
    names_from = new_name,
    values_from = value,
    values_fn = ~ paste(unique(.x), collapse = "|")
  ) %>%
  # Re-insert rows that were all NA, from row 1 up to the last row with data.
  complete(rowid = full_seq(c(1, rowid), 1))
# A tibble: 3 × 5
rowid `COL1|COL2` COL3 `COL4|COL5|COL6` COL7
<dbl> <chr> <chr> <chr> <chr>
1 1 Sp1-2 Sp3_2-54 Sp3-2 SP9-43
2 2 Sp5-1|Sp5-2 Sp2-4 Sp9-2|Sp10-3|SP9-90 NA
3 3 Sp_7-3 NA SP6-56|Sp2-7|SP3-3 NA
And using the data in your second example gives:
# A tibble: 6 × 2
rowid `G136|G348|G465`
<dbl> <chr>
1 1 NA
2 2 NA
3 3 SP4-140
4 4 SP2-8
5 5 SP3-59
6 6 SP1_contig.682-8

It's really messy...but you may try
library(igraph)
library(stringdist)
library(data.table)

# Merge the columns of `df` that hold an identical value in the same row.

# Step 1: collect every pair of columns (i, j) with at least one row-wise
# match (Levenshtein distance 0). NA results are discarded by na.omit().
n_col <- ncol(df)
pairs <- list()
for (i in seq_len(max(n_col - 1L, 0L))) {
  for (j in seq(i + 1L, n_col)) {
    # `[[` returns a plain vector for both data frames and tibbles.
    if (any(na.omit(stringdist(df[[i]], df[[j]], method = "lv") == 0))) {
      pairs[[length(pairs) + 1L]] <- c(i, j)
    }
  }
}
d <- do.call(rbind, pairs)

if (is.null(d)) {
  # No two columns share a value, so there is nothing to merge.
  res <- df
} else {
  # Step 2: treat the matching pairs as graph edges; each connected component
  # is one group of columns to merge. Vertex names are the column indices.
  dd <- data.table(d)
  net <- graph_from_data_frame(d = dd, directed = FALSE)
  key <- split(names(V(net)), components(net)$membership)

  # Step 3: per group, paste the distinct non-NA values of each row together.
  # A row that is NA in every merged column stays NA instead of becoming "NA".
  df_dummy <- do.call(cbind, lapply(key, function(idx) {
    idx <- as.numeric(idx)
    apply(df[, idx, drop = FALSE], 1, function(x) {
      vals <- unique(x[!is.na(x)])
      if (length(vals) == 0L) NA_character_ else paste0(vals, collapse = "|")
    })
  }))
  colnames(df_dummy) <- vapply(
    key,
    function(idx) paste0(colnames(df)[as.numeric(idx)], collapse = "|"),
    character(1),
    USE.NAMES = FALSE
  )

  # Step 4: append the columns that were not merged. drop = FALSE keeps the
  # column name even when only one such column is left.
  merged_idx <- as.numeric(unlist(key))
  res <- cbind(df_dummy, df[, -merged_idx, drop = FALSE])
}

# Order the columns alphabetically by name.
res <- res[, sort(colnames(res)), drop = FALSE]
res
COL1|COL2 COL3 COL4|COL5|COL6 COL7
1 Sp1-2 Sp3_2-54 Sp3-2 SP9-43
2 Sp5-1|Sp5-2 Sp2-4 Sp9-2|Sp10-3|SP9-90 <NA>
3 Sp_7-3 <NA> SP6-56|Sp2-7|SP3-3 <NA>

Related

Filter by group and conditions

I have this type of data, where Sequ is a grouping variable:
df <- data.frame(
Sequ = c(1,1,1,
2,2,2,
3,3,
4,4),
Answerer = c("A", NA, NA, "A", NA, NA, "B", NA, "C", NA),
PP_by = c(rep("A",5), rep("B",5)),
pp = c(0.1,0.2,0.3, 1, NA, NA, NA, NA, NA, NA)
)
I need to remove any Sequ where
(i) Answerer == PP_by AND
(ii) there is any NA in pp
I've tried this, but it obviously implements just the second condition (ii):
library(dplyr)
df %>%
group_by(Sequ) %>%
filter(
all(!is.na(pp))
)
The expected result is:
Sequ Answerer PP_by pp
1 1 A A 0.1
2 1 <NA> A 0.2
3 1 <NA> A 0.3
9 4 C B NA
10 4 <NA> B NA
EDIT:
I've come up with this solution:
df %>%
group_by(Sequ) %>%
filter(
first(Answerer) != first(PP_by)
|
all(!is.na(pp))
)
Here's another way:
# Keep a Sequ group unless it has BOTH a row where Answerer equals PP_by
# and at least one missing pp.
df %>%
  group_by(Sequ) %>%
  filter(
    !any(Answerer == PP_by, na.rm = TRUE) |
      !anyNA(pp)
  )
# # A tibble: 5 × 4
# # Groups: Sequ [2]
# Sequ Answerer PP_by pp
# <dbl> <chr> <chr> <dbl>
# 1 1 A A 0.1
# 2 1 NA A 0.2
# 3 1 NA A 0.3
# 4 4 C B NA
# 5 4 NA B NA

Conditionally Create New Column Based on Row Values

thanks in advance for any assistance.
I have a dataframe:
df <- structure(list(ID = c("0001", "0002", "0003", "0004"), May_1 = c(1,
2, 1, 3), May_5 = c(NA, 1, 2, 1), May_10 = c(NA, 3, 3, NA), May_16 = c(2,
NA, NA, NA), May_20 = c(3, NA, NA, 2)), row.names = c(NA, -4L
), class = c("tbl_df", "tbl", "data.frame"))
I would like to create new columns named "First Preference", "Second Preference" and "Third Preference" based on the row values for each response.
If a row value == 1, I would like to append a column called "First Preference" that contains the column name where the row value == 1.
My actual data contains about 40 dates that will be changing week over week, so a generalizable solution is most appreciated.
Here's the ideal df:
df_ideal <- structure(list(ID = c("0001", "0002", "0003", "0004"), May_1 = c(1,
2, 1, 3), May_5 = c(NA, 1, 2, 1), May_10 = c(NA, 3, 3, NA), May_16 = c(2,
NA, NA, NA), May_20 = c(3, NA, NA, 2), First_Preference = c("May_1",
"May_5", "May_1", "May_5"), Second_Preference = c("May_16", "May_1",
"May_5", "May_20"), Third_Preference = c("May_20", "May_10",
"May_10", "May_1")), row.names = c(NA, -4L), class = c("tbl_df",
"tbl", "data.frame"))
A tidyverse solution would be preferred, but I'm certainly open to anything.
Thanks!
In base R, we can use apply row-wise to order the values in each row (dropping NA values) and get the corresponding column names.
# Names of the preference columns to create.
cols <- paste(c("First", "Second", "Third"), "Preference", sep = "_")

# For every row, list the names of the non-ID columns in increasing order of
# their value; order(..., na.last = NA) drops the NA entries.
choice_names <- names(df)[-1]
ranked <- apply(df[-1], 1, function(x) {
  # Indexing by seq_along(cols) pads short rows with NA and truncates long
  # ones, so every row yields exactly length(cols) names.
  choice_names[order(x, na.last = NA)][seq_along(cols)]
})
df[cols] <- t(ranked)
df
# A tibble: 4 x 9
# ID May_1 May_5 May_10 May_16 May_20 First_Preference Second_Preference Third_Preference
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr>
#1 0001 1 NA NA 2 3 May_1 May_16 May_20
#2 0002 2 1 3 NA NA May_5 May_1 May_10
#3 0003 1 2 3 NA NA May_1 May_5 May_10
#4 0004 3 1 NA NA 2 May_5 May_20 May_1
We can reshape it to 'long' format, while dropping the NA elements with values_drop_na, then use the 'value' column as index to change the labels and then convert back to 'wide' format with pivot_wider
library(dplyr)
library(tidyr)

# Reshape to long form (dropping NA cells), turn each rank into a preference
# label, spread the labels back out as columns and attach them to `df`.
df %>%
  pivot_longer(cols = -ID, values_drop_na = TRUE) %>%
  group_by(ID) %>%
  # The rank (1, 2, 3) is used as an index into the label vector.
  mutate(value = c("First_Preference", "Second_Preference",
                   "Third_Preference")[value]) %>%
  ungroup() %>%
  pivot_wider(names_from = value, values_from = name) %>%
  # Join on ID only; a natural join would also match on any preference
  # columns that `df` already contains.
  left_join(df, ., by = "ID")
# A tibble: 4 x 9
# ID May_1 May_5 May_10 May_16 May_20 First_Preference Second_Preference Third_Preference
#* <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr>
#1 0001 1 NA NA 2 3 May_1 May_16 May_20
#2 0002 2 1 3 NA NA May_5 May_1 May_10
#3 0003 1 2 3 NA NA May_1 May_5 May_10
#4 0004 3 1 NA NA 2 May_5 May_20 May_1
To get the column names automatically, we can use ordinal from english
library(english)
library(stringr)

# Same reshaping as before, but the column labels are generated from the rank
# with ordinal() instead of being hard-coded.
df %>%
  pivot_longer(cols = -ID, values_drop_na = TRUE) %>%
  group_by(ID) %>%
  mutate(value = str_c(ordinal(value), "_preference")) %>%
  ungroup() %>%
  pivot_wider(names_from = value, values_from = name) %>%
  # Join on ID only; a natural join would also match on any preference
  # columns that `df` already contains.
  left_join(df, ., by = "ID")
Or using data.table
library(data.table)
# melt() reshapes df to long form and drops NA cells (na.rm = TRUE); dcast()
# spreads the former column names ('variable') back out under
# "<ordinal>_preference" headers, one row per ID; the result is then joined
# onto df by ID.
# NOTE(review): ordinal() is not loaded in this snippet - presumably
# english::ordinal from the previous example; confirm library(english) is
# attached.
# NOTE(review): setDT() is applied to df itself; data.table documents it as a
# by-reference conversion, so df is presumably a data.table afterwards.
setDT(df)[dcast(melt(df, id.var = 'ID', na.rm = TRUE),
ID ~ paste0(ordinal(value), "_preference"), value.var = 'variable'), on = .(ID)]
# ID May_1 May_5 May_10 May_16 May_20 first_preference second_preference third_preference
#1: 0001 1 NA NA 2 3 May_1 May_16 May_20
#2: 0002 2 1 3 NA NA May_5 May_1 May_10
#3: 0003 1 2 3 NA NA May_1 May_5 May_10
#4: 0004 3 1 NA NA 2 May_5 May_20 May_1

R: Return rows with only 1 non-NA value for a set of columns

Suppose I have a data.table with the following data:
colA colB colC result
1 2 3 231
1 NA 2 123
NA 3 NA 345
11 NA NA 754
How would I use dplyr and magrittr to only select the following rows:
colA colB colC result
NA 3 NA 345
11 NA NA 754
The selection criteria is: only 1 non-NA value for columns A-C (i.e. colA, colB, ColC)
I have been unable to find a similar question; guessing this is an odd situation.
A base R option would be
# Keep the rows that contain exactly one non-NA value.
df[rowSums(!is.na(df)) == 1, ]
# colA colB colC
#3 NA 3 NA
#4 11 NA NA
A dplyr option is
df %>% filter(rowSums(!is.na(.)) == 1)
Update
In response to your comment, you can do
# Keep the rows with exactly one non-NA value, ignoring the last column.
# drop = FALSE keeps a data frame even when only one column remains.
df[rowSums(!is.na(df[, -ncol(df), drop = FALSE])) == 1, ]
# colA colB colC result
#3 NA 3 NA 345
#4 11 NA NA 754
Or the same in dplyr
df %>% filter(rowSums(!is.na(.[-length(.)])) == 1)
This assumes that the last column is the one you'd like to ignore.
Sample data
# Sample data: three integer columns with scattered NA values.
df <- read.table(text = "colA colB colC
1 2 3
1 NA 2
NA 3 NA
11 NA NA", header = TRUE)
Sample data for update
# Sample data for the update: same columns plus a `result` column that the
# row filter should ignore.
df <- read.table(text =
"colA colB colC result
1 2 3 231
1 NA 2 123
NA 3 NA 345
11 NA NA 754
", header = TRUE)
Another option is filter with map
library(dplyr)
library(purrr)
df %>%
filter(map(select(., starts_with('col')), ~ !is.na(.)) %>%
reduce(`+`) == 1)
# colA colB colC result
#1 NA 3 NA 345
#2 11 NA NA 754
Or another option is to use transmute_at
# Turn every `col*` column into a "is not NA" flag, add the flags up per row,
# and keep the rows of `df` where exactly one flag is set.
df %>%
  transmute(across(starts_with("col"), ~ !is.na(.x))) %>%
  reduce(`+`) %>%
  magrittr::equals(1) %>%
  filter(df, .)
# colA colB colC result
#1 NA 3 NA 345
#2 11 NA NA 754
data
df <- structure(list(colA = c(1L, 1L, NA, 11L), colB = c(2L, NA, 3L,
NA), colC = c(3L, 2L, NA, NA), result = c(231L, 123L, 345L, 754L
)), class = "data.frame", row.names = c(NA, -4L))
I think this would be possible with filter_at but I was not able to make it work. Here is one attempt with filter and pmap_lgl where you can specify the range of columns in select or specify by their positions or use other tidyselect helper variables.
library(dplyr)
library(purrr)
# Walk over colA:colC row by row and keep the rows with exactly one non-NA.
df %>%
  filter(
    pmap_lgl(
      select(., colA:colC),
      function(...) sum(!is.na(c(...))) == 1
    )
  )
# colA colB colC result
#1 NA 3 NA 345
#2 11 NA NA 754
data
df <- structure(list(colA = c(1L, 1L, NA, 11L), colB = c(2L, NA, 3L,
NA), colC = c(3L, 2L, NA, NA), result = c(231L, 123L, 345L, 754L
)), class = "data.frame", row.names = c(NA, -4L))

remove NA values and combine non NA values into a single column

I have a data set which has numeric and NA values in all columns. I would like to create a new column with all non NA values and preserve the row names
v1 v2 v3 v4 v5
a 1 NA NA NA NA
b NA 2 NA NA NA
c NA NA 3 NA NA
d NA NA NA 4 NA
e NA NA NA NA 5
I have tried using the coalesce function from dplyr
digital_metrics_FB <- fb_all_data %>%
mutate(fb_metrics = coalesce("v1",
"v2",
"v3",
"v4",
"v5"))
and also tried an apply function
df2 <- sapply(fb_all_data,function(x) x[!is.na(x)])
still cannot get it to work.
I am looking for the final result to be where all non NA values come together in the final column and the row names are preserved
final
a 1
b 2
c 3
d 4
e 5
any help would be much appreciated
We can use pmax
do.call(pmax, c(fb_all_data , na.rm = TRUE))
If a row can contain more than one non-NA element and you want to combine them into a single string, a simple base R option would be
data.frame(final = apply(fb_all_data, 1, function(x) toString(x[!is.na(x)])))
Or using coalesce
library(dplyr)
library(tibble)
fb_all_data %>%
rownames_to_column('rn') %>%
transmute(rn, final = coalesce(v1, v2, v3, v4, v5)) %>%
column_to_rownames('rn')
# final
#a 1
#b 2
#c 3
#d 4
#e 5
Or using tidyverse, for multiple non-NA elements
# Combine all non-NA values of each row into one comma-separated string while
# keeping the row names. pmap_chr() is namespaced because purrr is not
# attached by the library() calls of this answer.
fb_all_data %>%
  rownames_to_column("rn") %>%
  transmute(
    rn,
    # .[-1] skips the `rn` column so only the value columns are combined.
    final = purrr::pmap_chr(.[-1], function(...) toString(na.omit(c(...))))
  ) %>%
  column_to_rownames("rn")
NOTE: Here we are showing data that the OP showed as example and not some other dataset
data
fb_all_data <- structure(list(v1 = c(1L, NA, NA, NA, NA), v2 = c(NA, 2L, NA,
NA, NA), v3 = c(NA, NA, 3L, NA, NA), v4 = c(NA, NA, NA, 4L, NA
), v5 = c(NA, NA, NA, NA, 5L)), class = "data.frame",
row.names = c("a",
"b", "c", "d", "e"))
With tidyverse, you can do:
# Reshape to long form without the NA cells, then collapse the remaining
# values of every original row into one comma-separated string.
df %>%
  rownames_to_column() %>%
  pivot_longer(
    -rowname,
    names_to = "var",
    values_to = "val",
    values_drop_na = TRUE
  ) %>%
  group_by(rowname) %>%
  summarise(val = paste(val, collapse = ", "))
rowname val
<chr> <chr>
1 a 1
2 b 2, 3
3 c 3
4 d 4
5 e 5
Sample data to have a row with more than one non-NA value:
df <- read.table(text = " v1 v2 v3 v4 v5
a 1 NA NA NA NA
b NA 2 3 NA NA
c NA NA 3 NA NA
d NA NA NA 4 NA
e NA NA NA NA 5", header = TRUE)

How do I remove na in R and make below value to go up

I have a data frame like below:
How do I remove the NA values and shift the values below them up?
Thanks
id name.america name.europe name.asia
1 a <NA> <NA>
2 <NA> b <NA>
3 <NA> <NA> c
4 d <NA> <NA>
Change to:
id name.america name.europe name.asia
1 a b c
2 d
We can loop through the columns and remove the NA, then make the lengths of the list elements same by appending NA at the end after getting the max length of the list element. Based on that, subset the 'id' column of the dataset and append with the output
lst <- lapply(df1[-1], na.omit)
lst1 <- lapply(lst, `length<-`, max(lengths(lst)))
out <- data.frame(lst1)
out1 <- cbind(id = df1$id[seq_len(nrow(out))], out)
out1
# id name.america name.europe name.asia
#1 1 a b c
#2 2 d <NA> <NA>
If we need NA to be changed to blanks ("") - not recommended
out1[is.na(out1)] <- ""
data
df1 <- structure(list(id = 1:4, name.america = c("a", NA, NA, "d"),
name.europe = c(NA, "b", NA, NA), name.asia = c(NA, NA, "c",
NA)), class = "data.frame", row.names = c(NA, -4L))
tidyverse-based solution
library(tidyverse)

# Drop the NA cells, renumber the remaining values within each column, and
# spread them back out so every column is packed towards the top.
df1 %>%
  pivot_longer(
    -id,
    names_to = "name",
    values_to = "val",
    values_drop_na = TRUE
  ) %>%
  # The original id no longer identifies a row once values move up.
  select(-id) %>%
  group_by(name) %>%
  mutate(id = row_number()) %>%
  ungroup() %>%
  # names_sort = TRUE puts the new columns in alphabetical order.
  pivot_wider(names_from = name, values_from = val, names_sort = TRUE)
Results
# A tibble: 2 x 4
id name.america name.asia name.europe
<int> <chr> <chr> <chr>
1 1 a c b
2 2 d NA NA
Notes
If desired you can re-order columns with select or that variable prior to transformation.
NAs are left as such. If desired, you can use tidyr::replace_na to insert some string or space. I would discourage you from doing that.
Data
Taken from @akrun's answer above.
df1 <- structure(
list(
id = 1:4,
name.america = c("a", NA, NA, "d"),
name.europe = c(NA, "b", NA, NA),
name.asia = c(NA, NA, "c",
NA)
),
class = "data.frame",
row.names = c(NA, -4L)
)
# In every column except the first, move the non-NA values to the top and
# fill the remainder with "".
df1[-1] <- lapply(df1[-1], function(x) {
  kept <- x[!is.na(x)]
  c(kept, rep("", length(x) - length(kept)))
})

# Keep only as many rows as the fullest column needs. seq_len() yields zero
# rows when every column is empty, where 1:0 would wrongly return row 1.
n_filled <- max(colSums(df1[-1] != ""))
df1[seq_len(n_filled), ]
# id name.america name.europe name.asia
#1 1 a b c
#2 2 d

Resources