paste column elements with condition in r - r

I have a data frame and I want to paste elements in name1, name2 and name3 which do not contain NA.
c <- data.frame(name1 = letters[1:3],
name2 = c('A', NA, 'C'),
name3 = c('pig', 'cow', NA)
)
The result should look like this:
c %>% mutate(new_name = c('a&A&pig', 'b&cow', 'c&C'))
When I use paste0() it binds all the elements including NA. I do not want this.
c %>% mutate(new_name = paste0(name1,'&', name2, '&', name3))
Then I tried two other methods: one splits the data frame into a list with group_split(), the other nests the data frame by index. In both cases I then used map() and select() to keep only the columns that do not contain NA, but both attempts failed.
c %>%
mutate(index = row_number()) %>%
group_split(index) %>%
map(select(~where(~!any(is.na(.)))))
c %>%
mutate(index = row_number()) %>%
nest(data = name1:name3) %>%
mutate(without_NA_data = map(data, select(~where(~!any(is.na(.))))))
Is there any way I can get what I want?
Any help will be highly appreciated!

We can use rowwise with c_across by loading only dplyr package
library(dplyr)
c %>%
rowwise %>%
mutate(new_name = paste(na.omit(c_across(everything())), collapse="&")) %>%
ungroup
# A tibble: 3 x 4
# name1 name2 name3 new_name
# <chr> <chr> <chr> <chr>
#1 a A pig a&A&pig
#2 b <NA> cow b&cow
#3 c C <NA> c&C
Or with pmap
library(purrr)
c %>%
mutate(new_name = pmap_chr(., ~ paste(na.omit(c(...)), collapse="&")))
# name1 name2 name3 new_name
#1 a A pig a&A&pig
#2 b <NA> cow b&cow
#3 c C <NA> c&C
Or using base R with paste and replace
# Blank out the NAs, paste the columns with "&", trim separators left at the
# ends, and collapse the doubled "&" that a blanked middle column leaves behind.
gsub("&+", "&", trimws(do.call(paste, c(replace(c, is.na(c), ''), sep="&")), whitespace = "&"))
#[1] "a&A&pig" "b&cow" "c&C"
Or using apply
apply(c, 1, function(x) paste(na.omit(x), collapse="&"))
#[1] "a&A&pig" "b&cow" "c&C"
Or paste first and remove the NA substring
# Remove "NA" only when it is a whole "&"-delimited field, so values that merely
# contain "NA" (e.g. "DNA") are left intact; then drop a leading "&" that is
# left over when the first field was NA.
sub("^&", "", gsub("(^|&)NA(?=&|$)", "", do.call(paste, c(c, sep="&")), perl = TRUE))
#[1] "a&A&pig" "b&cow" "c&C"

We can use unite from tidyr by using na.rm = TRUE to remove NA values
tidyr::unite(c, new_name, starts_with('name'),
sep = '&', na.rm = TRUE, remove = FALSE)
# new_name name1 name2 name3
#1 a&A&pig a A pig
#2 b&cow b <NA> cow
#3 c&C c C <NA>

Related

Collapsing Columns in R using tidyverse with mutate, replace, and unite. Writing a function to reuse?

Data:
ID
B
C
1
NA
x
2
x
NA
3
x
x
Results:
ID
Unified
1
C
2
B
3
B_C
I'm trying to combine columns B and C using mutate and unite, but how would I scale this up so that I can reuse it for many columns (think 100+) instead of having to write out the variables each time? Or is there a built-in function that already does this?
My current solution is this:
library(tidyverse)
Data %>%
mutate(B = replace(B, B == 'x', 'B'), C = replace(C, C == 'x', 'C')) %>%
unite("Unified", B:C, na.rm = TRUE, remove= TRUE)
We may use across to loop over the column, replace the value that corresponds to 'x' with column name (cur_column())
library(dplyr)
library(tidyr)
Data %>%
mutate(across(B:C, ~ replace(., .== 'x', cur_column()))) %>%
unite(Unified, B:C, na.rm = TRUE, remove = TRUE)
-output
ID Unified
1 1 C
2 2 B
3 3 B_C
data
Data <- structure(list(ID = 1:3, B = c(NA, "x", "x"), C = c("x", NA,
"x")), class = "data.frame", row.names = c(NA, -3L))
Here are a couple of options.
Using dplyr -
library(dplyr)
cols <- names(Data)[-1]
Data %>%
rowwise() %>%
mutate(Unified = paste0(cols[!is.na(c_across(B:C))], collapse = '_')) %>%
ungroup -> Data
Data
# ID B C Unified
# <int> <chr> <chr> <chr>
#1 1 NA x C
#2 2 x NA B
#3 3 x x B_C
Base R
Data$Unified <- apply(Data[cols], 1, function(x)
paste0(cols[!is.na(x)], collapse = '_'))

Sub setting a column into multiple values in r

I have the following data,
col <- c('Data1,Data2','a,b,c','d')
df <- data.frame(col)
I want to split the data where the elements are more than 2 in a cell. So "a,b,c" should be split into "a,b" , "b,c" and "c,a". See attached for reference.
We create a row identifier (row_number()), split the 'col' by the delimiter (separate_rows), grouped by 'rn', summarise on those groups where the number of rows is greater than 1 to get the combn of 'col' and paste them together
library(stringr)
library(dplyr)
library(tidyr)
df %>%
mutate(rn = row_number()) %>%
separate_rows(col) %>%
group_by(rn) %>%
summarise(col = if(n() > 1) combn(col, 2, FUN = str_c, collapse=",") else col,
.groups = 'drop') %>%
select(-rn)
-output
# A tibble: 5 x 1
# col
# <chr>
#1 Data1,Data2
#2 a,b
#3 a,c
#4 b,c
#5 d
Here is a base R option using combn
# Split each cell on commas. Cells with fewer than two parts are kept as they
# are; the others are expanded into every unordered pair of their parts.
# lapply() always returns a list, so unlist() yields a plain character vector
# even when every cell happens to produce the same number of pairs (sapply()
# would simplify that case to a matrix and data.frame() would split it into
# several columns).
data.frame(col = unlist(lapply(
  strsplit(df$col, ","),
  function(x) {
    if (length(x) < 2) {
      x
    } else {
      combn(x, 2, paste0, collapse = ",")
    }
  }
)))
which gives
col
1 Data1,Data2
2 a,b
3 a,c
4 b,c
5 d
library(tidyverse)
df %>%
rowwise()%>%
mutate(col = list(if(str_count(col, ",")>1) combn(strsplit(col, ",")[[1]], 2, toString) else col))%>%
unnest(col)
# A tibble: 5 x 1
col
<chr>
1 Data1,Data2
2 a, b
3 a, c
4 b, c
5 d

Turning a text column into a vector in r

I want to see whether the text column has elements outside the specified values of "a" and "b"
# Values that are allowed to appear in the text column
specified_value <- c("a", "b")
# Input: one comma-separated string per key
df <- data.frame(key = c(1, 2, 3, 4), text = c("a,b,c", "a,d", "1,2", "a,b"))
# Desired output: the elements of each text that are not in specified_value
# (NA when there are none); key and text must have the same length
df_out <- data.frame(key = c(1, 2, 3, 4), text = c("c", "d", "1,2", NA))
This is what I have tried:
df=df%>%mutate(text_vector=strsplit(text, split=","),
extra=text_vector[which(!text_vector %in% specified_value)])
But this doesn't work, any suggestions?
We can split the 'text' by the delimiter , with separate_rows, grouped by 'key', get the elements that are not in 'specified_value' with setdiff and paste them together (toString), then do a join to get the other columns in the original dataset
library(dplyr) # >= 1.0.0
library(tidyr)
df %>%
separate_rows(text) %>%
group_by(key) %>%
summarise(extra = toString(setdiff(text, specified_value))) %>%
left_join(df) %>%
mutate(extra = na_if(extra, ""))
# A tibble: 4 x 3
# key extra text
# <dbl> <chr> <chr>
#1 1 c a,b,c
#2 2 d a,d
#3 3 1, 2 1,2
#4 4 <NA> a,b
Using setdiff.
df$outside <- sapply({
x <- lapply(strsplit(df$text, ","), setdiff, specified_value)
replace(x, lengths(x) == 0, NA)},
paste, collapse=",")
df
# key text outside
# 1 1 a,b,c c
# 2 2 a,d d
# 3 3 1,2 1,2
# 4 4 a,b NA
Data:
df <- structure(list(key = c(1, 2, 3, 4), text = c("a,b,c", "a,d",
"1,2", "a,b")), class = "data.frame", row.names = c(NA, -4L))
specified_value <- c("a", "b")
use stringi::stri_split_fixed
library(stringi)
# TRUE is spelled out because the shorthand T is an ordinary variable that can
# be reassigned. Each expression is TRUE when the text has an element outside
# specified_value.
!all(stri_split_fixed("a,b", ",", simplify = TRUE) %in% specified_value) #FALSE
!all(stri_split_fixed("a,b,c", ",", simplify = TRUE) %in% specified_value) #TRUE
An option using regex without splitting the data on comma :
#Collapse the specified_value in one string and remove from text
df$text1 <- gsub(paste0(specified_value, collapse = "|"), '', df$text)
#Remove extra commas
df$text1 <- gsub('(?<![a-z0-9]),', '', df$text1, perl = TRUE)
df
# key text text1
#1 1 a,b,c c
#2 2 a,d d
#3 3 1,2 1,2
#4 4 a,b

Elegant solution for casting (spreading) multiple columns of character vectors

I want to transform a data frame with contact information for a list of municipalities, in which similar information (e.g. phone numbers) appears in multiple columns.
I have tried using both reshape2::dcast() and tidyr::spread(), neither of which solves my problem. I have also checked other posts on Stack Overflow, e.g.
Multiple column spread
I have yet to find a solution that works. It seems to me that the problem should be fairly straightforward (and solvable with spread or dcast).
tmp <- tibble(municipality = c("M1", "M2"),
name1 = c("n1", "n2"), name2 = c("n3", "n4"), name3 = c(NA, "n5"), # placeholder names
phone1 = c("p1", "p2"), phone2 = c("p3", "p4"), phone3 = c(NA, "p5")) # placeholder phone numbers
#solution 1
tmp %>% gather("colname", "value", -municipality) %>%
filter(municipality == "M1") %>% #too simplify, should be replaced with group_by(municipality)
na.omit() %>% mutate(colname = str_replace(colname, "\\d", replacement = "")) %>%
spread(., key = "colname", value = "value")
#Solution 2
tmp %>% gather("colname", "value", -municipality) %>%
filter(municipality == "M1") %>% # same as above
na.omit() %>% mutate(colname = str_replace(colname, "\\d", replacement = "")) %>%
dcast(municipality + value ~colname)
Solution 1 results in the following error:
Error: Each row of output must be identified by a unique combination of keys.
Solution 2 results in the following data frame (which is the desired result except it needs to be collapsed):
municipality value name phone
1 M1 n1 n1 <NA>
2 M1 n3 n3 <NA>
3 M1 p1 <NA> p1
4 M1 p3 <NA> p3
Are you looking for?
library(dplyr)
library(tidyr)
tmp %>%
gather(key, value, -municipality, na.rm = TRUE) %>%
mutate(key = gsub("\\d+", "", key)) %>%
group_by(municipality, key) %>%
mutate(row = row_number()) %>%
spread(key, value) %>%
select(-row)
# municipality name phone
# <chr> <chr> <chr>
#1 M1 n1 p1
#2 M1 n3 p3
#3 M2 n2 p2
#4 M2 n4 p4
#5 M2 n5 p5
We can use gather to bring the data into long format, dropping NA values. We then remove the numbers from the column names so that they share the same key, add a row index within each group of municipality and key (so that every output row has a unique identifier), and spread the data back into wide format.
We can do this elegantly with pivot_longer from the dev version of tidyr
library(dplyr)
library(tidyr)# 0.8.3.9000
library(stringr)
tmp %>%
rename_at(-1, ~str_replace(., "(\\d+$)", "_\\1")) %>%
pivot_longer(cols = -municipality, names_to = c(".value", "group"),
names_sep="_", values_drop_na = TRUE) %>%
select(-group)
# A tibble: 5 x 3
# municipality name phone
# <chr> <chr> <chr>
#1 M1 n1 p1
#2 M1 n3 p3
#3 M2 n2 p2
#4 M2 n4 p4
#5 M2 n5 p5
Or another option is melt from data.table
library(data.table)
melt(setDT(tmp), measure = patterns("^name", "^phone"),
value.name = c("name", "phone"), na.rm = TRUE)[, variable := NULL][]
#. municipality name phone
#1: M1 n1 p1
#2: M2 n2 p2
#3: M1 n3 p3
#4: M2 n4 p4
#5: M2 n5 p5

Create a new column in dplyr by appending values to a list from other columns?

I would like to make a new column by appending to a list conditional on the values of other columns. If possible, I would like to do so in dplyr. Sample input and desired output is below.
Suppose a dataframe newdata:
col1 col2 col3 col4
dog cat NA NA
NA cat foo bar
dog NA NA NA
NA cat NA NA
Here is my desired output, with the new column newCol:
col1 col2 col3 col4 newCol
dog cat NA NA (dog, cat)
NA cat foo bar (cat, foo, bar)
dog NA NA NA (dog)
NA cat NA NA (cat)
I have tried using ifelse within mutate and case_when within mutate, but both will not allow concatenation to a list. Here is my (unsuccessful) attempt with case_when:
newdata = newdata %>% mutate(
newCol = case_when(
col1 == "dog" ~ c("dog"),
col2 == "cat" ~ c(newCol, "cat"),
col3 == "foo" ~ c(newCol, "foo"),
col4 == "bar" ~ c(newcol, "dog")
)
)
I tried a similar approach with an ifelse statement for each column but also could not append to the list.
In the Note at the end we show the input data used here. It is as in the question except we have added a row of NAs at the end to show that all solutions work in that case too.
We show both list and character column solutions. The question specifically refers to list so this is the assumed desired output but if it was intended that newCol be a character vector then we show that as well.
This is so easy to do using base functions that we show that first; however, we do redo it in tidyverse although it involves significantly more code.
1) base We can use apply like this:
# Drop the missing entries of x and strip any names from what remains.
reduce <- function(x) {
  present <- !is.na(x)
  unname(x[present])
}
DF$newCol <- apply(DF, 1, reduce)
giving the following where newCol is a list whose first component is c("dog", "cat"), etc.
col1 col2 col3 col4 newCol
1 dog cat <NA> <NA> dog, cat
2 <NA> cat foo bar cat, foo, bar
3 dog <NA> <NA> <NA> dog
4 <NA> cat <NA> <NA> cat
5 <NA> <NA> <NA> <NA>
The last line of code could alternately be:
# Split DF into one-row pieces and reduce each; seq_len() is used because
# 1:nrow(DF) would evaluate to c(1, 0) for a zero-row DF.
DF$newCol <- lapply(split(DF, seq_len(nrow(DF))), reduce)
The question refers to concatenating to a list so I assume that a list is wanted for newCol but if a string is wanted then use this for reduce instead:
# Format the non-missing entries of x as one string of the form "(a, b, ...)".
reduce_ch <- function(x) {
  present <- x[!is.na(x)]
  paste0("(", paste(present, collapse = ", "), ")")
}
apply(DF, 1, reduce_ch)
2) tidyverse or using dplyr/tidyr/tibble we gather it to long form, remove the NAs, nest it, sort it back to the original order and cbind it back with DF.
library(dplyr)
library(tibble)
library(tidyr)
DF %>%
rownames_to_column %>%
gather(colName, Value, -rowname) %>%
na.omit %>%
select(-colName) %>%
nest(Value, .key = newCol) %>%
arrange(rowname) %>%
left_join(cbind(DF %>% rownames_to_column), .) %>%
select(-rowname)
giving:
col1 col2 col3 col4 newCol
1 dog cat <NA> <NA> dog, cat
2 <NA> cat foo bar cat, foo, bar
3 dog <NA> <NA> <NA> dog
4 <NA> cat <NA> <NA> cat
5 <NA> <NA> <NA> <NA> NULL
If character output is wanted then use this instead:
DF %>%
rownames_to_column %>%
gather(colName, Value, -rowname) %>%
select(-colName) %>%
group_by(rowname) %>%
summarize(newCol = sprintf("(%s)", toString(na.omit(Value)))) %>%
ungroup %>%
{ cbind(DF, .) } %>%
select(-rowname)
giving:
col1 col2 col3 col4 newCol
1 dog cat <NA> <NA> (dog, cat)
2 <NA> cat foo bar (cat, foo, bar)
3 dog <NA> <NA> <NA> (dog)
4 <NA> cat <NA> <NA> (cat)
5 <NA> <NA> <NA> <NA> ()
Note
The input DF in reproducible form:
Lines <- "col1 col2 col3 col4
dog cat NA NA
NA cat foo bar
dog NA NA NA
NA cat NA NA
NA NA NA NA"
DF <- read.table(text = Lines, header = TRUE, as.is = TRUE)
Solution using na.omit() and paste() with collapse argument:
apply(newdata, 1,
function(x) paste0("(", paste(na.omit(x), collapse = ", "), ")"))
[1] "(dog, cat)" "(cat, foo, bar)" "(dog)" "(cat)"
Demo
This looks like a use case for tidyr::unite. You'll still need to do some dplyr cleanup at the end, but this should work for now.
library(tibble)
library(dplyr)
library(tidyr)
df <- tribble(~col1, ~col2, ~col3, ~col4,
"dog", "cat", NA, NA,
NA, "cat", "foo", "bar",
"dog", NA, NA, NA,
NA, "cat", NA, NA)
df %>%
  unite(newCol, col1, col2, col3, col4,
        remove = FALSE,
        # Drop missing values before pasting, so no "NA" text has to be
        # stripped afterwards (which would also damage values such as "BANANA")
        na.rm = TRUE,
        sep = ', ') %>%
  # Add the parentheses on either side
  mutate(newCol = paste0('(', newCol, ')'))
#> # A tibble: 4 x 5
#> newCol col1 col2 col3 col4
#> <chr> <chr> <chr> <chr> <chr>
#> 1 (dog, cat) dog cat <NA> <NA>
#> 2 (cat, foo, bar) <NA> cat foo bar
#> 3 (dog) dog <NA> <NA> <NA>
#> 4 (cat) <NA> cat <NA> <NA>
Also for what it's worth, other people are discussing this problem!

Resources