Elegant solution for casting (spreading) multiple columns of character vectors - r

I want to transform a data frame with contact information for a list of municipalities, in which similar information (e.g. phone numbers) appears in multiple columns.
I have tried using both reshape2::dcast() and tidyr::spread(), neither of which solves my problem. I have also checked other posts on Stack Overflow, e.g.
Multiple column spread
I have yet to find a solution that works. It seems to me that the problem should be fairly straightforward (and solvable with spread or dcast).
tmp <- tibble(municipality = c("M1", "M2"),
name1 = c("n1", "n2"), name2 = c("n3", "n4"), name3 = c(NA, "n5"), # placeholder names
phone1 = c("p1", "p2"), phone2 = c("p3", "p4"), phone3 = c(NA, "p5")) # placeholder phone numbers
# Solution 1
# Reshape to long, strip the digit from the column names, then spread back to
# wide. This fails because several rows end up sharing the same
# municipality/colname key (see the error quoted below).
tmp %>% gather("colname", "value", -municipality) %>%
filter(municipality == "M1") %>% # to simplify; should be replaced with group_by(municipality)
na.omit() %>% mutate(colname = str_replace(colname, "\\d", replacement = "")) %>%
spread(., key = "colname", value = "value")
#Solution 2
tmp %>% gather("colname", "value", -municipality) %>%
filter(municipality == "M1") %>% # same as above
na.omit() %>% mutate(colname = str_replace(colname, "\\d", replacement = "")) %>%
dcast(municipality + value ~colname)
Solution 1 results in the following error:
Error: Each row of output must be identified by a unique combination of keys.
Solution 2 results in the following data frame (which is the desired result except it needs to be collapsed):
municipality value name phone
1 M1 n1 n1 <NA>
2 M1 n3 n3 <NA>
3 M1 p1 <NA> p1
4 M1 p3 <NA> p3

Are you looking for?
library(dplyr)
library(tidyr)
# Reshape to long (dropping NAs), strip the numeric suffix so that
# name1/name2/... share one key, and number the values within each
# municipality/key pair so every output row is uniquely identified.
# Then spread back to wide and discard the helper index.
tmp %>%
  gather(key, value, -municipality, na.rm = TRUE) %>%
  mutate(key = gsub("\\d+", "", key)) %>%
  group_by(municipality, key) %>%
  mutate(row = row_number()) %>%
  ungroup() %>% # drop the grouping so the result is a plain, ungrouped tibble
  spread(key, value) %>%
  select(-row)
# municipality name phone
# <chr> <chr> <chr>
#1 M1 n1 p1
#2 M1 n3 p3
#3 M2 n2 p2
#4 M2 n4 p4
#5 M2 n5 p5
We can use gather to bring the data into long format, dropping NA values. We then remove the digits from the column names so that related columns share the same key, group_by municipality and key to create a row-index column, and spread the data into wide format.

We can do this elegantly with pivot_longer from the dev version of tidyr
library(dplyr)
library(tidyr)# 0.8.3.9000
library(stringr)
tmp %>%
rename_at(-1, ~str_replace(., "(\\d+$)", "_\\1")) %>%
pivot_longer(cols = -municipality, names_to = c(".value", "group"),
names_sep="_", values_drop_na = TRUE) %>%
select(-group)
# A tibble: 5 x 3
# municipality name phone
# <chr> <chr> <chr>
#1 M1 n1 p1
#2 M1 n3 p3
#3 M2 n2 p2
#4 M2 n4 p4
#5 M2 n5 p5
Or another option is melt from data.table
library(data.table)
# melt() with patterns() stacks the name* and phone* columns side by side
# into "name" and "phone"; rows where the values are NA are dropped, and the
# generated `variable` index column is then removed.
melt(setDT(tmp), measure = patterns("^name", "^phone"),
value.name = c("name", "phone"), na.rm = TRUE)[, variable := NULL][]
#   municipality name phone
#1: M1 n1 p1
#2: M2 n2 p2
#3: M1 n3 p3
#4: M2 n4 p4
#5: M2 n5 p5

Related

How can I make a custom aggregation of a dataframe in R?

I have a dataframe such as
group <- c("A", "A", "B", "C", "C")
tx <- c("A-201", "A-202", "B-201", "C-205", "C-206")
feature <- c("coding", "decay", "pending", "coding", "coding")
df <- data.frame(group, tx, feature)
I want to generate a new df with the entries in tx "listed" for each feature. I want the output to look like
group <- c("A", "B", "C")
coding <- c("A-201", NA, "C-205|C-206")
decay <- c("A-202", NA, NA)
pending <- c(NA, "B-201", NA)
df.out <- data.frame(group, coding, decay, pending)
So far I did not find a means to achieve this via a dplyr function. Do I have to loop through my initial df?
You may get the data in wide format using tidyr::pivot_wider and use a function in values_fn -
df.out <- tidyr::pivot_wider(df, names_from = feature, values_from = tx,
values_fn = function(x) paste0(x, collapse = '|'))
df.out
# group coding decay pending
# <chr> <chr> <chr> <chr>
#1 A A-201 A-202 NA
#2 B NA NA B-201
#3 C C-205|C-206 NA NA
Here is an alternative way:
library(dplyr)
library(tidyr)
# Collapse tx into one "|"-separated string per group/feature pair, then
# spread the features into columns. summarise() yields exactly one row per
# pair, so no distinct() pass is needed, and .groups = "drop" returns an
# ungrouped tibble.
df %>%
  group_by(group, feature) %>%
  summarise(tx = paste(tx, collapse = "|"), .groups = "drop") %>%
  pivot_wider(
    names_from = feature,
    values_from = tx
  )
group coding decay pending
<chr> <chr> <chr> <chr>
1 A A-201 A-202 NA
2 B NA NA B-201
3 C C-205|C-206 NA NA
Using dcast from data.table
library(data.table)
dcast(setDT(df), group ~ feature, value.var = 'tx',
function(x) paste(x, collapse = "|"), fill = NA)
group coding decay pending
1: A A-201 A-202 <NA>
2: B <NA> <NA> B-201
3: C C-205|C-206 <NA> <NA>

Collapsing Columns in R using tidyverse with mutate, replace, and unite. Writing a function to reuse?

Data:
ID
B
C
1
NA
x
2
x
NA
3
x
x
Results:
ID
Unified
1
C
2
B
3
B_C
I'm trying to combine columns B and C using mutate and unite, but how would I scale this up so that I can reuse it for many columns (think 100+) instead of having to write out the variables each time? Or is there a built-in function that already does this?
My current solution is this:
library(tidyverse)
Data %>%
mutate(B = replace(B, B == 'x', 'B'), C = replace(C, C == 'x', 'C')) %>%
unite("Unified", B:C, na.rm = TRUE, remove= TRUE)
We may use across to loop over the column, replace the value that corresponds to 'x' with column name (cur_column())
library(dplyr)
library(tidyr)
Data %>%
mutate(across(B:C, ~ replace(., .== 'x', cur_column()))) %>%
unite(Unified, B:C, na.rm = TRUE, remove = TRUE)
-output
ID Unified
1 1 C
2 2 B
3 3 B_C
data
Data <- structure(list(ID = 1:3, B = c(NA, "x", "x"), C = c("x", NA,
"x")), class = "data.frame", row.names = c(NA, -3L))
Here are a couple of options.
Using dplyr -
library(dplyr)
# Columns to collapse: everything except the first (ID) column. Because the
# selection below is driven by `cols`, adding more columns to Data needs no
# further code changes.
cols <- names(Data)[-1]
Data <- Data %>%
  rowwise() %>%
  # c_across(all_of(cols)) returns this row's values in the same order as
  # `cols`, so the logical mask lines up with the column names.
  mutate(
    Unified = paste0(cols[!is.na(c_across(all_of(cols)))], collapse = "_")
  ) %>%
  ungroup()
Data
# ID B C Unified
# <int> <chr> <chr> <chr>
#1 1 NA x C
#2 2 x NA B
#3 3 x x B_C
Base R
Data$Unified <- apply(Data[cols], 1, function(x)
paste0(cols[!is.na(x)], collapse = '_'))

Add more rows based on a grouping variable R

I'd like to add more rows to my dataset based on a grouping variable. Right now, my data has 2 rows but I would like 3 rows and the var app to be repeated for the third row.
This is what my data currently looks like:
my_data <- data.frame(app = c('a','b'), type = c('blue','red'), code = c(1:2), type_2 = c(NA, 'blue'), code_2 = c(NA, 3))
app type code type_2 code_2
a blue 1 NA NA
b red 2 blue 3
I would like the data to look like this:
app type code
a blue 1
b red 2
b blue 3
library(data.table)
setDT(my_data)
res <-
melt(
my_data,
id.vars = "app",
measure.vars = patterns(c("^type", "^code")),
value.name = c("type", "code")
)[!is.na(type), .(app, type, code)]
Using tidyverse
library(dplyr)
library(stringr)
library(tidyr)
my_data %>%
rename_at(vars(c(type, code)), ~ str_c(., "_1")) %>%
pivot_longer(cols = -app, names_to = c(".value", "grp"), names_sep = "_",
values_drop_na = TRUE) %>% select(-grp)
# A tibble: 3 x 3
# app type code
# <chr> <chr> <dbl>
#1 a blue 1
#2 b red 2
#3 b blue 3

paste column elements with condition in r

I have a data frame and I want to paste elements in name1, name2 and name3 which do not contain NA.
c <- data.frame(name1 = letters[1:3],
name2 = c('A', NA, 'C'),
name3 = c('pig', 'cow', NA)
)
The result should look like this:
c %>% mutate(new_name = c('a&A&pig', 'b&cow', 'c&C'))
When I use paste0() it binds all the elements including NA. I do not want this.
c %>% mutate(new_name = paste0(name1,'&', name2, '&', name3))
Then I tried two other methods. One splits the data frame into a list with group_split(); the other nests the data frame by index. In both cases I then used map() and select() to pick the columns that do not contain NA, but both attempts failed.
c %>%
mutate(index = row_number()) %>%
group_split(index) %>%
map(select(~where(~!any(is.na(.)))))
c %>%
mutate(index = row_number()) %>%
nest(data = name1:name3) %>%
mutate(without_NA_data = map(data, select(~where(~!any(is.na(.))))))
Is there any way I can get what I want?
Any help will be highly appreciated!
We can use rowwise with c_across by loading only dplyr package
library(dplyr)
c %>%
rowwise %>%
mutate(new_name = paste(na.omit(c_across(everything())), collapse="&")) %>%
ungroup
# A tibble: 3 x 4
# name1 name2 name3 new_name
# <chr> <chr> <chr> <chr>
#1 a A pig a&A&pig
#2 b <NA> cow b&cow
#3 c C <NA> c&C
Or with pmap
library(purrr)
c %>%
mutate(new_name = pmap_chr(., ~ paste(na.omit(c(...)), collapse="&")))
# name1 name2 name3 new_name
#1 a A pig a&A&pig
#2 b <NA> cow b&cow
#3 c C <NA> c&C
Or using base R with paste and replace
# Replace NA with "", paste the columns with "&", then collapse the runs of
# "&" left behind by interior NAs and strip any leading/trailing "&".
pasted <- do.call(paste, c(replace(c, is.na(c), ""), sep = "&"))
gsub("^&+|&+$", "", gsub("&+", "&", pasted))
#[1] "a&A&pig" "b&cow"   "c&C"
Or using apply
apply(c, 1, function(x) paste(na.omit(x), collapse="&"))
#[1] "a&A&pig" "b&cow" "c&C"
Or paste first and remove the NA substring
# paste() turns missing values into the literal token "NA". Remove that token
# only when it spans a whole field (delimited by "&" or the string ends), so
# values that merely contain "NA" (e.g. "NATO", "DNA") are left intact.
pasted <- do.call(paste, c(c, sep = "&"))
# Dropping an NA in the first field leaves a leading "&"; strip it.
sub("^&", "", gsub("(^|&)NA(?=&|$)", "", pasted, perl = TRUE))
#[1] "a&A&pig" "b&cow"   "c&C"
We can use unite from tidyr by using na.rm = TRUE to remove NA values
tidyr::unite(c, new_name, starts_with('name'),
sep = '&', na.rm = TRUE, remove = FALSE)
# new_name name1 name2 name3
#1 a&A&pig a A pig
#2 b&cow b <NA> cow
#3 c&C c C <NA>

Tidying table with multiple groups of wide columns, using tidyverse

I often find myself in a situation where I have a table that contains multiple groups of wide columns, like so:
replicate groupA VA1 VA2 groupB VB1 VB2
1 1 a 0.3429166 -2.30336406 f 0.05363582 1.6454078
2 2 b -1.3183732 -0.13516849 g -0.42586417 0.1541541
3 3 c -0.7908358 -0.10746447 h 1.05134242 1.4297350
4 4 d -0.9963677 -1.82557058 i -1.14532536 1.0815733
5 5 e -1.3634609 0.04385812 j -0.65643595 -0.1452877
And I'd like to turn the columns into one long table, like so:
replicate group key value
1 1 a V1 0.34291665
2 2 b V1 -1.31837322
3 3 c V1 -0.79083580
4 4 d V1 -0.99636772
5 5 e V1 -1.36346088
6 1 a V2 -2.30336406
7 2 b V2 -0.13516849
8 3 c V2 -0.10746447
9 4 d V2 -1.82557058
10 5 e V2 0.04385812
11 1 f V1 0.05363582
12 2 g V1 -0.42586417
13 3 h V1 1.05134242
14 4 i V1 -1.14532536
15 5 j V1 -0.65643595
16 1 f V2 1.64540784
17 2 g V2 0.15415408
18 3 h V2 1.42973499
19 4 i V2 1.08157329
20 5 j V2 -0.14528774
I can do this by selecting the two groups of columns individually, tidying, and then rbinding together (code below). However, this approach doesn't seem particularly elegant, and it becomes cumbersome if there are more than two groups of columns. I'm wondering whether there's a more elegant approach, using a single pipe chain of data transformations.
The fundamental question here is: How do we automate the process of breaking the table into groups of columns, tidying those, and then combining back together.
My current code:
library(dplyr)
library(tidyr)
# generate example code
df_wide <- data.frame(replicate = 1:5,
groupA = letters[1:5],
VA1 = rnorm(5),
VA2 = rnorm(5),
groupB = letters[6:10],
VB1 = rnorm(5),
VB2 = rnorm(5))
# tidy columns with A in the name
dfA <- select(df_wide, replicate, groupA, VA1, VA2) %>%
gather(key, value, VA1, VA2) %>%
mutate(key = case_when(key == "VA1" ~ "V1",
key == "VA2" ~ "V2")) %>%
select(replicate, group = groupA, key, value)
# tidy columns with B in the name
dfB <- select(df_wide, replicate, groupB, VB1, VB2) %>%
gather(key, value, VB1, VB2) %>%
mutate(key = case_when(key == "VB1" ~ "V1",
key == "VB2" ~ "V2")) %>%
select(replicate, group = groupB, key, value)
# combine
df_long <- rbind(dfA, dfB)
Note: Similar questions have been asked here and here, but I think the accepted answer shows that this here is a subtly different problem.
1
Although the question asked for a tidyverse solution, there is a convenient option with melt from data.table, which also can take multiple patterns in the measure argument.
library(data.table)
setnames(melt(melt(setDT(df1), measure = patterns('group', 'VA', 'VB')),
id.var = 1:3)[, -4, with = FALSE], 2:3, c('key', 'group'))[]
2. a
With tidyverse, we can subset the dataset into a list, then loop through the list with map_df, converting each element to 'long' format with gather, to get a single data.frame
library(tidyverse)
list(df1[1:4], df1[c(1,5:7)]) %>%
map_df(~gather(., key, value, 3:4) %>%
{names(.)[2] <- 'group';.}) %>%
mutate(key = sub('(.).(.)', '\\1\\2', key))
# replicate group key value
#1 1 a V1 0.34291660
#2 2 b V1 -1.31837320
#3 3 c V1 -0.79083580
#4 4 d V1 -0.99636770
#5 5 e V1 -1.36346090
#6 1 a V2 -2.30336406
#7 2 b V2 -0.13516849
#8 3 c V2 -0.10746447
#9 4 d V2 -1.82557058
#10 5 e V2 0.04385812
#11 1 f V1 0.05363582
#12 2 g V1 -0.42586417
#13 3 h V1 1.05134242
#14 4 i V1 -1.14532536
#15 5 j V1 -0.65643595
#16 1 f V2 1.64540780
#17 2 g V2 0.15415410
#18 3 h V2 1.42973500
#19 4 i V2 1.08157330
#20 5 j V2 -0.14528770
2.b
If we need to split based on the occurrence of 'group'
split.default(df1[-1], cumsum(grepl('group', names(df1)[-1]))) %>%
map(~bind_cols(df1[1], .)) %>%
map_df(~gather(., key, value, 3:4) %>%
{names(.)[2] <- 'group';.}) %>%
mutate(key = sub('(.).(.)', '\\1\\2', key))
2.c
Included rename_at instead of names assignment in the spirit of tidyverse options
# Split the non-id columns into one chunk per 'group*' column, re-attach the
# first (replicate) column to each chunk, reshape each chunk to long, and
# rename its second column to the first five characters of its name
# ("groupA"/"groupB" -> "group") so the chunks row-bind cleanly.
# A plain function replaces the deprecated funs() helper.
df1[-1] %>%
  split.default(cumsum(grepl('group', names(df1)[-1]))) %>%
  map_df(~ bind_cols(df1[1], .) %>%
    gather(., key, value, 3:4) %>%
    rename_at(2, function(nm) substring(nm, 1, 5)))
NOTE:
1) All of 2.a, 2.b and 2.c use tidyverse functions
2) They do not depend on the substring 'A' or 'B' in the column names
3) Assumed the patterns in the OP's dataset will be 'group' followed by value columns
1) This solution consists of a:
gather which generates the desired number of rows
a mutate which combines the groupA and groupB columns and changes the key column to that requested and
select which picks out the columns wanted.
First gather the columns whose names start with V and then create a new group column from groupA and groupB choosing groupA if the key has an A in it and groupB if the key has B in it. (We used mapply(switch, ...) here for easy extension to the 3+ group case but we could have used an ifelse, viz. ifelse(grepl("A", key), as.character(groupA), as.character(groupB)), given that we have only two groups.) The mutate also reduces the key names from VA1 to V1, etc. and finally select out the columns desired.
DF %>%
gather(key, value, starts_with("V")) %>%
mutate(group = mapply(switch, gsub("[^AB]", "", key), A = groupA, B = groupB),
key = sub("[AB]", "", key)) %>%
select(replicate, group, key, value)
giving:
replicate group key value
1 1 a V1 0.34291660
2 2 b V1 -1.31837320
3 3 c V1 -0.79083580
4 4 d V1 -0.99636770
5 5 e V1 -1.36346090
6 1 a V2 -2.30336406
7 2 b V2 -0.13516849
8 3 c V2 -0.10746447
9 4 d V2 -1.82557058
10 5 e V2 0.04385812
11 1 f V1 0.05363582
12 2 g V1 -0.42586417
13 3 h V1 1.05134242
14 4 i V1 -1.14532536
15 5 j V1 -0.65643595
16 1 f V2 1.64540780
17 2 g V2 0.15415410
18 3 h V2 1.42973500
19 4 i V2 1.08157330
20 5 j V2 -0.14528770
2) Another approach would be to split the columns into groups such that all columns in a group have the same name after removing A and B from their names. Perform unlist on each such group to reduce the list to a list of plain vectors and convert that list to a data.frame. Finally gather the V columns and rearrange. Note that rownames_to_column is from the tibble package.
DF %>%
as.list %>%
split(sub("[AB]", "", names(.))) %>%
lapply(unlist) %>%
as.data.frame %>%
rownames_to_column %>%
gather(key, value, starts_with("V")) %>%
arrange(gsub("[^AB]", "", rowname), key) %>%
select(replicate, group, key, value)
2a) If the row order is not important then the rownames_to_column, arrange and select lines could be omitted shortening it to this:
DF %>%
as.list %>%
split(sub("[AB]", "", names(.))) %>%
lapply(unlist) %>%
as.data.frame %>%
gather(key, value, starts_with("V"))
Solutions (2) and (2a) could easily be converted to base-only solutions by replacing the gather with the appropriate reshape from base as in the second reshape, i.e. the one producing d2, in (3).
3) Although the question asked for a tidyverse solution there is a fairly convenient base solution consisting of two reshape calls. The varying produced by the split is: list(group = c("groupA", "groupB"), V1 = c("VA1", "VB1"), V2 = c("VA2", "VB2")) -- that is it matches up the ith column in each set of columns.
# Step 1: stack the A and B column sets on top of each other. `varying` pairs
# up the i-th column of each set: group = (groupA, groupB), V1 = (VA1, VB1),
# V2 = (VA2, VB2).
varying <- split(names(DF)[-1], gsub("[AB]", "", names(DF))[-1])
d <- reshape(DF, direction = "long", varying = varying,
             v.names = names(varying))
# Drop the bookkeeping columns that reshape() adds.
d <- subset(d, select = -c(time, id))
# Step 2: stack V1 and V2 into key/value pairs. `times` labels the key with
# the column names ("V1", "V2") instead of the default 1, 2.
v_cols <- grep("V", names(d), value = TRUE)
d2 <- reshape(d, direction = "long", varying = list(v_cols),
              v.names = "value", timevar = "key", times = v_cols)
# The id column is named `replicate` in DF.
d2 <- subset(d2, select = c(replicate, group, key, value))
d2
Note: The input in reproducible form is:
DF <- structure(list(replicate = 1:5, groupA = structure(1:5, .Label = c("a",
"b", "c", "d", "e"), class = "factor"), VA1 = c(0.3429166, -1.3183732,
-0.7908358, -0.9963677, -1.3634609), VA2 = c(-2.30336406, -0.13516849,
-0.10746447, -1.82557058, 0.04385812), groupB = structure(1:5, .Label = c("f",
"g", "h", "i", "j"), class = "factor"), VB1 = c(0.05363582, -0.42586417,
1.05134242, -1.14532536, -0.65643595), VB2 = c(1.6454078, 0.1541541,
1.429735, 1.0815733, -0.1452877)), .Names = c("replicate", "groupA",
"VA1", "VA2", "groupB", "VB1", "VB2"), class = "data.frame", row.names = c("1",
"2", "3", "4", "5"))

Resources