Coalescing multiple columns from both the left and right side - r

Given the following data
df1 <- structure(list(ID = 1:3, alpha_1 = c(2L, 2L, 3L),
alpha_2 = c(1L, 2L,
3L), alpha_3 = c(4L, 4L, 2L), alpha_4 = c(3L, NA, NA), beta_1 = c(NA,
2L, NA), beta_2 = c(3L, NA, 2L), charlie_1 = c(1L, NA, 1L), charlie_2 = c(NA,
2L, NA)), class = "data.frame", row.names = c(NA, -3L))
I'm trying to coalesce all columns sharing the same initial prefix name (i.e. coalesce alpha_1, alpha_2, alpha_3, alpha_4, and coalesce beta_1 beta_2, etc.), but from both the left and right sides. That is, I want to generate two new variables, say 'alpha_left' and 'alpha_right', whose columns would be, in this example, (2, 2, 3) and (3, 4, 2) respectively (first non-missing elements from the left and right side of the dataframe).
User @akrun offered a great solution for the coalescing part here, but I'm unsure how to create two new variables, one coalesced from the left and one from the right.

Here is an option in tidyverse
Reshape to 'long' format - pivot_longer
Grouped by 'ID'
Do the summarise across the columns 'alpha' till 'charlie'
Get the column name - cur_column()
Create a tibble with the first non-NA element from the left and the right
Change the column names by prepending 'nm1' as a prefix
Finally, unnest the list columns created in summarise
library(dplyr)
library(tidyr)
library(stringr)
# Reshape to long format: the prefix of each column name becomes a value
# column (alpha, beta, charlie) and the numeric suffix goes to 'grp'.
df1 %>%
pivot_longer(cols = contains("_"),
names_to = c( ".value", "grp"), names_sep = "_") %>%
group_by(ID) %>%
# For every value column, build a one-row tibble holding the first non-NA
# value scanning forwards ('left') and scanning backwards ('right').
summarise(across(alpha:charlie, ~ {
nm1 <- cur_column()
tbl1 <- tibble(left= .[complete.cases(.)][1],
right = rev(.)[complete.cases(rev(.))][1]);
# Prefix the tibble's names with the current column name,
# e.g. alpha_left / alpha_right.
names(tbl1) <- str_c(nm1, "_", names(tbl1))
# Wrap in a list so the tibble is stored in a list-column.
list(tbl1)})) %>%
# Expand the list-columns into the <prefix>_left / <prefix>_right columns.
unnest(c(alpha, beta, charlie))
-output
# A tibble: 3 x 7
ID alpha_left alpha_right beta_left beta_right charlie_left charlie_right
<int> <int> <int> <int> <int> <int> <int>
1 1 2 3 3 3 1 1
2 2 2 4 2 2 2 2
3 3 3 2 2 2 1 1
Or using base R
# For each prefix group (alpha, beta, ...), build a two-column data frame with
# the first ('left') and last ('right') non-NA value of every row.
# Both columns are always present: a row with a single non-NA value repeats it
# in both columns, and a row without any non-NA value yields NA in both.
lst1 <- lapply(
  split.default(df1[-1], sub("_\\d+$", "", names(df1)[-1])),
  function(x) {
    m <- as.matrix(x)
    ok <- !is.na(m)
    rows <- seq_len(nrow(m))
    # max.col() on the logical mask gives, per row, the position of the first
    # or last non-NA entry; for an all-NA row it still points at an NA cell.
    data.frame(
      left = m[cbind(rows, max.col(ok, ties.method = "first"))],
      right = m[cbind(rows, max.col(ok, ties.method = "last"))]
    )
  }
)

You could also do:
library(dplyr)
library(purrr)
# Split the value columns by prefix, then coalesce each group in both directions.
df1[-1] %>%
  split.default(sub("_\\d+$", "", names(.))) %>%
  imap_dfc(~ data.frame(
    # coalesce() keeps the first non-NA value per row, so the original column
    # order yields the leftmost value and the reversed order the rightmost one.
    left = coalesce(!!!.x),
    right = coalesce(!!!rev(.x))
  ) %>%
    set_names(paste(.y, names(.), sep = "_")))
alpha_left alpha_right beta_left beta_right charlie_left charlie_right
1 2 3 3 3 1 1
2 2 4 2 2 2 2
3 3 2 2 2 1 1

One more approach, not as elegant as @Onyambu's
library(tidyverse)
# Split the value columns by prefix and compute both ends row by row.
df1[-1] %>%
split.default(sub("_\\d+", "", names(.))) %>%
imap_dfc(~ .x %>% rowwise() %>%
# First non-NA value across all columns of the current group.
mutate(!!paste0(.y, '_left') := head(na.omit(c_across(everything())),1),
# Last non-NA value. !last_col() appears intended to skip the '<prefix>_left'
# column created just above, which is the last column at this point.
!!paste0(.y, '_right') := tail(na.omit(c_across(!last_col())),1),
.keep = 'none' )
)
#> # A tibble: 3 x 6
#> # Rowwise:
#> alpha_left alpha_right beta_left beta_right charlie_left charlie_right
#> <int> <int> <int> <int> <int> <int>
#> 1 2 3 3 3 1 1
#> 2 2 4 2 2 2 2
#> 3 3 2 2 2 1 1
Created on 2021-06-19 by the reprex package (v2.0.0)

Another option
library(tidyverse)
df1 <- structure(list(ID = 1:3, alpha_1 = c(2L, 2L, 3L),
alpha_2 = c(1L, 2L,
3L), alpha_3 = c(4L, 4L, 2L), alpha_4 = c(3L, NA, NA), beta_1 = c(NA,
2L, NA), beta_2 = c(3L, NA, 2L), charlie_1 = c(1L, NA, 1L), charlie_2 = c(NA,
2L, NA)), class = "data.frame", row.names = c(NA, -3L))
# Reshape to long format. The numeric suffix is parsed as an integer so that
# range() below selects the numerically smallest and largest position; as
# character, "10" would sort before "2" and the wrong rows would be kept.
df1 %>%
  pivot_longer(
    cols = -ID, names_sep = "_", names_to = c(".value", "set"),
    names_transform = list(set = as.integer)
  ) %>%
  group_by(ID) %>%
  # Fill NAs within each ID so that the first row holds the first non-NA value
  # and the last row holds the last non-NA value of every column.
  fill(alpha:charlie, .direction = "updown") %>%
  # Keep only the first and last position per ID and label them.
  filter(set %in% range(set)) %>%
  mutate(set = c("left", "right")) %>%
  pivot_wider(id_cols = ID, names_from = set, values_from = alpha:charlie)
#> # A tibble: 3 x 7
#> # Groups: ID [3]
#> ID alpha_left alpha_right beta_left beta_right charlie_left charlie_right
#> <int> <int> <int> <int> <int> <int> <int>
#> 1 1 2 3 3 3 1 1
#> 2 2 2 4 2 2 2 2
#> 3 3 3 2 2 2 1 1
Created on 2021-06-20 by the reprex package (v2.0.0)

Related

Coalescing multiple chunks of columns with the same suffix in names (R)

I have a dataset with various "chunks" of columns with different prefixes, but the same suffix:
ID
A034
B034
C034
D034
A099
B099
A123
B123
...
1
NA
1
NA
NA
NA
3
1
NA
...
2
2
NA
NA
NA
2
NA
NA
2
...
3
NA
NA
2
NA
NA
2
1
NA
...
The number of columns within each "chunk" also varies. Is there any way (other than manually, which is what I have been painstakingly doing with coalesce(!!! select(., contains("XXX")))) to automatically coalesce by chunk based on the shared suffix? That is, the result should resemble
ID
034
099
123
...
1
1
3
1
...
2
2
2
2
...
3
2
2
1
...
I'm not sure how to begin doing something like this, so any suggestions would be very helpful.
We reshape the data into 'long' format with pivot_longer, then we group by 'ID' and loop across the other columns, apply the na.omit to remove the NA elements (we assume that there is only one non-NA per each column by group)
library(dplyr)
library(tidyr)
# Strip the leading letter from the column names so that columns sharing a
# numeric suffix collapse into one value column per suffix.
df1 %>%
pivot_longer(cols = -ID, names_to = ".value",
names_pattern = "[A-Z](\\d+)") %>%
group_by(ID) %>%
# na.omit drops the NA entries; this relies on exactly one non-NA value
# remaining per ID and column.
summarise(across(everything(), na.omit), .groups = 'drop')
-output
# A tibble: 3 x 4
ID `034` `099` `123`
<int> <int> <int> <int>
1 1 1 3 1
2 2 2 2 2
3 3 2 2 1
Or to be safe, use complete.cases to create a logical vector for non-NA elements, and extract the first element (assuming we need only a single non-NA - if the non-NA lengths are different, we may need to return a list)
# Collapse columns sharing a numeric suffix into one value column per suffix,
# then keep the first non-NA value per ID and column (NA if there is none).
df1 %>%
pivot_longer(cols = -ID, names_to = ".value",
names_pattern = "[A-Z](\\d+)") %>%
group_by(ID) %>%
summarise(across(everything(), ~ .[complete.cases(.)][1]))
data
df1 <- structure(list(ID = 1:3, A034 = c(NA, 2L, NA), B034 = c(1L, NA,
NA), C034 = c(NA, NA, 2L), D034 = c(NA, NA, NA), A099 = c(NA,
2L, NA), B099 = c(3L, NA, 2L), A123 = c(1L, NA, 1L), B123 = c(NA,
2L, NA)), class = "data.frame", row.names = c(NA, -3L))
one more approach
library(tidyverse)
# Group the column names by their numeric suffix and build one data frame per
# suffix containing 'ID' plus that suffix's columns.
split(names(df1)[-1], gsub('^\\D*(\\d+)$', '\\1', names(df1)[-1])) %>% map(~df1[c('ID', .x)]) %>%
# For each suffix, take the first non-NA value per row and name the resulting
# column after the suffix (.y).
imap(~ .x %>% group_by(ID) %>% rowwise %>% transmute(!!.y := first(na.omit(c_across(everything())))) %>% ungroup) %>%
# Join the per-suffix results back together on 'ID'.
reduce(left_join, by = 'ID')
#> # A tibble: 3 x 4
#> ID `034` `099` `123`
#> <int> <int> <int> <int>
#> 1 1 1 3 1
#> 2 2 2 2 2
#> 3 3 2 2 1
Created on 2021-06-20 by the reprex package (v2.0.0)

Add multiple columns counting frequencies in rows in R

I have a dataset of protocols that participants followed and they chose how frequently they followed the protocol - never, sometimes, always. This is coded as 1, 2 and 3.
Sample df
Protocol 1
Protocol 2
Protocol 3
1
2
3
3
2
3
2
1
2
2
3
3
1
3
3
3
3
3
Each row corresponds to one participant saying how frequently they followed each protocol (never = 1, sometimes = 2, always = 3)
I want to transpose the df and add three columns counting the frequency of each option of each protocol and percentage in brackets
So, it would look something like this:
Never
Sometimes
Always
Protocol 1
1
3
2
2
1
3
2(33.33%)
2(33.33%)
2(33.33%)
Protocol 2
2
2
1
3
3
3
1(16.67%)
2(33.33%)
3(50%)
Protocol 3
3
3
2
3
3
3
0(0%)
1(16.67%)
5(83.33%)
I tried multiple solutions from stackoverflow and others but nothing seems to work. Even if I can't show percentages and just the frequency, that would work too.
Here is one tidyverse approach -
library(dplyr)
library(tidyr)
# Stack all protocol columns into name/value pairs.
df %>%
pivot_longer(cols = everything()) %>%
# Counting a factor with .drop = FALSE keeps answer codes that never occur
# for a protocol (they get a count of 0).
mutate(value = factor(value)) %>%
count(name, value, .drop = FALSE) %>%
group_by(name) %>%
# Format as "count (percent %)" within each protocol and label the codes.
mutate(n = sprintf('%d (%.2f %%)', n, prop.table(n) * 100),
value = recode(value,`1`='never',`2` = 'sometimes',`3` = 'always')) %>%
ungroup %>%
# One row per protocol, one column per answer label.
pivot_wider(names_from = value, values_from = n)
# name never sometimes always
# <chr> <chr> <chr> <chr>
#1 Protocol1 2 (33.33 %) 2 (33.33 %) 2 (33.33 %)
#2 Protocol2 1 (16.67 %) 2 (33.33 %) 3 (50.00 %)
#3 Protocol3 0 (0.00 %) 1 (16.67 %) 5 (83.33 %)
data
df <- structure(list(Protocol1 = c(1L, 3L, 2L, 2L, 1L, 3L), Protocol2 = c(2L,
2L, 1L, 3L, 3L, 3L), Protocol3 = c(3L, 3L, 2L, 3L, 3L, 3L)),
class = "data.frame", row.names = c(NA, -6L))
Here is another approach.
library(tidyverse)
library(glue)
# Per-protocol count and percentage for each answer code (1, 2, 3).
dat1 <- dat %>%
  pivot_longer(everything()) %>%
  group_by(name) %>%
  summarise(
    Never = glue("{sum(value == 1)} ({round(mean(value == 1) * 100, 2)}%)"),
    Sometimes = glue("{sum(value == 2)} ({round(mean(value == 2) * 100, 2)}%)"),
    Always = glue("{sum(value == 3)} ({round(mean(value == 3) * 100, 2)}%)"),
  )
# Transpose the raw answers (one row per protocol, one column per participant)
# and append the summary columns.
dat2 <- dat %>%
  pivot_longer(everything()) %>%
  group_by(name) %>%
  # Participant index within each protocol; becomes the wide column name.
  mutate(n = row_number()) %>%
  ungroup() %>%
  pivot_wider(
    id_cols = name,
    names_from = n,
    values_from = value
  ) %>%
  left_join(dat1, by = "name")
dat2
# # A tibble: 3 x 10
# name `1` `2` `3` `4` `5` `6` Never Sometimes Always
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <glue> <glue> <glue>
# 1 Protocol 1 1 3 2 2 1 3 2 (33.33%) 2 (33.33%) 2 (33.33%)
# 2 Protocol 2 2 2 1 3 3 3 1 (16.67%) 2 (33.33%) 3 (50%)
# 3 Protocol 3 3 3 2 3 3 3 0 (0%) 1 (16.67%) 5 (83.33%)
Data:
dat <- tibble(
`Protocol 1` = c(1, 3, 2, 2, 1, 3),
`Protocol 2` = c(2, 2, 1, 3, 3, 3),
`Protocol 3` = c(3, 3, 2, 3, 3, 3)
)

Pivot Wide with Custom Names, Original Values in the cell

I have data that is set up like the following - the CODE variable is character and needs to remain as it is because the numbers have meaning.
ID CODE
1 1.0
1 0.00
1 9.99
2 40.56
3 33.54
3 0.00
How would I use pivot wider to rearrange it so it is like the following, where I can have 4 CODE columns and if there isn't a fourth code per ID, it is just left blank
ID CODE_1 CODE_2 CODE_3 CODE_4
1 1.0 0.00 9.99 "."
2 40.56 "." "." "."
3 33.54 0.00 "." "."
Thank you!
This approach gets close to what you want. You can use the tidyverse function complete() to add the levels that are not present in your original values. Here is the code:
library(tidyverse)
#Code
# Number the codes within each ID as CODE_1, CODE_2, ... using a factor that
# always carries the four levels CODE_1..CODE_4, complete the missing levels
# per ID, then spread the codes into one column per level.
df <- df %>%
  group_by(ID) %>%
  mutate(Var = factor(paste0('CODE_', row_number()),
    levels = paste0('CODE_', 1:4),
    labels = paste0('CODE_', 1:4), ordered = TRUE,
    exclude = FALSE)) %>%
  complete(Var = Var) %>%
  pivot_wider(names_from = Var, values_from = CODE)
Output:
# A tibble: 3 x 5
# Groups: ID [3]
ID CODE_1 CODE_2 CODE_3 CODE_4
<int> <dbl> <dbl> <dbl> <dbl>
1 1 1 0 9.99 NA
2 2 40.6 NA NA NA
3 3 33.5 0 NA NA
Some data used:
#Data
df <- structure(list(ID = c(1L, 1L, 1L, 2L, 3L, 3L), CODE = c(1, 0,
9.99, 40.56, 33.54, 0)), class = "data.frame", row.names = c(NA,
-6L))
If you really want dots for missing values, you have to convert the variables to character and then replace the missing values like this:
#Code 2
# Expects `df` in its long format (columns ID and CODE).
# Number the codes within each ID as CODE_1..CODE_4, complete the missing
# levels per ID and spread the codes into one column per level.
df <- df %>%
  group_by(ID) %>%
  mutate(Var = factor(paste0('CODE_', row_number()),
    levels = paste0('CODE_', 1:4),
    labels = paste0('CODE_', 1:4), ordered = TRUE,
    exclude = FALSE)) %>%
  complete(Var = Var) %>%
  pivot_wider(names_from = Var, values_from = CODE) %>%
  # Convert to character so the missing cells can hold the '.' placeholder.
  mutate(across(CODE_1:CODE_4, ~as.character(.))) %>%
  replace(is.na(.), '.')
Output:
# A tibble: 3 x 5
# Groups: ID [3]
ID CODE_1 CODE_2 CODE_3 CODE_4
<int> <chr> <chr> <chr> <chr>
1 1 1 0 9.99 .
2 2 40.56 . . .
3 3 33.54 0 . .
We can use dcast from data.table
library(data.table)
# Cast to wide format: one row per ID and one CODE_<k> column per within-ID
# row position (rowid(ID)). setDT() converts df to a data.table by reference.
dcast(setDT(df), ID ~ paste0("CODE_", rowid(ID)), value.var = 'CODE')
# ID CODE_1 CODE_2 CODE_3
#1: 1 1.00 0 9.99
#2: 2 40.56 NA NA
#3: 3 33.54 0 NA
data
df <- structure(list(ID = c(1L, 1L, 1L, 2L, 3L, 3L), CODE = c(1, 0,
9.99, 40.56, 33.54, 0)), class = "data.frame", row.names = c(NA,
-6L))

R function to paste information from different rows with a common column? [duplicate]

This question already has an answer here:
dplyr::first() to choose first non NA value
(1 answer)
Closed 2 years ago.
I understand we can use the dplyr function coalesce() to unite different columns, but is there such function to unite rows?
I am struggling with a confusing incomplete/doubled dataframe with duplicate rows for the same id, but with different columns filled. E.g.
id sex age source
12 M NA 1
12 NA 3 1
13 NA 2 2
13 NA NA NA
13 F 2 NA
and I am trying to achieve:
id sex age source
12 M 3 1
13 F 2 2
You can try:
library(dplyr)
library(tidyr) # fill() is provided by tidyr, not dplyr
#Data
df <- structure(list(id = c(12L, 12L, 13L, 13L, 13L), sex = structure(c(2L,
NA, NA, NA, 1L), .Label = c("F", "M"), class = "factor"), age = c(NA,
3L, 2L, NA, 2L), source = c(1L, 1L, 2L, NA, NA)), class = "data.frame", row.names = c(NA,
-5L))
# Within each id, fill missing values downwards and then upwards so that every
# row carries all values known for that id, then keep one row per id.
df %>%
  group_by(id) %>%
  fill(everything(), .direction = "downup") %>%
  slice(1)
# A tibble: 2 x 4
# Groups: id [2]
id sex age source
<int> <fct> <int> <int>
1 12 M 3 1
2 13 F 2 2
As mentioned by @A5C1D2H2I1M1N2O1R2T1, you can select the first non-NA value in each group. This can be done using dplyr:
library(dplyr)
# First non-NA value of every non-grouping column within each id.
# .cols is given explicitly: calling across() without it is deprecated.
df %>%
  group_by(id) %>%
  summarise(across(everything(), ~ na.omit(.x)[1]))
# A tibble: 2 x 4
# id sex age source
# <int> <fct> <int> <int>
#1 12 M 3 1
#2 13 F 2 2
Base R :
# First non-NA value per id for every other column. na.action = 'na.pass'
# keeps rows containing NA so the function itself can skip over them.
aggregate(.~id, df, function(x) na.omit(x)[1], na.action = 'na.pass')
Or data.table :
library(data.table)
# Convert df to a data.table by reference, then take the first non-NA value
# of every non-grouping column (.SD) within each id.
setDT(df)[, lapply(.SD, function(x) na.omit(x)[1]), id]

Pmax of columns ending with a given string

I would like to conditionally mutate a new column representing the pmax() of columns ending with "_n" for a given row. I know I can do this by explicitly specifying the column names, but I would prefer to have this be the result of a call to ends_with() or similar.
I have tried mutate_at() and plain mutate(). My general thought is that I need to pass a vars(ends_with("_n")) to something, but I'm just missing that something.
Thanks in advance.
library(dplyr)
library(tidyr)
# Summarise disp by vs and gear (mean, sd, count), format "mean (sd)", and
# spread the per-vs summaries into `<vs>_n` / `<vs>_mean_sd` columns.
# NOTE(review): mutate_if(), nest(.key = ), spread() and unnest(.sep = ) come
# from older dplyr/tidyr interfaces; confirm they still run on the installed
# versions before reusing this snippet.
mtcars %>%
group_by(vs, gear) %>%
summarize(mean = mean(disp),
sd = sd(disp),
n = n()) %>%
# Round the double columns (mean, sd) to one decimal place.
mutate_if(is.double, round, 1) %>%
mutate(mean_sd = paste0(mean, " (", sd, ")")) %>%
select(-mean, -sd) %>%
group_by(vs, gear) %>%
# Pack n and mean_sd into a nested 'summary' column, one entry per vs/gear.
nest(n, mean_sd, .key = "summary") %>%
# One column per vs value (`0`, `1`), each holding the nested summaries.
spread(key = vs, value = summary) %>%
unnest(`0`, `1`, .sep = "_")
gear `0_n` `0_mean_sd` `1_n` `1_mean_sd`
<dbl> <int> <chr> <int> <chr>
1 3 12 357.6 (71.8) 3 201 (72)
2 4 2 160 (0) 10 115.6 (38.5)
3 5 4 229.3 (113.9) 1 95.1 (NA)
edit: both answers are much appreciated. Cheers!
Here's one way using the unquote-splice operator. We can select columns that we want to compare and then splice them as vectors into pmax:
library(tidyverse)
tbl <- structure(list(gear = c(3, 4, 5), `0_n` = c(12L, 2L, 4L), `0_mean_sd` = c("357.6 (71.8)", "160 (0)", "229.3 (113.9)"), `1_n` = c(3L, 10L, 1L), `1_mean_sd` = c("201 (72)", "115.6 (38.5)", "95.1 (NA)")), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame"))
# Select the columns whose names end in "_n" and splice them into pmax() as
# separate arguments, giving the row-wise maximum of those columns.
tbl %>%
mutate(pmax = pmax(!!!select(., ends_with("_n"))))
#> # A tibble: 3 x 6
#> gear `0_n` `0_mean_sd` `1_n` `1_mean_sd` pmax
#> <dbl> <int> <chr> <int> <chr> <int>
#> 1 3 12 357.6 (71.8) 3 201 (72) 12
#> 2 4 2 160 (0) 10 115.6 (38.5) 10
#> 3 5 4 229.3 (113.9) 1 95.1 (NA) 4
Created on 2019-04-23 by the reprex package (v0.2.1)
A base R version, just as an alternative:
tbl <- structure(
  list(
    gear = c(3, 4, 5),
    `0_n` = c(12L, 2L, 4L),
    `0_mean_sd` = c("357.6 (71.8)", "160 (0)", "229.3 (113.9)"),
    `1_n` = c(3L, 10L, 1L),
    `1_mean_sd` = c("201 (72)", "115.6 (38.5)", "95.1 (NA)")
  ),
  row.names = c(NA, -3L),
  class = c("tbl_df", "tbl", "data.frame")
)
# Row-wise maximum of all columns whose names end in "_n".
# The columns are passed to pmax() as separate, unnamed arguments so that a
# column name can never be mistaken for one of pmax()'s own arguments.
tbl$pmax <- do.call(pmax, unname(as.list(tbl[grepl("_n$", names(tbl))])))

Resources