Related
I have a complicated data frame that I'm trying to reshape.
Here is an example of the type of data frame that I have:
# Labels that cycle through the nested columns; "split" marks a row that
# branches into the next column instead of carrying a value.
names <- c("var1", "var2", "split")
values <- rnorm(8)

# Build the example frame. Missing cells are the literal string "NA", so
# column f ends up as character rather than numeric.
from <- data.frame(
  a = rep(1, times = 10),
  b = rep(c(1, 2), times = c(3, 7)),
  c = c(rep(names, times = 2), rep("split", times = 4)),
  d = c(rep("NA", times = 5), names, rep("split", times = 2)),
  e = c(rep("NA", times = 7), names),
  f = c(values[1:2], "NA", values[3:8], "NA")
)
And this produces something that looks like this:
> from
a b c d e f
1 1 1 var1 NA NA -0.271930473373158
2 1 1 var2 NA NA -0.0968100775823158
3 1 1 split NA NA NA
4 1 2 var1 NA NA -1.73919094720254
5 1 2 var2 NA NA -0.52398152119997
6 1 2 split var1 NA 0.856367467674763
7 1 2 split var2 NA -0.729762707907525
8 1 2 split split var1 0.561460771889416
9 1 2 split split var2 0.0432022687633195
10 1 2 split split split NA
Inside my data frame from, I want to take var1 and var2 and turn them into columns. And then use the value from column f in from as the values that correspond to var1 and var2 (reading row-wise).
In other words, I am trying to reshape this data frame into something that looks like this:
> out
a b var1 var2
1 1 1 -0.2719305 -0.09681008
2 1 2 -1.7391909 -0.52398152
3 1 2 0.8563675 -0.72976271
4 1 2 0.5614608 0.04320227
Any suggestions as to how I could do this?
We could reshape to 'long' format with pivot_longer (dropping the NA elements), filter to keep only the 'var' elements, and then reshape back to 'wide' with pivot_wider
library(dplyr)
library(tidyr)
library(stringr)
library(data.table)
# Long -> filter -> wide:
#   1. turn the "NA" strings into real NAs and f into a number,
#   2. stack columns c..e, dropping the NA cells,
#   3. keep only the cells that name a variable,
#   4. number repeated (a, b, variable) combinations so each becomes its own row,
#   5. spread the variable names into columns filled from f.
from %>%
  type.convert(as.is = TRUE) %>%
  pivot_longer(
    cols = c:e,
    values_drop_na = TRUE
  ) %>%
  filter(str_detect(value, 'var')) %>%
  select(!name) %>%
  mutate(rn = rowid(a, b, value)) %>%
  pivot_wider(
    names_from = value,
    values_from = f
  ) %>%
  select(!rn)
-output
# A tibble: 4 × 4
a b var1 var2
<int> <int> <dbl> <dbl>
1 1 1 -0.272 -0.0968
2 1 2 -1.74 -0.524
3 1 2 0.856 -0.730
4 1 2 0.561 0.0432
data
# Reproducible copy of the example data. Missing cells are the literal string
# "NA" and f is stored as character, exactly as in the question.
from <- data.frame(
  a = rep(1L, times = 10),
  b = rep(c(1L, 2L), times = c(3, 7)),
  c = c("var1", "var2", "split", "var1", "var2", rep("split", times = 5)),
  d = c(rep("NA", times = 5), "var1", "var2", rep("split", times = 3)),
  e = c(rep("NA", times = 7), "var1", "var2", "split"),
  f = c(
    "-0.271930473373158", "-0.0968100775823158", "NA",
    "-1.73919094720254", "-0.52398152119997", "0.856367467674763",
    "-0.729762707907525", "0.561460771889416", "0.0432022687633195",
    "NA"
  ),
  row.names = as.character(1:10),
  stringsAsFactors = FALSE
)
Here is a solution with one time pivoting:
library(dplyr)
library(tidyr)
library(stringr)
# Single-pivot approach: tag every row with the 'var*' label found in columns
# c, d and e, then spread f into one column per label.
from %>%
# type.convert() turns the "NA" strings into real NA values (and f into a number)
type.convert(as.is = TRUE) %>%
# rows without a value in f carry no variable, so drop them
filter(!is.na(f)) %>%
# paste c, d and e together and pull out "var" plus the character after it;
# str_extract_all() returns a list, so `name` is a list-column
mutate(name = str_extract_all(paste(c,d,e), 'var(.)')) %>%
select(a, b, f, name) %>%
pivot_wider(
names_from = name,
values_from = f,
# several rows share the same (a, b) key, so collect their values in a list ...
values_fn = list
) %>%
# ... and expand those lists back into one row per value
unnest(cols = c(var1, var2))
a b var1 var2
<int> <int> <dbl> <dbl>
1 1 1 -0.272 -0.0968
2 1 2 -1.74 -0.524
3 1 2 0.856 -0.730
4 1 2 0.561 0.0432
This can be achieved by coupling a series of logical operations to get the values in from$f
# A row belongs to var1 (or var2) when exactly one of its cells equals that
# label. Compute each row mask once instead of once per output column, and
# spell out TRUE (T is an ordinary variable that can be reassigned).
is_var1 <- rowSums(from == "var1", na.rm = TRUE) == 1
is_var2 <- rowSums(from == "var2", na.rm = TRUE) == 1

# var1 and var2 rows alternate in `from`, so the two subsets line up row by row.
data.frame(
  a = from$a[is_var1],
  b = from$b[is_var1],
  var1 = from$f[is_var1],
  var2 = from$f[is_var2]
)
a b var1 var2
1 1 1 -0.2719305 -0.09681008
2 1 2 -1.7391909 -0.52398152
3 1 2 0.8563675 -0.72976271
4 1 2 0.5614608 0.04320227
The notion is to have a row_number mutation:
library(dplyr)
library(tidyr)
# Label each row with its variable name (the first non-"split", non-NA entry
# of c, d, e), number the occurrences of each label, and pivot on that number.
from %>%
  # "NA" strings become real NAs; f becomes numeric
  type.convert(as.is = TRUE) %>%
  filter(!is.na(f)) %>%
  # do.call() replaces purrr::invoke(), which is deprecated and was never
  # attached by the library() calls above; the lambda replaces the deprecated
  # across(..., na_if, "split") argument forwarding
  group_by(name = do.call(coalesce, across(c:e, ~ na_if(.x, "split")))) %>%
  # k-th var1 and k-th var2 share the same id and therefore the same output row
  mutate(id = row_number()) %>%
  ungroup() %>%
  # id_cols must be named: it is no longer the second positional argument
  pivot_wider(id_cols = c(a, b, id), names_from = name, values_from = f) %>%
  select(-id)
# A tibble: 4 x 4
a b var1 var2
<int> <int> <dbl> <dbl>
1 1 1 -0.272 -0.0968
2 1 2 -1.74 -0.524
3 1 2 0.856 -0.730
4 1 2 0.561 0.0432
I have a dataset with various "chunks" of columns with different prefixes, but the same suffix:
ID
A034
B034
C034
D034
A099
B099
A123
B123
...
1
NA
1
NA
NA
NA
3
1
NA
...
2
2
NA
NA
NA
2
NA
NA
2
...
3
NA
NA
2
NA
NA
2
1
NA
...
The number of columns within each "chunk" also varies. Is there any way (other than manually, which is what I have been painstakingly doing with coalesce(!!! select(., contains("XXX")))) to automatically coalesce by chunk based on the shared suffix? That is, the result should resemble
ID
034
099
123
...
1
1
3
1
...
2
2
2
2
...
3
2
2
1
...
I'm not sure how to begin doing something like this, so any suggestions would be very helpful.
We reshape the data into 'long' format with pivot_longer, then we group by 'ID' and loop across the other columns, apply the na.omit to remove the NA elements (we assume that there is only one non-NA per each column by group)
library(dplyr)
library(tidyr)
# Stack the columns so that every numeric suffix becomes one column (".value"),
# then keep the non-NA entry of each suffix column within every ID.
# Assumes exactly one non-NA value per ID and suffix.
df1 %>%
  pivot_longer(
    cols = -ID,
    names_pattern = "[A-Z](\\d+)",
    names_to = ".value"
  ) %>%
  group_by(ID) %>%
  summarise(
    across(everything(), ~ na.omit(.x)),
    .groups = 'drop'
  )
-output
# A tibble: 3 x 4
ID `034` `099` `123`
<int> <int> <int> <int>
1 1 1 3 1
2 2 2 2 2
3 3 2 2 1
Or to be safe, use complete.cases to create a logical vector for non-NA elements, and extract the first element (assuming we need only a single non-NA - if the non-NA lengths are different, we may need to return a list)
# Same reshape, but always return a single value per ID and suffix:
# the first non-NA entry (NA when there is none).
df1 %>%
  pivot_longer(
    cols = -ID,
    names_pattern = "[A-Z](\\d+)",
    names_to = ".value"
  ) %>%
  group_by(ID) %>%
  summarise(across(everything(), ~ .x[!is.na(.x)][1]))
data
# Example input: one ID column plus "chunks" of columns that share a numeric
# suffix (034, 099, 123) and differ only in their letter prefix.
df1 <- data.frame(
  ID = 1:3,
  A034 = c(NA, 2L, NA),
  B034 = c(1L, NA, NA),
  C034 = c(NA, NA, 2L),
  D034 = c(NA, NA, NA),
  A099 = c(NA, 2L, NA),
  B099 = c(3L, NA, 2L),
  A123 = c(1L, NA, 1L),
  B123 = c(NA, 2L, NA)
)
one more approach
library(tidyverse)
# Group the column names by numeric suffix, build one ID + chunk data frame per
# suffix, reduce every chunk to its first non-NA value per row, and join the
# per-suffix results back together on ID.
split(
  names(df1)[-1],
  gsub('^\\D*(\\d+)$', '\\1', names(df1)[-1])
) %>%
  map(~ df1[c('ID', .x)]) %>%
  imap(
    ~ .x %>%
      group_by(ID) %>%
      rowwise() %>%
      transmute(!!.y := first(na.omit(c_across(everything())))) %>%
      ungroup()
  ) %>%
  reduce(left_join, by = 'ID')
#> # A tibble: 3 x 4
#> ID `034` `099` `123`
#> <int> <int> <int> <int>
#> 1 1 1 3 1
#> 2 2 2 2 2
#> 3 3 2 2 1
Created on 2021-06-20 by the reprex package (v2.0.0)
Given the following data
# Example input: columns are named <prefix>_<index>; each prefix (alpha, beta,
# charlie) forms one group of columns to coalesce.
df1 <- data.frame(
  ID = 1:3,
  alpha_1 = c(2L, 2L, 3L),
  alpha_2 = c(1L, 2L, 3L),
  alpha_3 = c(4L, 4L, 2L),
  alpha_4 = c(3L, NA, NA),
  beta_1 = c(NA, 2L, NA),
  beta_2 = c(3L, NA, 2L),
  charlie_1 = c(1L, NA, 1L),
  charlie_2 = c(NA, 2L, NA)
)
I'm trying to coalesce all columns sharing the same initial prefix name (i.e. coalesce alpha_1, alpha_2, alpha_3, alpha_4, and coalesce beta_1 beta_2, etc.), but from both the left and right sides. That is, I want to generate two new variables, say 'alpha_left' and 'alpha_right', whose columns would be, in this example, (2, 2, 3) and (3, 4, 2) respectively (first non-missing elements from the left and right side of the dataframe).
User #akrun offered a great solution for the coalescing part here, but I'm unsure how to create two new variables from both the left and right coalesces.
Here is an option in tidyverse
Reshape to 'long' format - pivot_longer
Grouped by 'ID'
Do the summarise across the columns 'alpha' till 'charlie'
Get the column name - cur_column()
Create a tibble with the first non-NA element from the left and the right
Change the column names by appending the 'nm1' as prefix
Finally, unnest the list columns created in summarise
library(dplyr)
library(tidyr)
library(stringr)
# For every ID and prefix, return the first non-NA value seen from the left and
# from the right as a one-row tibble, then unnest those tibbles into columns.
df1 %>%
  pivot_longer(
    cols = contains("_"),
    names_to = c(".value", "grp"),
    names_sep = "_"
  ) %>%
  group_by(ID) %>%
  summarise(across(alpha:charlie, ~ {
    prefix <- cur_column()
    # non-missing values in their original left-to-right order
    observed <- .[complete.cases(.)]
    ends <- tibble(left = observed[1], right = rev(observed)[1])
    # e.g. alpha_left / alpha_right
    names(ends) <- str_c(prefix, "_", names(ends))
    list(ends)
  })) %>%
  unnest(c(alpha, beta, charlie))
-output
# A tibble: 3 x 7
ID alpha_left alpha_right beta_left beta_right charlie_left charlie_right
<int> <int> <int> <int> <int> <int> <int>
1 1 2 3 3 3 1 1
2 2 2 4 2 2 2 2
3 3 3 2 2 2 1 1
Or using base R
# For each prefix group (alpha, beta, ...), build a data frame with the first
# non-NA value from the left and from the right of every row.
#
# Every row always yields a (left, right) pair: a row with a single non-NA
# value repeats it, and an all-NA row gives NA for both. The previous version
# returned one value for such rows, which produced a one-column or ragged
# result and left the columns without meaningful names.
lst1 <- lapply(
  split.default(df1[-1], sub("_\\d+$", "", names(df1)[-1])),
  function(x) {
    ends <- apply(as.matrix(x), 1, function(y) {
      y1 <- y[!is.na(y)]
      if (length(y1) > 0) y1[c(1, length(y1))] else c(NA, NA)
    })
    # apply() returns the pairs as columns of a 2 x nrow matrix; transpose so
    # that each original row becomes one output row
    out <- as.data.frame(t(ends))
    names(out) <- c("left", "right")
    out
  }
)
You could also do:
# Split the value columns by prefix and coalesce each group in both directions.
# coalesce() over the columns in their original order gives the first non-NA
# value from the LEFT; over the reversed columns it gives the first non-NA
# value from the RIGHT. (The labels were previously swapped.)
df1[-1] %>%
  split.default(sub("_\\d+", "", names(.))) %>%
  imap_dfc(~ data.frame(left = coalesce(!!!.x),
                        right = coalesce(!!!rev(.x))) %>%
             set_names(paste(.y, names(.), sep = "_")))
#>   alpha_left alpha_right beta_left beta_right charlie_left charlie_right
#> 1          2           3         3          3            1             1
#> 2          2           4         2          2            2             2
#> 3          3           2         2          2            1             1
One more approach not as elegant as #Onyambu's
library(tidyverse)
# Split the value columns by prefix, then work row by row inside each group:
# <prefix>_left is the first non-NA value, <prefix>_right the last one.
df1[-1] %>%
split.default(sub("_\\d+", "", names(.))) %>%
imap_dfc(~ .x %>% rowwise() %>%
mutate(!!paste0(.y, '_left') := head(na.omit(c_across(everything())),1),
# NOTE(review): !last_col() presumably excludes the <prefix>_left column
# created just above, so only the original columns are scanned here; this
# depends on the order of the two expressions - confirm before reordering.
!!paste0(.y, '_right') := tail(na.omit(c_across(!last_col())),1),
# keep only the two newly created columns
.keep = 'none' )
)
#> # A tibble: 3 x 6
#> # Rowwise:
#> alpha_left alpha_right beta_left beta_right charlie_left charlie_right
#> <int> <int> <int> <int> <int> <int>
#> 1 2 3 3 3 1 1
#> 2 2 4 2 2 2 2
#> 3 3 2 2 2 1 1
Created on 2021-06-19 by the reprex package (v2.0.0)
Another option
library(tidyverse)
# Example input: columns are named <prefix>_<index>; each prefix (alpha, beta,
# charlie) forms one group of columns to coalesce.
df1 <- data.frame(
  ID = 1:3,
  alpha_1 = c(2L, 2L, 3L),
  alpha_2 = c(1L, 2L, 3L),
  alpha_3 = c(4L, 4L, 2L),
  alpha_4 = c(3L, NA, NA),
  beta_1 = c(NA, 2L, NA),
  beta_2 = c(3L, NA, 2L),
  charlie_1 = c(1L, NA, 1L),
  charlie_2 = c(NA, 2L, NA)
)
# Stack the columns by index ("set"), fill the gaps within each ID in both
# directions, keep only the lowest and highest index rows, label them
# left/right, and spread back to one row per ID.
df1 %>%
  pivot_longer(
    cols = -ID,
    names_to = c(".value", "set"),
    names_sep = "_"
  ) %>%
  group_by(ID) %>%
  fill(alpha:charlie, .direction = "updown") %>%
  # after filling, the first row holds the leftmost value of every prefix and
  # the last row holds the rightmost one
  filter(set %in% range(set)) %>%
  mutate(set = c("left", "right")) %>%
  pivot_wider(
    id_cols = ID,
    names_from = set,
    values_from = alpha:charlie
  )
#> # A tibble: 3 x 7
#> # Groups: ID [3]
#> ID alpha_left alpha_right beta_left beta_right charlie_left charlie_right
#> <int> <int> <int> <int> <int> <int> <int>
#> 1 1 2 3 3 3 1 1
#> 2 2 2 4 2 2 2 2
#> 3 3 3 2 2 2 1 1
Created on 2021-06-20 by the reprex package (v2.0.0)
I have a "long" data frame in R, and I want to create a new "wider" data frame from it. I looked into using pivot_wider from tidyr, but I'm having trouble figuring it out since what I want to do is more complicated than the examples I can find. I have successfully used a for loop, but I want to see if there is a faster way to do this (pivot_wider or otherwise).
Here's a simplified example of the original data frame:
df
USER_ID EVENT_ISCHECKED EVENT_VALUE EVENT_ID
1 4 0 CB_1
2 4 1 CB_2
3 4 1 CB_3
4 4 V_1
5 4 33 V_2
6 4 V_3
7 5 1
8 5 0 CB_2
9 5 1 CB_3
10 5 V_1
11 5 V_2
12 5 47 V_3
The second data frame should have just one row for each USER_ID and separate columns for each EVENT_ID that are populated with indicator values (0 or 1). The tricky part is that the values they get come from different columns/conditionals. The CB (checkbox) columns get a 1 if EVENT_ISCHECKED==1, while the V (value) columns get a 1 if EVENT_VALUE has a number/isn't empty. The result from this simplified example should look like this:
outDF
USER_ID CB_1 CB_2 CB_3 V_1 V_2 V_3
1 4 0 1 1 0 1 0
7 5 0 0 1 0 0 1
Here's my code with the for loop going through each row of the original data frame:
# Setting up the example data frame. Mixing numbers with "" makes
# EVENT_ISCHECKED and EVENT_VALUE character columns.
df <- data.frame(
  "USER_ID" = c(rep(4, 6), rep(5, 6)),
  "EVENT_ISCHECKED" = c(0, 1, 1, "", "", "", 1, 0, 1, "", "", ""),
  "EVENT_VALUE" = c("", "", "", "", 33, "", "", "", "", "", "", 47),
  "EVENT_ID" = c("CB_1", "CB_2", "CB_3", "V_1", "V_2", "V_3",
                 "", "CB_2", "CB_3", "V_1", "V_2", "V_3"),
  stringsAsFactors = FALSE
)

# Vectors of possible checkbox and value codes
CB <- c("CB_1", "CB_2", "CB_3")
V <- c("V_1", "V_2", "V_3")

# Creating the output data frame with one row per user and
# separate columns for each Event ID with default value of 0
outDF <- unique(df[, "USER_ID", drop = FALSE])
outDF[, CB] <- 0
outDF[, V] <- 0

# seq_len() (unlike 1:nrow(df)) iterates zero times for an empty data frame
for (i in seq_len(nrow(df))) {
  # Going through each row and setting
  # the current User ID, Event ID, etc.
  U.ID <- df[[i, "USER_ID"]]
  E.ID <- df[[i, "EVENT_ID"]]
  E.CH <- df[[i, "EVENT_ISCHECKED"]]
  E.V <- df[[i, "EVENT_VALUE"]]

  # Getting the index of the row in the outDF
  # that matches the current User ID
  outputRow <- which(outDF$USER_ID == U.ID)

  # If the Event ID is one of the Check Box IDs and the
  # Event is checked, then that user gets a 1
  # in that CB column.
  # (&& is the scalar, short-circuiting operator meant for if conditions.)
  if (E.ID %in% CB && E.CH == 1) {
    outDF[outputRow, E.ID] <- 1
  }

  # If the Event ID is one of the Value IDs and the
  # value is not empty, then that user gets a 1
  # in that V column
  if (E.ID %in% V && E.V != "") {
    outDF[outputRow, E.ID] <- 1
  }
}
EDIT
Starja's answer now works for the missing EVENT_ID case!
You can first make a column with the expected values and then use pivot_wider to generate the wide table:
library(tidyr)
library(dplyr)
library(stringr)
# Example input; mixing numbers with "" makes the event columns character.
df <- data.frame(
  USER_ID = rep(c(4, 5), each = 6),
  EVENT_ISCHECKED = c(0, 1, 1, "", "", "", 1, 0, 1, "", "", ""),
  EVENT_VALUE = c("", "", "", "", 33, "", "", "", "", "", "", 47),
  EVENT_ID = c("CB_1", "CB_2", "CB_3", "V_1", "V_2", "V_3",
               "", "CB_2", "CB_3", "V_1", "V_2", "V_3"),
  stringsAsFactors = FALSE
)

# Derive one 0/1 indicator per event, then spread the events into columns.
df_wide <- df %>%
  # rows without an event id cannot be assigned to any column
  filter(EVENT_ID != "") %>%
  mutate(
    value = case_when(
      # checkbox events: take the checked flag as is
      str_detect(EVENT_ID, "^CB") ~ as.numeric(EVENT_ISCHECKED),
      # value events: 0 when empty, 1 otherwise
      EVENT_VALUE == "" ~ 0,
      TRUE ~ 1
    )
  ) %>%
  pivot_wider(
    id_cols = USER_ID,
    names_from = EVENT_ID,
    values_from = value,
    # events missing for a user (dropped above) count as 0
    values_fill = list(value = 0)
  )

df_wide
# A tibble: 2 x 7
USER_ID CB_1 CB_2 CB_3 V_1 V_2 V_3
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 4 0 1 1 0 1 0
2 5 0 0 1 0 0 1
Edit
I've included #IceCreamToucan's suggestion with id_cols into pivot_wider and #Martin Gal's suggestion to use case_when instead of if_else.
Now I also filter out rows with a missing EVENT_ID and replace these missing values with 0.
We can also use coalesce with pivot_wider
library(dplyr)
library(tidyr)
# Take the checkbox flag where it exists; otherwise use whether a value is
# present. Then spread one column per event id.
df %>%
  mutate(value = coalesce(EVENT_ISCHECKED, !is.na(EVENT_VALUE))) %>%
  select(USER_ID, EVENT_ID, value) %>%
  pivot_wider(
    names_from = EVENT_ID,
    values_from = value
  )
# A tibble: 2 x 7
# USER_ID CB_1 CB_2 CB_3 V_1 V_2 V_3
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 4 0 1 1 0 1 0
#2 5 1 0 1 0 0 1
If we want to get NA where both columns are NA, then do
# Same as above, but rows where BOTH source columns are missing yield NA
# instead of 0. (The replace() and transmute() calls were previously left
# unclosed, which made the snippet a syntax error.)
df %>%
  transmute(
    USER_ID, EVENT_ID,
    value = replace(
      coalesce(EVENT_ISCHECKED, !is.na(EVENT_VALUE)),
      is.na(EVENT_ISCHECKED) & is.na(EVENT_VALUE),
      NA
    )
  ) %>%
  pivot_wider(names_from = EVENT_ID, values_from = value)
data
# Example input with real NA values (rather than empty strings) for the
# missing checkbox flags and event values.
df <- data.frame(
  USER_ID = rep(c(4L, 5L), each = 6),
  EVENT_ISCHECKED = c(0, 1, 1, NA, NA, NA, 1, 0, 1, NA, NA, NA),
  EVENT_VALUE = c(rep(NA, 4), 33, rep(NA, 6), 47),
  EVENT_ID = rep(c("CB_1", "CB_2", "CB_3", "V_1", "V_2", "V_3"), times = 2),
  stringsAsFactors = FALSE
)
This question already has an answer here:
dplyr::first() to choose first non NA value
(1 answer)
Closed 2 years ago.
I understand we can use the dplyr function coalesce() to unite different columns, but is there such function to unite rows?
I am struggling with a confusing incomplete/doubled dataframe with duplicate rows for the same id, but with different columns filled. E.g.
id sex age source
12 M NA 1
12 NA 3 1
13 NA 2 2
13 NA NA NA
13 F 2 NA
and I am trying to achieve:
id sex age source
12 M 3 1
13 F 2 2
You can try:
library(dplyr)
#Data
# Example input: several partially filled rows per id; sex is a factor with
# levels F and M.
df <- data.frame(
  id = c(12L, 12L, 13L, 13L, 13L),
  sex = factor(c("M", NA, NA, NA, "F"), levels = c("F", "M")),
  age = c(NA, 3L, 2L, NA, 2L),
  source = c(1L, 1L, 2L, NA, NA)
)
# Within each id, fill every column downwards and then upwards so that all
# rows of a group carry the known values, then keep one row per id.
# fill() lives in tidyr, which is not attached above, so it is called with an
# explicit namespace; "downup" does both fill passes in a single call.
df %>%
  group_by(id) %>%
  tidyr::fill(everything(), .direction = "downup") %>%
  slice(1)
# A tibble: 2 x 4
# Groups: id [2]
id sex age source
<int> <fct> <int> <int>
1 12 M 3 1
2 13 F 2 2
As mentioned by #A5C1D2H2I1M1N2O1R2T1 you can select the first non-NA value in each group. This can be done using dplyr :
library(dplyr)
# First non-NA value of every column within each id (NA when a column has none).
# .cols is given explicitly: calling across() without it is deprecated.
df %>%
  group_by(id) %>%
  summarise(across(everything(), ~ na.omit(.x)[1]))
# A tibble: 2 x 4
# id sex age source
# <int> <fct> <int> <int>
#1 12 M 3 1
#2 13 F 2 2
Base R :
# First non-NA value per column and id; na.pass keeps the rows that contain
# NAs so that the function itself can skip them.
aggregate(
  . ~ id,
  data = df,
  FUN = function(col) na.omit(col)[1],
  na.action = 'na.pass'
)
Or data.table :
library(data.table)
# Convert df to a data.table by reference, then take the first non-NA value of
# every remaining column within each id.
setDT(df)[, lapply(.SD, function(col) na.omit(col)[1]), by = id]