Related
Let's say I have
> df
fu1_date fu1_n_symp fu5_date fu5_n_symp fu7_date fu7_n_symp
1 2012-03-05 1 2014-03-05 NA 2016-03-05 1
2 2013-08-09 1 2015-10-09 2 2017-11-09 NA
3 2019-05-05 1 2020-06-07 2 2021-07-09 2
df denotes an extremely large dataframe. In this example, I have recorded the number of symptoms n_symp on different follow-up dates fu_date.
There are up to 20 follow-ups for each row in my dataframe: fu1_, fu2_, ... , fu20_. I need to correct my dataframe so that if n_symp is NA, then the corresponding fuX_date is changed from a date to NA.
You can see that row 1 had missing values in follow-up 5 (fu5_n_symp == NA), but not FU1 or FU7. Consequently, fu5_date in row 1 should be converted from 2014-03-05 to NA
I am looking for a solution in dplyr only.
Expected output
> df
fu1_date fu1_n_symp fu5_date fu5_n_symp fu7_date fu7_n_symp
1 2012-03-05 1 <NA> NA 2016-03-05 1
2 2013-08-09 1 2015-10-09 2 <NA> NA
3 2019-05-05 1 2020-06-07 2 2021-07-09 2
Data
df <- structure(list(fu1_date = structure(c(15404, 15926, 18021), class = "Date"),
fu1_n_symp = c(1L, 1L, 1L), fu5_date = structure(c(16134,
16717, 18420), class = "Date"), fu5_n_symp = c(NA, 2L, 2L
), fu7_date = structure(c(16865, 17479, 18817), class = "Date"),
fu7_n_symp = c(1L, NA, 2L)), class = "data.frame", row.names = c(NA, -3L))
With pivot_longer(), you can include ".value" in names_to to stack the date and n_symp columns pairwise. In this case, one of names_sep or names_pattern must be supplied to specify how the column names should be split. Then you can easily replace the dates with NA wherever n_symp is missing. Finally, pivot the long data back to wide to restore the original format.
library(dplyr)
library(tidyr)
# Stack each fuX_date / fuX_n_symp pair into two columns, blank the date
# wherever the symptom count is missing, then restore the wide layout.
df %>%
  mutate(id = row_number()) %>%
  pivot_longer(
    cols = -id,
    names_to = c("fu", ".value"),
    names_sep = "(?<=\\d)_"
  ) %>%
  mutate(date = replace(date, is.na(n_symp), NA)) %>%
  pivot_wider(
    names_from = fu,
    values_from = c(date, n_symp),
    names_glue = "{fu}_{.value}",
    names_vary = "slowest"
  )
# # A tibble: 3 × 7
# id fu1_date fu1_n_symp fu5_date fu5_n_symp fu7_date fu7_n_symp
# <int> <date> <int> <date> <int> <date> <int>
# 1 1 2012-03-05 1 NA NA 2016-03-05 1
# 2 2 2013-08-09 1 2015-10-09 2 NA NA
# 3 3 2019-05-05 1 2020-06-07 2 2021-07-09 2
names_vary in pivot_wider() controls the order in which the resulting column names are combined.
"fastest" (default)
fu1_date fu5_date fu7_date fu1_n_symp fu5_n_symp fu7_n_symp
"slowest"
fu1_date fu1_n_symp fu5_date fu5_n_symp fu7_date fu7_n_symp
Update: Tweaked code after @Darren Tsai's input:
# Long-format variant: stack every column as character, blank each value whose
# successor in the stacked column is NA, then reshape back and re-infer types.
# NOTE: this relies on the columns being ordered as adjacent (date, n_symp)
# pairs and on the date columns themselves containing no NA; otherwise the
# lead() look-ahead would blank the wrong cells.
df %>%
  mutate(id = row_number()) %>%
  pivot_longer(
    -id,
    names_to = "key",
    values_to = "val",
    values_transform = list(val = as.character) # change all to character class
  ) %>%
  mutate(val = ifelse(is.na(lead(val, default = val[1])), NA_character_, val)) %>%
  pivot_wider(names_from = key, values_from = val) %>%
  # type_convert() comes from readr; call it by namespace because only dplyr
  # and tidyr are attached in this example.
  readr::type_convert() %>%
  select(-id)
Here is one way how we can do it, using pivoting:
library(dplyr)
library(tidyr)
# Stack every column as character, blank each value whose successor in the
# stacked column is NA (i.e. a date whose paired n_symp is missing), then
# reshape back to wide. A per-key row counter identifies the original rows.
df %>%
  pivot_longer(
    everything(),
    names_to = "key",
    values_to = "val",
    values_transform = list(val = as.character) # change all to character class
  ) %>%
  mutate(val = ifelse(is.na(lead(val, default = val[1])), NA_character_, val)) %>%
  group_by(key) %>%
  mutate(row = row_number()) %>%
  ungroup() %>%
  pivot_wider(names_from = key, values_from = val) %>%
  # The pipe after this step was missing, so select(-row) used to run as a
  # separate statement and the helper column was never dropped.
  mutate(across(contains("date"), as.Date)) %>% # to get back to date class
  select(-row)
fu1_date fu1_n_symp fu5_date fu5_n_symp fu7_date fu7_n_symp
<chr> <chr> <chr> <chr> <chr> <chr>
1 2012-03-05 1 NA NA 2016-03-05 1
2 2013-08-09 1 2015-10-09 2 NA NA
3 2019-05-05 1 2020-06-07 2 2021-07-09 2
I realize that you ask for a dplyr solution. On the other hand, you also write that you have an extremely large dataframe. Therefore, for future reference, I think it may be worth considering a base solution, which is faster.
# Split the columns into consecutive (date, n_symp) pairs, blank the first
# column of each pair where the second is NA, and bind the pairs back together.
# The list is unnamed first so cbind() does not prefix the column names.
do.call(
  cbind,
  unname(
    lapply(
      split.default(df, (seq_len(ncol(df)) + 1) %/% 2),
      function(pair) {
        pair[[1]][is.na(pair[[2]])] <- NA
        pair
      }
    )
  )
)
# fu1_date fu1_n_symp fu5_date fu5_n_symp fu7_date fu7_n_symp
# 1 2012-03-05 1 <NA> NA 2016-03-05 1
# 2 2013-08-09 1 2015-10-09 2 <NA> NA
# 3 2019-05-05 1 2020-06-07 2 2021-07-09 2
Benchmark
Prepare some larger data (certainly not "extremely" large):
# Widen the 6-column example to 42 columns (fu1 ... fu21), then repeat its
# rows n times to obtain the benchmark input d2.
d <- setNames(
  cbind(df, df, df, df, df, df, df),
  paste0("fu", rep(1:21, each = 2), "_", c("date", "n_symp"))
)
n <- 1e6
d2 <- d[rep(seq_len(nrow(d)), n), ]
dim(d2)
# [1] 3000000 42
dplyr 1 (TarJae):
# Times the character-based pivot approach on d2 and keeps the result in d3
# (the id column is retained here so d3 can be compared with d4 afterwards).
# NOTE(review): type_convert() is not a dplyr/tidyr function; presumably readr
# is attached when this is run - confirm.
system.time({
d3 = d2 %>%
mutate(id = row_number()) %>%
pivot_longer(-id,
names_to = "key",
values_to = "val",
values_transform = list(val = as.character)) %>% # change all to character class
mutate(val = ifelse(is.na(lead(val, default = val[1])), NA_character_, val)) %>%
pivot_wider(names_from= key, values_from = val) %>%
type_convert()
})
# n = 1e6
# user system elapsed
# 423.76 13.02 445.55
dplyr2 (Darren Tsai):
# Times the paired pivot_longer()/pivot_wider() approach on d2 and keeps the
# result in d4; the id column created here is retained in the output.
system.time({
d4 = d2 %>%
mutate(id = 1:n()) %>%
# ".value" stacks each fuX_date / fuX_n_symp pair into date and n_symp columns
pivot_longer(-id, names_to = c("fu", ".value"), names_sep = "(?<=\\d)_") %>%
# blank the date wherever the paired symptom count is missing
mutate(date = replace(date, is.na(n_symp), NA)) %>%
pivot_wider(names_from = fu, values_from = c(date, n_symp),
names_glue = "{fu}_{.value}", names_vary = "slowest")
})
# n = 1e6
# user system elapsed
# 21.34 2.28 23.73
# n = 1e7
# user system elapsed
# 293.25 441.64 871.64
base:
# Times the base R approach on d2 and keeps the result in d5.
system.time({
d5 = do.call(cbind,
setNames(
# split the columns into consecutive pairs: (1,2), (3,4), ...
lapply(split.default(d2, (seq_len(ncol(d2)) + 1) %/% 2), \(d){
# blank the first column of the pair where the second one is NA
d[[1]] = replace(d[[1]], is.na(d[[2]]), NA)
d
# drop the list names so cbind() does not prefix the column names
}), NULL))
})
# n = 1e6
# user system elapsed
# 1.28 0.11 1.39
# n = 1e7
# user system elapsed
# 11.43 16.79 35.72
# Sanity check: the two dplyr variants produce the same result.
all.equal(d3, d4)
# [1] TRUE
# Compare with the base result: drop the first column of d3 (presumably the id
# column after the pivot round-trip) and reset the row names of d5.
all.equal(as.data.frame(d3[ , -1]), `rownames<-`(d5, NULL))
# [1] TRUE
Thus, for n = 1e6, the base alternative is about 300 and 15 times faster, respectively, than the dplyr code. It also seems to scale better; for n = 1e7, the base alternative is ca. 24 times faster than the fastest dplyr.
Given a table of counts specified in 'dat', I would like to create a dataframe with 3 columns (race, grp and outcome) and 206 rows. The variable outcome would be 1 for 'ascertained' and 0 for 'missed'.
dat <- structure(list(race = structure(c(1L, 2L, 1L, 2L), levels = c("black",
"nonblack"), class = "factor"), grp = structure(c(1L, 1L, 2L,
2L), levels = c("hbpm", "uc"), class = "factor"), ascertained = c(63,
32, 24, 21), missed = c(5, 3, 49, 9), total = c(68, 35, 73, 30
)), class = "data.frame", row.names = c(NA, -4L))
1) For each row set race in the output to that race, grp in the output to that group and then generate the appropriate number of 1s and 0s for outcome. The result is 206 x 3.
library(dplyr)
# Expand every count row into one row per subject: `ascertained` ones followed
# by `missed` zeros. reframe() is used because each input row produces many
# output rows, which summarise() has deprecated since dplyr 1.1.0.
dat %>%
  rowwise() %>%
  reframe(race = race, grp = grp, outcome = rep(1:0, c(ascertained, missed)))
2) In the example data there are no duplicate race/grp combinations, and if that is true in general then it can alternatively be written as:
# Same expansion, keyed on the race/grp combination instead of on each row.
# reframe() replaces the multi-row summarise() (deprecated since dplyr 1.1.0)
# and always returns an ungrouped result, so no `.groups` argument is needed.
dat %>%
  group_by(race, grp) %>%
  reframe(outcome = rep(1:0, c(ascertained, missed)))
3) A base R solution would be the following. If each combination of race/grp occurs on only one row of the input then 1:nrow(dat) could optionally be replaced with dat[1:2].
# Base R: split dat into one-row pieces, build the expanded data frame for
# each piece, and stack the pieces with rbind.
do.call("rbind",
by(dat,
1:nrow(dat), # one group per input row
with, # called on each piece; the expression below is forwarded to it
# evaluated with the piece's columns in scope: 1 repeated `ascertained` times,
# then 0 repeated `missed` times; race and grp are recycled to that length
data.frame(race = race, grp = grp, outcome = rep(1:0, c(ascertained, missed)))
)
)
How about this:
library(tidyverse)
dat <- structure(list(race = structure(c(1L, 2L, 1L, 2L), levels = c("black",
"nonblack"), class = "factor"), grp = structure(c(1L, 1L, 2L,
2L), levels = c("hbpm", "uc"), class = "factor"), ascertained = c(63,
32, 24, 21), missed = c(5, 3, 49, 9), total = c(68, 35, 73, 30
)), class = "data.frame", row.names = c(NA, -4L))
# Stack the two count columns, repeat each row `vals` times, and encode the
# originating column as a 1/0 outcome.
dat2 <- dat %>%
  select(-total) %>%
  pivot_longer(
    cols = c(ascertained, missed),
    names_to = "var",
    values_to = "vals"
  ) %>%
  uncount(weights = vals) %>%
  mutate(outcome = if_else(var == "ascertained", 1, 0)) %>%
  select(-var)
head(dat2)
#> # A tibble: 6 × 3
#> race grp outcome
#> <fct> <fct> <dbl>
#> 1 black hbpm 1
#> 2 black hbpm 1
#> 3 black hbpm 1
#> 4 black hbpm 1
#> 5 black hbpm 1
#> 6 black hbpm 1
# Count the rows per race / grp / outcome to verify the expansion; the result
# stays grouped by race and grp.
dat2 %>%
  group_by(race, grp, outcome) %>%
  summarise(n = n(), .groups = "drop_last")
#> # A tibble: 8 × 4
#> # Groups: race, grp [4]
#> race grp outcome n
#> <fct> <fct> <dbl> <int>
#> 1 black hbpm 0 5
#> 2 black hbpm 1 63
#> 3 black uc 0 49
#> 4 black uc 1 24
#> 5 nonblack hbpm 0 3
#> 6 nonblack hbpm 1 32
#> 7 nonblack uc 0 9
#> 8 nonblack uc 1 21
This is based partially on the linked question from Limey in the comments:
library(tidyverse)
# Build the 1-rows and the 0-rows separately, then stack them. Dropping the
# unused count columns first leaves only race and grp after uncount().
bind_rows(
  dat %>%
    select(-missed, -total) %>%
    uncount(ascertained) %>%
    mutate(outcome = 1),
  dat %>%
    select(-ascertained, -total) %>%
    uncount(missed) %>%
    mutate(outcome = 0)
)
Here is a relatively simple answer that is based, in part, on the answer suggested in a comment, but adapted to work for your problem, since you need multiple "uncounts". This answer uses functions from the packages tibble, dplyr, and tidyr. These are all in the tidyverse.
The exact method is to create two sub-lists, one listing out the "ascertained", and one listing out the "missed", formatting the ascertained column as you wanted, and then mashing these two together with a basic tibble::add_row.
The relevant code is:
library(tidyverse)
# One row per ascertained subject; the count column is kept and overwritten
# with the outcome value 1.
dat2 <- uncount(dat, ascertained, .remove = FALSE) %>%
  mutate(ascertained = 1) %>%
  select(-missed)
# One row per missed subject, flagged with the outcome value 0.
dat3 <- uncount(dat, missed, .remove = TRUE) %>%
  mutate(ascertained = 0)
# Stack both pieces. bind_rows() is the documented way to append one data
# frame to another; add_row() is meant for name-value pairs.
dat4 <- bind_rows(dat2, dat3) %>%
  select(-total) %>%
  rename(outcome = ascertained)
dat4 should be the data as you asked for it. I would suggest also generating an id column to make things easier to work with, but obviously that is up to you.
I have the following data:
ID cancer cancer_date stroke stroke_date diabetes diabetes_date
1 1 Feb2017 0 Jan2015 1 Jun2015
2 0 Feb2014 1 Jan2015 1 Jun2015
I would like to get
ID condition date
1 cancer xx
1 diabetes xx
2 stroke xx
2 diabetes xx
I tried reshape and gather, but it did not do what I want. Any ideas how can I do this?
This should do it. The key to make it work easily is to change the names of cancer, stroke and diabetes to x_val and then you can use pivot_longer() from tidyr to do the work.
library(tidyr)
library(dplyr)
dat <- tibble::tribble(
~ID, ~cancer, ~cancer_date, ~stroke, ~stroke_date, ~diabetes, ~diabetes_date,
1, 1, "Feb2017", 0, "Jan2015", 1, "Jun2015",
2, 0, "Feb2014", 1, "Jan2015", 1, "Jun2015")
# Give the indicator columns a "_val" suffix so every column follows the
# <diagnosis>_<field> pattern, stack them per diagnosis, and keep only the
# diagnoses that are present.
dat %>%
  rename(
    cancer_val = cancer,
    stroke_val = stroke,
    diabetes_val = diabetes
  ) %>%
  pivot_longer(
    cols = -ID,
    names_to = c("diagnosis", ".value"),
    names_pattern = "(.*)_(.*)"
  ) %>%
  filter(val == 1)
# # A tibble: 4 x 4
# ID diagnosis val date
# <dbl> <chr> <dbl> <chr>
# 1 1 cancer 1 Feb2017
# 2 1 diabetes 1 Jun2015
# 3 2 stroke 1 Jan2015
# 4 2 diabetes 1 Jun2015
library(data.table)
data <- data.table(
  ID = c(1, 2),
  cancer = c(1, 0), cancer_date = c("Feb2017", "Feb2014"),
  stroke = c(0, 1), stroke_date = c("Jan2015", "Jan2015"),
  diabetes = c(1, 1), diabetes_date = c("Jun2015", "Jun2015")
)
# Despite its name, `datawide` is the long (melted) table: one row per
# ID / date column, with the indicator columns kept as id variables.
datawide <-
  melt(data,
    id.vars = c("ID", "cancer", "stroke", "diabetes"),
    measure.vars = c("cancer_date", "stroke_date", "diabetes_date")
  )
# Keep each date only when its own indicator is 1. The "_date" suffix is
# stripped so `condition` reads "cancer" rather than "cancer_date", and rows
# are ordered by ID to match the requested layout.
datawide[
  (cancer == 1 & variable == "cancer_date") |
    (stroke == 1 & variable == "stroke_date") |
    (diabetes == 1 & variable == "diabetes_date"),
  .(ID, condition = sub("_date$", "", variable), date = value)
][order(ID)]
Try this solution using pivot_longer() and a flag variable to filter the desired states. After pivoting, you can drop the values equal to zero and keep only the rows whose value is one. Here is the code:
library(tidyverse)
#Code
# Step 1: stack the indicator columns and keep the conditions that are present.
# Step 2: stack the date columns and keep, for each condition, only the date
# whose column-name prefix matches that condition.
df2 <- df %>%
  pivot_longer(cols = -c(ID, contains("_")), names_to = "condition") %>%
  filter(value != 0) %>%
  select(-value) %>%
  pivot_longer(cols = -c(ID, condition)) %>%
  separate(name, into = c("v1", "v2"), sep = "_") %>%
  filter(condition == v1) %>%
  select(-c(v1, v2)) %>%
  rename(date = value)
Output:
# A tibble: 4 x 3
ID condition date
<int> <chr> <chr>
1 1 cancer Feb2017
2 1 diabetes Jun2015
3 2 stroke Jan2015
4 2 diabetes Jun2015
Some data used:
#Data
df <- structure(list(ID = 1:2, cancer = 1:0, cancer_date = c("Feb2017",
"Feb2014"), stroke = 0:1, stroke_date = c("Jan2015", "Jan2015"
), diabetes = c(1L, 1L), diabetes_date = c("Jun2015", "Jun2015"
)), class = "data.frame", row.names = c(NA, -2L))
If the first option is too complex, here is another choice:
#Code 2
# Convert everything to character so indicators and dates can share one value
# column, split the column names into condition and field, label the
# indicator rows (which have no suffix) as "val", then spread the fields back
# out and keep the conditions that are present.
df2 <- df %>%
  mutate(across(everything(), as.character)) %>%
  pivot_longer(cols = -ID) %>%
  separate(name, into = c("condition", "v2"), sep = "_") %>%
  replace(is.na(.), "val") %>%
  pivot_wider(names_from = v2, values_from = value) %>%
  filter(val == 1) %>%
  select(-val)
Output:
# A tibble: 4 x 3
ID condition date
<chr> <chr> <chr>
1 1 cancer Feb2017
2 1 diabetes Jun2015
3 2 stroke Jan2015
4 2 diabetes Jun2015
I'm new to programming in R and I have the following dataframe:
A B C D E
1 3 0 4 5 0
2 0 0 5 1 0
3 2 1 2 0 3
I would like to get a new dataframe containing the indices of the n max values of each row, e.g: If I wanted the column indices of the 3 biggest values in each row (n=3), I want my new dataframe to be like this:
F G H
1 1 3 4
2 1 3 4
3 1 3 5
So the first row of this dataframe contains the column indices of the 3 biggest values of row 1 in the original dataframe, and so on.
My original idea was to write a loop with which.max, but that seems way too long and ineffective. Does anyone have a better idea?
We can use apply
# For each row, rank the column positions by decreasing value, keep the first
# three, and return them in ascending column order (one result row per input row).
t(apply(df1, 1, function(row_vals) {
  ranked <- order(-row_vals)
  sort(head(ranked, 3))
}))
# [,1] [,2] [,3]
#1 1 3 4
#2 1 3 4
#3 1 3 5
Or using tidyverse
library(dplyr)
library(tidyr)
# Long format with a row id (rn) and a column position (ind); keep the three
# largest values per row and spread their positions into columns F, G and H.
df1 %>%
  mutate(rn = row_number()) %>%
  pivot_longer(cols = -rn) %>%
  group_by(rn) %>%
  mutate(ind = row_number()) %>%
  arrange(rn, desc(value)) %>%
  # slice() takes positions unnamed; `slice(n = 1:3)` is rejected by current
  # dplyr, so take the first three rows of each group explicitly.
  slice_head(n = 3) %>%
  select(-name, -value) %>%
  arrange(rn, ind) %>%
  mutate(nm1 = c("F", "G", "H")) %>%
  ungroup() %>%
  pivot_wider(names_from = nm1, values_from = ind)
data
df1 <- structure(list(A = c(3L, 0L, 2L), B = c(0L, 0L, 1L), C = c(4L,
5L, 2L), D = c(5L, 1L, 0L), E = c(0L, 0L, 3L)), class = "data.frame",
row.names = c("1",
"2", "3"))
I have a dataframe I've created in the form
FREQ CNT
0 5
1 20
2 1000
3 3
4 3
I want to further group my results to be in the following form:
CUT CNT
0+1 25
2+3 1003
4+5 ...
.....
I've tried using the between and cut functions in dplyr, but that just adds a new interval column to my dataframe. Can anyone give me a good indication of where to go to achieve this?
Here is a way to do it in dplyr:
library(dplyr)
# Label consecutive row pairs as "<first>+<second>" (odd rows look ahead, even
# rows look back), then attach the summed count of each pair to its rows.
df <- df %>%
  mutate(
    id = row_number(),
    new_freq = if_else(
      id %% 2 == 1,
      paste0(FREQ, "+", lead(FREQ, 1)),
      paste0(lag(FREQ, 1), "+", FREQ)
    )
  ) %>%
  group_by(new_freq) %>%
  mutate(new_cnt = sum(CNT))
# One line per pair.
unique(df[, c("new_freq", "new_cnt")])
# A tibble: 2 x 2
# Groups: new_freq [2]
# new_freq new_cnt
# <chr> <int>
#1 0+1 25
#2 2+3 1003
data
df <- structure(list(FREQ = 0:3, CNT = c(5L, 20L, 1000L, 3L)), class = "data.frame", row.names = c(NA, -4L))
A non-elegant solution using dplyr... probably a better way to do this.
dat <- data.frame(FREQ = c(0, 1, 2, 3, 4), CNT = c(5, 20, 1000, 3, 3))
# Rows 1-2 get index 0, rows 3-4 get index 1, and so on.
dat2 <- dat %>%
  mutate(index = (row_number() - 1) %/% 2) %>%
  group_by(index)
# Sum the counts per pair and label each pair with its two FREQ values; a
# trailing unpaired row is labelled "<FREQ>+NA".
dat2 %>%
  summarise(
    new_CNT = sum(CNT),
    CUT = paste0(FREQ[1], "+", FREQ[2])
  ) %>%
  select(-index)
# A tibble: 3 x 2
new_CNT CUT
<dbl> <chr>
1 25 0+1
2 1003 2+3
3 3 4+NA