Related
I have a large dataset of ID of patients with delays in days between the surgery and radiotherapy (RT) sessions. Some patients may had two or three RT treatments. To identidy those patients, I consider a delay being greater than 91 days (3 months).
This delay of 91 days corresponds to the end of one RT treatment and the start of another one. For analysis purposes it may be set at 61 days (2 months).
How to make correspond this delay above 91 days between two values to a new RT treatement and add a corresponding order into a new column?
My database looks like this:
df1 <- data.frame (
id = c("a","a","a","a","b","b","b","b","b","b","b","b","b","b","b","b","b", "c","c","c","c"),
delay = c(2,3,5,6, 3,5,7,9, 190,195,201,203,205, 1299,1303,1306,1307, 200,202,204,205))
> df1
id delay
1 a 2
2 a 3
3 a 5
4 a 6
5 b 3
6 b 5
7 b 7
8 b 9
9 b 190
10 b 195
11 b 201
12 b 203
13 b 205
14 b 1299
15 b 1303
16 b 1306
17 b 1307
18 c 200
19 c 202
20 c 204
21 c 205
I failed to produce something like this considering if the time between the first set of delays is greater than 100 days.
df2 <- data.frame (
id = c("a","a","a","a","b","b","b","b","b","b","b","b","b","b","b","b","b", "c","c","c","c"),
delay = c(2,3,5,6, 3,5,7,9, 190,195,201,203,205, 1299,1303,1306,1307, 200,202,204,205),
tt_order = c("1st","1st","1st","1st"," 1st","1st","1st","1st"," 2nd","2nd","2nd","2nd","2nd"," 3rd","3rd","3rd","3rd"," 1st","1st","1st","1st"))
> df2
id delay tt_order
1 a 2 1st
2 a 3 1st
3 a 5 1st
4 a 6 1st
5 b 3 1st
6 b 5 1st
7 b 7 1st
8 b 9 1st
9 b 190 2nd
10 b 195 2nd
11 b 201 2nd
12 b 203 2nd
13 b 205 2nd
14 b 1299 3rd
15 b 1303 3rd
16 b 1306 3rd
17 b 1307 3rd
18 c 200 1st
19 c 202 1st
20 c 204 1st
21 c 205 1st
I will be grateful for any help you can provide.
One way would be to divide delay by 100 and then use match and unique to get unique index in a sequential fashion for each id.
library(dplyr)
df2 %>%
group_by(id) %>%
mutate(n_tt = floor(delay/100),
n_tt = match(n_tt, unique(n_tt)))
# id delay tt_order n_tt
# <chr> <dbl> <dbl> <int>
# 1 a 2 1 1
# 2 a 3 1 1
# 3 a 5 1 1
# 4 a 6 1 1
# 5 b 3 1 1
# 6 b 5 1 1
# 7 b 7 1 1
# 8 b 9 1 1
# 9 b 150 2 2
#10 b 152 2 2
#11 b 155 2 2
#12 b 159 2 2
#13 b 1301 3 3
#14 b 1303 3 3
#15 b 1306 3 3
#16 b 1307 3 3
#17 c 200 1 1
#18 c 202 1 1
#19 c 204 1 1
#20 c 205 1 1
Created a new column n_tt for comparison purposes with tt_order in df2.
#CharlesLDN - perhaps this might be what you are looking for. This will look at differences in delay within each id, and gaps of > 90 days will be considered a new treatment.
library(tidyverse)
df1 %>%
group_by(id) %>%
mutate(tt_order = cumsum(c(0, diff(delay)) > 90) + 1)
Output
id delay tt_order
<chr> <dbl> <dbl>
1 a 2 1
2 a 3 1
3 a 5 1
4 a 6 1
5 b 3 1
6 b 5 1
7 b 7 1
8 b 9 1
9 b 190 2
10 b 195 2
11 b 201 2
12 b 203 2
13 b 205 2
14 b 1299 3
15 b 1303 3
16 b 1306 3
17 b 1307 3
18 c 200 1
19 c 202 1
20 c 204 1
21 c 205 1
I have a data frame and wish to sort specific columns alphabetically in dplyr. I know I can use the code below to sort all columns, but I would only like to sort columns C, B and A alphabetically. I tried using the across function as I would effectively like to select columns C:A, but this did not work.
df <- data.frame(1:16)
df$Testinfo1 <- 1
df$Band <- 1
df$Alpha <- 1
df$C <- c(10,12,14,16,10,12,14,16,10,12,14,16,10,12,14,16)
df$B <- c(10,0,0,0,12,12,12,12,0,14,NA_real_,14,16,16,16,16)
df$A <- c(1,1,1,1,1,1,1,1,1,1,1,14,NA_real_,NA_real_,NA_real_,16)
df
df %>%
select(sort(names(.)))
A Alpha B Band C Testinfo1 X1.16
1: 1 1 10 1 10 1 1
2: 1 1 0 1 12 1 2
3: 1 1 0 1 14 1 3
4: 1 1 0 1 16 1 4
5: 1 1 12 1 10 1 5
6: 1 1 12 1 12 1 6
7: 1 1 12 1 14 1 7
8: 1 1 12 1 16 1 8
9: 1 1 0 1 10 1 9
10: 1 1 14 1 12 1 10
11: 1 1 NA 1 14 1 11
12: 14 1 14 1 16 1 12
13: NA 1 16 1 10 1 13
14: NA 1 16 1 12 1 14
15: NA 1 16 1 14 1 15
16: 16 1 16 1 16 1 16
My desired output is below:
X1.16 Testinfo1 Band Alpha A B C
1: 1 1 1 1 1 10 10
2: 2 1 1 1 1 0 12
3: 3 1 1 1 1 0 14
4: 4 1 1 1 1 0 16
5: 5 1 1 1 1 12 10
6: 6 1 1 1 1 12 12
7: 7 1 1 1 1 12 14
8: 8 1 1 1 1 12 16
9: 9 1 1 1 1 0 10
10: 10 1 1 1 1 14 12
11: 11 1 1 1 1 NA 14
12: 12 1 1 1 14 14 16
13: 13 1 1 1 NA 16 10
14: 14 1 1 1 NA 16 12
15: 15 1 1 1 NA 16 14
16: 16 1 1 1 16 16 16
You can use relocate() (from dplyr 1.0.0 onwards):
library(dplyr)
vars <- c("C", "B", "A")
df %>%
relocate(all_of(sort(vars)), .after = last_col())
If you are passing a character vector of names you should wrap it in all_of() (which will error if any variables are missing) or any_of() which won't.
You can do
sortcols <- c("A","B","C")
library(dplyr)
df %>%
select(-sortcols, sort(sortcols))
The -sortcols part selects everything but the columns you want to sort and then you put the columns you want after those.
A base R option for a case which may or may not exist. If the columns that you want to sort are not at the end of the dataframe.
We add a new column D which you don't want to change the position of.
df$D <- 1:16
cols_to_sort <- c('A', 'B', 'C')
inds <- match(cols_to_sort, names(df))
cols <- seq_along(df)
cols[cols %in% inds] <- inds
df[cols]
# X1.16 Testinfo1 Band Alpha A B C D
#1 1 1 1 1 1 10 10 1
#2 2 1 1 1 1 0 12 2
#3 3 1 1 1 1 0 14 3
#4 4 1 1 1 1 0 16 4
#5 5 1 1 1 1 12 10 5
#6 6 1 1 1 1 12 12 6
#7 7 1 1 1 1 12 14 7
#8 8 1 1 1 1 12 16 8
#9 9 1 1 1 1 0 10 9
#10 10 1 1 1 1 14 12 10
#11 11 1 1 1 1 NA 14 11
#12 12 1 1 1 14 14 16 12
#13 13 1 1 1 NA 16 10 13
#14 14 1 1 1 NA 16 12 14
#15 15 1 1 1 NA 16 14 15
#16 16 1 1 1 16 16 16 16
I have a tall data frame as such:
data = data.frame("id"=c(1,2,3,4,5,6,7,8,9,10),
"group"=c(1,1,2,1,2,2,2,2,1,2),
"type"=c(1,1,2,3,2,2,3,3,3,1),
"score1"=c(sample(1:4,10,r=T)),
"score2"=c(sample(1:4,10,r=T)),
"score3"=c(sample(1:4,10,r=T)),
"score4"=c(sample(1:4,10,r=T)),
"score5"=c(sample(1:4,10,r=T)),
"weight1"=c(173,109,136,189,186,146,173,102,178,174),
"weight2"=c(147,187,125,126,120,165,142,129,144,197),
"weight3"=c(103,192,102,159,128,179,195,193,135,145),
"weight4"=c(114,182,199,101,111,116,198,123,119,181),
"weight5"=c(159,125,104,171,166,154,197,124,180,154))
library(reshape2)
library(plyr)
data1 <- reshape(data, direction = "long",
varying = list(c(paste0("score",1:5)),c(paste0("weight",1:5))),
v.names = c("score","weight"),
idvar = "id", timevar = "count", times = c(1:5))
data1 <- data1[order(data1$id), ]
And what I want to create is a new data frame like so:
want = data.frame("score"=rep(1:4,6),
"group"=rep(1:2,12),
"type"=rep(1:3,8),
"weightedCOUNT"=NA) # how to calculate this? count(data1, score, wt = weight)
I am just not sure how to calculate weightedCOUNT which should apply the weights to the score variable so then it gives in column 'weightedCOUNT' a weighted count that is aggregated by score and group and type.
An option would be to melt (from data.table - which can take multiple measure patterns, and then grouped by 'group', 'type' get the count
library(data.table)
library(dplyr)
melt(setDT(data), measure = patterns('^score', "^weight"),
value.name = c("score", "weight")) %>%
group_by(group, type) %>%
count(score, wt = weight)
If we need to have a complete set of combinations
library(tidyr)
melt(setDT(data), measure = patterns('^score', "^weight"),
value.name = c("score", "weight")) %>%
group_by(group, type) %>%
ungroup %>%
complete(group, type, score, fill = list(n = 0))
If I understand correctly, weightedCOUNT is the sum of weights grouped by score, group, and type.
For the sake of completeness, I would like to show how the accepted solution would look like when implemented in pure base R and pure data.table syntax, resp.
Base R
The OP was almost there. He has already reshaped data from wide to long format for multiple value variables. Only the final aggregation step was missing:
data1 <- reshape(data, direction = "long",
varying = list(c(paste0("score",1:5)),c(paste0("weight",1:5))),
v.names = c("score","weight"),
idvar = "id", timevar = "count", times = c(1:5))
result <- aggregate(weight ~ score + group + type, data1, FUN = sum)
result
score group type weight
1 1 1 1 479
2 3 1 1 558
3 4 1 1 454
4 1 2 1 378
5 2 2 1 154
6 3 2 1 174
7 4 2 1 145
8 1 2 2 535
9 2 2 2 855
10 3 2 2 248
11 4 2 2 499
12 1 1 3 189
13 2 1 3 351
14 3 1 3 600
15 4 1 3 362
16 1 2 3 596
17 2 2 3 265
18 3 2 3 193
19 4 2 3 522
result can be reordered by
with(result, result[order(score, group, type), ])
score group type weight
1 1 1 1 479
12 1 1 3 189
4 1 2 1 378
8 1 2 2 535
16 1 2 3 596
13 2 1 3 351
5 2 2 1 154
9 2 2 2 855
17 2 2 3 265
2 3 1 1 558
14 3 1 3 600
6 3 2 1 174
10 3 2 2 248
18 3 2 3 193
3 4 1 1 454
15 4 1 3 362
7 4 2 1 145
11 4 2 2 499
19 4 2 3 522
data.table
As shown by akrun, melt() from the data.table package can be combined with dplyr. Alternatively, we can stay with the data.table syntax for aggregation:
library(data.table)
cols <- c("score", "weight") # to save typing
melt(setDT(data), measure = patterns(cols), value.name = cols)[
, .(weightedCOUNT = sum(weight)), keyby = .(score, group, type)]
score group type weightedCOUNT
1: 1 1 1 479
2: 1 1 3 189
3: 1 2 1 378
4: 1 2 2 535
5: 1 2 3 596
6: 2 1 3 351
7: 2 2 1 154
8: 2 2 2 855
9: 2 2 3 265
10: 3 1 1 558
11: 3 1 3 600
12: 3 2 1 174
13: 3 2 2 248
14: 3 2 3 193
15: 4 1 1 454
16: 4 1 3 362
17: 4 2 1 145
18: 4 2 2 499
19: 4 2 3 522
The keyby parameter is used for grouping and ordering the output in one step.
Completion of missing combinations of the grouping variables is also possible in data.table syntax using the cross join function CJ():
melt(setDT(data), measure = patterns(cols), value.name = cols)[
, .(weightedCOUNT = sum(weight)), keyby = .(score, group, type)][
CJ(score, group, type, unique = TRUE), on = .(score, group, type)][
is.na(weightedCOUNT), weightedCOUNT := 0][]
score group type weightedCOUNT
1: 1 1 1 479
2: 1 1 2 0
3: 1 1 3 189
4: 1 2 1 378
5: 1 2 2 535
6: 1 2 3 596
7: 2 1 1 0
8: 2 1 2 0
9: 2 1 3 351
10: 2 2 1 154
11: 2 2 2 855
12: 2 2 3 265
13: 3 1 1 558
14: 3 1 2 0
15: 3 1 3 600
16: 3 2 1 174
17: 3 2 2 248
18: 3 2 3 193
19: 4 1 1 454
20: 4 1 2 0
21: 4 1 3 362
22: 4 2 1 145
23: 4 2 2 499
24: 4 2 3 522
score group type weightedCOUNT
I have a data table in the below format :
id c1 c2
1 1 NA
1 1 NA
1 1 10
1 1 NA
1 1 NA
1 1 10
1 1 NA
1 1 NA
1 1 11
1 1 NA
1 1 NA
1 1 11
2 1 NA
2 1 12
2 1 NA
2 1 NA
2 1 12
From this data table I would like to update all the NA in between the two values in c2 as below:
id c1 c2
1 1 NA
1 1 NA
1 1 10
1 1 10
1 1 10
1 1 10
1 1 NA
1 1 NA
1 1 11
1 1 11
1 1 11
1 1 11
2 1 NA
2 1 12
2 1 12
2 1 12
2 1 12
Can do it using a for loop and which():
df=data.frame(id = c(rep(1,12)),c2 = c(NA,NA,10,NA,NA,10, NA,NA,11,NA,11,NA))
Find unique values of c2:
vals=unique(df[which(!is.na(df$c2)),'c2'])
Loop through unique values and replace observations between their first and last appearance:
for(i in vals){
df[min(which(df$c2==i)):max(which(df$c2==i)),'c2']=i
}
Besides David's approach which is working directly with row indices there is another data.table approach which uses a non-equi join:
# coerce to data.table
setDT(DT)[
# append unique row id
, rn := .I][
# non-equi join on row ids
DT[!is.na(c2), .(rmin = min(rn), rmax = max(rn)), by = c2],
on = .(rn >= rmin, rn <= rmax), c2 := i.c2][
# remove row id column
, rn := NULL][]
id c1 c2
1: 1 1 NA
2: 1 1 NA
3: 1 1 10
4: 1 1 10
5: 1 1 10
6: 1 1 10
7: 1 1 NA
8: 1 1 NA
9: 1 1 11
10: 1 1 11
11: 1 1 11
12: 1 1 11
13: 2 1 NA
14: 2 1 12
15: 2 1 12
16: 2 1 12
17: 2 1 12
Caveat
The expression
DT[!is.na(c2), .(rmin = min(rn), rmax = max(rn)), by = c2]
returns the row id ranges for each unique value of c2
c2 rmin rmax
1: 10 3 6
2: 11 9 12
3: 12 14 17
There is an implicit assumption that the row id ranges do not overlap. It requires that each "gap" is associated with a unique c2 value. This affects other solutions 1, 2 as well.
Improved solution using rleid()
The code can be improved to handle cases where the above mentioned assumption is violated.
Using rleid(), we can distinguish different gaps even if the have the same c2 value. For instance, for the second sample data set
DT2[!is.na(c2), .(c2 = first(c2), rmin = min(rn), rmax = max(rn)), by = rleid(c2)]
rleid c2 rmin rmax
1: 1 10 3 6
2: 2 11 9 12
3: 3 12 14 17
4: 4 10 20 23
The complete code:
setDT(DT2)[, rn := .I][
DT2[!is.na(c2), .(c2 = first(c2), rmin = min(rn), rmax = max(rn)), by = rleid(c2)],
on = .(rn >= rmin, rn <= rmax), c2 := i.c2][, rn := NULL][]
id c1 c2
1: 1 1 NA
2: 1 1 NA
3: 1 1 10
4: 1 1 10
5: 1 1 10
6: 1 1 10
7: 1 1 NA
8: 1 1 NA
9: 1 1 11
10: 1 1 11
11: 1 1 11
12: 1 1 11
13: 2 1 NA
14: 2 1 12
15: 2 1 12
16: 2 1 12
17: 2 1 12
18: 2 1 NA
19: 2 1 NA
20: 2 1 10
21: 2 1 10
22: 2 1 10
23: 2 1 10
24: 2 1 NA
25: 2 1 NA
id c1 c2
Data
library(data.table)
DT <- fread("id c1 c2
1 1 NA
1 1 NA
1 1 10
1 1 NA
1 1 NA
1 1 10
1 1 NA
1 1 NA
1 1 11
1 1 NA
1 1 NA
1 1 11
2 1 NA
2 1 12
2 1 NA
2 1 NA
2 1 12")
Expanded data set (note the repeated appearance of c2 == 10):
DT2 <- fread("id c1 c2
1 1 NA
1 1 NA
1 1 10
1 1 NA
1 1 NA
1 1 10
1 1 NA
1 1 NA
1 1 11
1 1 NA
1 1 NA
1 1 11
2 1 NA
2 1 12
2 1 NA
2 1 NA
2 1 12
2 1 NA
2 1 NA
2 1 10
2 1 NA
2 1 NA
2 1 10
2 1 NA
2 1 NA")
Okay (new/edited answer), we can make use of the fact that the desired property of a solution is that filling up should yield the same result as filling down:
library(tidyverse)
df %>%
mutate(filled_down = c2, filled_up = c2) %>%
fill(filled_down, .direction="down") %>%
fill(filled_up, .direction="up") %>%
mutate(c2 = ifelse(filled_down == filled_up, filled_down, c2)) %>%
select(-filled_down, -filled_up)
Given a very large longitudinal dataset with different groups, I need to create a flag that indicates the first change in a certain variable (code) between years (year), per group (id). The type of observation within the same id-year just indicates different group members.
Sample data:
library(tidyverse)
sample <- tibble(id = rep(1:3, each=6),
year = rep(2010:2012, 3, each=2),
type = (rep(1:2, 9)),
code = c("abc","abc","","","xyz","xyz", "","","lmn","","efg","efg","def","def","","klm","nop","nop"))
What I need is to flag the first change to code within a group, between years. Second changes do not matter. Missing codes ("") can be treated as NA but in any case should not affect flag. The following is the above tibble with a flag field as it should be:
# A tibble: 18 × 5
id year type code flag
<int> <int> <int> <chr> <dbl>
1 1 2010 1 abc 0
2 1 2010 2 abc 0
3 1 2011 1 0
4 1 2011 2 0
5 1 2012 1 xyz 1
6 1 2012 2 xyz 1
7 2 2010 1 0
8 2 2010 2 0
9 2 2011 1 lmn 0
10 2 2011 2 0
11 2 2012 1 efg 1
12 2 2012 2 efg 1
13 3 2010 1 def 0
14 3 2010 2 def 0
15 3 2011 1 1
16 3 2011 2 klm 1
17 3 2012 1 nop 1
18 3 2012 2 nop 1
I still have a looping mindset and I am trying to use vectorized dplyr to do what I need.
Any input would be greatly appreciated!
EDIT: thanks for pointing this out regarding the importance of year. The id's are arranged by year, as the ordering is important here, and also all types per id per year need to have the same flag. So, in the edited row 15 e code is "" which would not warrant a change by itself, but since in the same year row 16 has a new code, both observations need to have their codes changed to 1.
We can use data.table
library(data.table)
setDT(sample)[, flag :=0][code!="", flag := {rl <- rleid(code)-1; cummax(rl*(rl < 2)) }, id]
sample
# id year type code flag
# 1: 1 2010 1 abc 0
# 2: 1 2010 2 abc 0
# 3: 1 2011 1 0
# 4: 1 2011 2 0
# 5: 1 2012 1 xyz 1
# 6: 1 2012 2 xyz 1
# 7: 2 2010 1 0
# 8: 2 2010 2 0
# 9: 2 2011 1 lmn 0
#10: 2 2011 2 0
#11: 2 2012 1 efg 1
#12: 2 2012 2 efg 1
#13: 3 2010 1 def 0
#14: 3 2010 2 def 0
#15: 3 2011 1 klm 1
#16: 3 2011 2 klm 1
#17: 3 2012 1 nop 1
#18: 3 2012 2 nop 1
Update
If we need to include the 'year' as well,
setDT(sample)[, flag :=0][code!="", flag := {rl <- rleid(code, year)-1
cummax(rl*(rl < 2)) }, id]
possible solution using the dplyr. not sure its the cleanest way though
sample %>%
group_by(id) %>%
#find first year per group where code exists
mutate(first_year = min(year[code != ""])) %>%
#gather all codes from first year (does not assume code is constant within year)
mutate(first_codes = list(code[year==first_year])) %>%
#if year is not first year & code not in first year codes & code not blank
mutate(flag = as.numeric(year != first_year & !(code %in% unlist(first_codes)) & code != "")) %>%
#drop created columns
select(-first_year, -first_codes) %>%
ungroup()
output
# A tibble: 18 × 5
id year type code flag
<int> <int> <int> <chr> <dbl>
1 1 2010 1 abc 0
2 1 2010 2 abc 0
3 1 2011 1 0
4 1 2011 2 0
5 1 2012 1 xyz 1
6 1 2012 2 xyz 1
7 2 2010 1 0
8 2 2010 2 0
9 2 2011 1 lmn 0
10 2 2011 2 0
11 2 2012 1 efg 1
12 2 2012 2 efg 1
13 3 2010 1 def 0
14 3 2010 2 def 0
15 3 2011 1 klm 1
16 3 2011 2 klm 1
17 3 2012 1 nop 1
18 3 2012 2 nop 1
A short solution with the data.table-package:
library(data.table)
setDT(samp)[, flag := 0][code!="", flag := 1*(rleid(code)-1 > 0), by = id]
Or:
setDT(samp)[, flag := 0][code!="", flag := 1*(code!=code[1] & code!=''), by = id][]
which gives the desired result:
> samp
id year type code flag
1: 1 2010 1 abc 0
2: 1 2010 2 abc 0
3: 1 2011 1 0
4: 1 2011 2 0
5: 1 2012 1 xyz 1
6: 1 2012 2 xyz 1
7: 2 2010 1 0
8: 2 2010 2 0
9: 2 2011 1 lmn 0
10: 2 2011 2 0
11: 2 2012 1 efg 1
12: 2 2012 2 efg 1
13: 3 2010 1 def 0
14: 3 2010 2 def 0
15: 3 2011 1 klm 1
16: 3 2011 2 klm 1
17: 3 2012 1 nop 1
18: 3 2012 2 nop 1
Or when the year is relevant as well:
setDT(samp)[, flag := 0][code!="", flag := 1*(rleid(code, year)-1 > 0), id]
A possible base R alternative:
f <- function(x) {
x <- rle(x)$lengths
1 * (rep(seq_along(x), times=x) - 1 > 0)
}
samp$flag <- 0
samp$flag[samp$code!=''] <- with(samp[samp$code!=''], ave(as.character(code), id, FUN = f))
NOTE: it is better not to give your object the same name as functions.
Used data:
samp <- data.frame(id = rep(1:3, each=6),
year = rep(2010:2012, 3, each=2),
type = (rep(1:2, 9)),
code = c("abc","abc","","","xyz","xyz", "","","lmn","","efg","efg","def","def","klm","klm","nop","nop"))