I want to calculate a complementary cumulative survival count for later display in a histogramm (without using ggplot). E.g. count the number of elements surviving 4.0 years, 4.5 years, 5.0 years and so on.
Input is 10000-row dataframe with 4 different types with 4 different distributions for life expectancy:
type <- c(rep("A",1000), rep("B",2000), rep("C",3000), rep("D",4000))
age <- c(rnorm(1000,6,0.5), rnorm(2000,8,0.5), rnorm(3000,10,0.5), rnorm(4000,12,0.5))
input <- data.frame(type,age,stringsAsFactors=FALSE)
Output is built using a range for the age steps:
range <- seq(floor(min(input$age)),ceiling(max(input$age)),0.5)
I expand into a data.frame with range and types:
combns <- expand.grid(age=range,type=LETTERS[1:4], stringsAsFactors=FALSE)
And then use the apply function to count the total number exceeding each age step:
CCSC.apply.all <- apply(combns[1:length(range),],1,function(x){
sum(input$age >= x["age"]) } )
and grouped by type:
CCSC.apply.type <- apply(combns,1,function(x){
sum(
input["age"] >= x["age"] &
input["type"] == x["type"]
) } )
Within the apply function input["age"] >= x["age"] is evaluated sometimes wrong. This results in a wrong count. In the table below columns 2:6 are created using apply, columns 7:11 with a for loop. 2:6 are wrong, 7:11 correct.
> output
range all-apply A-apply B-apply C-apply D-apply all-for A-for B-for C-for D-for
1 4,0 10000 1000 2000 3000 4000 10000 1000 2000 3000 4000
2 4,5 10000 1000 2000 3000 4000 9998 998 2000 3000 4000
3 5,0 10000 1000 2000 3000 4000 9978 978 2000 3000 4000
4 5,5 10000 1000 2000 3000 4000 9843 843 2000 3000 4000
5 6,0 10000 1000 2000 3000 4000 9483 483 2000 3000 4000
6 6,5 10000 1000 2000 3000 4000 9141 143 1998 3000 4000
7 7,0 10000 1000 2000 3000 4000 8981 23 1958 3000 4000
8 7,5 10000 1000 2000 3000 4000 8690 2 1688 3000 4000
9 8,0 10000 1000 2000 3000 4000 8030 0 1030 3000 4000
10 8,5 10000 1000 2000 3000 4000 7329 0 330 2999 4000
11 9,0 10000 1000 2000 3000 4000 6989 0 43 2946 4000
12 9,5 10000 1000 2000 3000 4000 6528 0 2 2526 4000
13 10,0 10000 1000 2000 3000 4000 5494 0 0 1494 4000
14 10,5 8961 1000 2000 1967 3994 4455 0 0 461 3994
15 11,0 8485 1000 2000 1571 3914 3979 0 0 65 3914
16 11,5 7900 1000 2000 1510 3390 3394 0 0 4 3390
17 12,0 6515 1000 2000 1506 2009 2009 0 0 0 2009
18 12,5 5123 1000 2000 1506 617 617 0 0 0 617
19 13,0 4594 1000 2000 1506 88 88 0 0 0 88
20 13,5 4513 1000 2000 1506 7 7 0 0 0 7
21 14,0 4506 1000 2000 1506 0 0 0 0 0 0
Can someone tell me what is the problem with my apply function?
An additional observation: against my expectation the apply method is about factor 100 slower than the for-loop.
Please find the full R script below:
rm(list=ls())
setwd("C:/R_test")
options(OutDec= ",") # to be deleted if not applicable for locale
set.seed(1234)
# creating input - data.frame 10000 data sets to be examined
# 4 different types with 4 different normal distributions for life expectancy
type <- c(rep("A",1000), rep("B",2000), rep("C",3000), rep("D",4000))
age <- c(rnorm(1000,6,0.5), rnorm(2000,8,0.5), rnorm(3000,10,0.5), rnorm(4000,12,0.5))
input <- data.frame(type,age,stringsAsFactors=FALSE)
# complementary cumulative survival count (CCSC)
range <- seq(floor(min(input$age)),ceiling(max(input$age)),0.5) # range for evaluation
# method "apply"
apply_time <- system.time( {
combns <- expand.grid(age=range,type=LETTERS[1:4], stringsAsFactors=FALSE)
CCSC.apply.all <- apply(combns[1:length(range),],1,function(x){
sum(input$age >= x["age"]) } ) # count survivors of all types
CCSC.apply.type <- apply(combns,1,function(x){
sum( # count survivors of certain type
input["age"] >= x["age"] &
input["type"] == x["type"]
) } )
})
output <- data.frame(range, matrix(c(CCSC.apply.all,CCSC.apply.type), nrow=length(range)))
# method "for loop"
for_time <- system.time( {
CCSC.for.all <- rep(0.0, length(range))
CCSC.for.type <- matrix(rep(0.0, 4*length(range)),nrow=length(range))
for(i in 1:length(range))
{
CCSC.for.all[i] <- sum(input$age >= range[i])
for(j in 1:4)
{
CCSC.for.type[i,j] <-
sum(
input["age"] >= range[i] &
input["type"] == LETTERS[j]
)
}
}
})
output <- cbind(output,CCSC.for.all,CCSC.for.type)
colnames(output) <- c("range",
"all-apply","A-apply","B-apply","C-apply","D-apply",
"all-for","A-for","B-for","C-for","D-for")
cat("\ntime for apply method: ", apply_time)
cat("\ntime for for loop method: ", for_time, "\n\n")
write.table(input, file = "CCSC_input.csv", sep=";", row.names=FALSE, dec=",")
write.table(output, file = "CCSC_output.csv", sep=";", row.names=FALSE, dec=",")
"count the number of elements surviving 4.0 years, 4.5 years, 5.0 years and so on."
Code:
using cut() to get age intervals.
1. By age_range and type:
library('data.table')
df <- setDT(input)[, .N, by = .(age_range = cut(age, range, include.lowest = TRUE), type)]
df[order(age_range),]
# age_range type N
# 1: (4.5,5] A 20
# 2: (5,5.5] A 123
# 3: (5.5,6] A 337
# 4: (6,6.5] A 352
# 5: (6,6.5] B 6
# 6: (6.5,7] A 151
# 7: (6.5,7] B 47
# 8: (7,7.5] A 16
# 9: (7,7.5] B 277
# 10: (7.5,8] A 1
# 11: (7.5,8] B 700
# 12: (8,8.5] B 654
# 13: (8,8.5] C 2
# 14: (8.5,9] B 273
# 15: (8.5,9] C 70
# 16: (9,9.5] B 39
# 17: (9,9.5] C 383
# 18: (9.5,10] B 4
# 19: (9.5,10] C 1023
# 20: (10,10.5] C 1065
# 21: (10,10.5] D 6
# 22: (10.5,11] C 406
# 23: (10.5,11] D 92
# 24: (11,11.5] C 49
# 25: (11,11.5] D 543
# 26: (11.5,12] C 2
# 27: (11.5,12] D 1363
# 28: (12,12.5] D 1334
# 29: (12.5,13] D 561
# 30: (13,13.5] D 92
# 31: (13.5,14] D 8
# 32: (14,14.5] D 1
2. By age_range only:
df <- setDT(input)[, .N, by = .(age_range = cut(age, range, include.lowest = TRUE))]
df[order(age_range),]
# age_range N
# 1: (4.5,5] 20
# 2: (5,5.5] 123
# 3: (5.5,6] 337
# 4: (6,6.5] 358
# 5: (6.5,7] 198
# 6: (7,7.5] 293
# 7: (7.5,8] 701
# 8: (8,8.5] 656
# 9: (8.5,9] 343
# 10: (9,9.5] 422
# 11: (9.5,10] 1027
# 12: (10,10.5] 1071
# 13: (10.5,11] 498
# 14: (11,11.5] 592
# 15: (11.5,12] 1365
# 16: (12,12.5] 1334
# 17: (12.5,13] 561
# 18: (13,13.5] 92
# 19: (13.5,14] 8
# 20: (14,14.5] 1
Data:
type <- c(rep("A",1000), rep("B",2000), rep("C",3000), rep("D",4000))
age <- c(rnorm(1000,6,0.5), rnorm(2000,8,0.5), rnorm(3000,10,0.5), rnorm(4000,12,0.5))
input <- data.frame(type,age,stringsAsFactors=FALSE)
range <- seq(floor(min(input$age)),ceiling(max(input$age)),0.5)
I struggled to understand exactly what you were looking for, so apologies if I've got this wrong. I've tried used split to make it easier to iterate over the type variable, then have used the purrr package to iterate rather than the apply family.
By being more explicit outside the iterating function - e.g. using unique(combns$age) - I think it's easier to understand what is being iterated over. For example, in your original code, I think x["age"] resulted in a character rather than a numeric as you were expecting.
FYI - differences in values are likely due to the use of rnorm in generating the data and not setting a seed.
# split input list by type
input_list <- split(input, type)
# for each type, calculate age >= each unique value of combns$age
purrr::map_df(input_list,
.f = function(y) {
purrr::map_dbl(unique(combns$age),
.f = function(x) sum(y$age >= x))
})
# A tibble: 21 x 4
A B C D
<dbl> <dbl> <dbl> <dbl>
1 1000 2000 3000 4000
2 1000 2000 3000 4000
3 970 2000 3000 4000
4 841 2000 3000 4000
5 458 2000 3000 4000
6 149 2000 3000 4000
7 32 1956 3000 4000
8 2 1704 3000 4000
9 0 1022 3000 4000
10 0 340 2997 4000
# … with 11 more rows
Related
I've got a very large dataset (millions of rows that I need to loop through thousands of times), and during the loop I have to do a conditional sum that appears to be taking a very long time. Is there a way of making this more efficient?
Datatable format as follows:
DT <- data.table('A' = c(1,1,1,2,2,3,3,3,3,4),
'B' = c(500,510,540,500,540,500,510,519,540,500),
'C' = c(10,20,10,20,10,50,20,50,20,10))
A
B
C
1
500
10
1
510
20
1
540
10
2
500
20
2
540
10
3
500
50
3
510
20
3
519
50
3
540
20
4
500
10
I need the sum of column C (in a new column, D) subject to A == A, and B >= B & B < B + 20 (by row). So the output table would look like the following:
A
B
C
D
1
500
10
30
1
510
20
30
1
540
10
10
2
500
20
20
2
540
10
10
3
500
50
120
3
510
20
120
3
519
50
120
3
540
20
20
4
500
10
10
The code I'm currently using:
DT[,D:= sum(DT$C[A == DT$A & ((B >= DT$B) & (B < DT$B + 20))]), by=c('A', 'B')]
This takes a very long time to actually run, as well as giving me the wrong answer. The output I get looks like this:
A
B
C
D
1
500
10
10
1
510
20
30
1
540
10
10
2
500
20
20
2
540
10
10
3
500
50
50
3
510
20
70
3
519
50
120
3
540
20
20
4
500
10
10
(i.e. D only appears to increase cumulatively).
I'm less concerned with the cumulative thing, more about speed. Ultimately what I'm trying to get to is the largest sum of C, by A, subject to B being within 20 of eachother. I would really appreciate any help on this! Thanks in advance.
If I understand correctly, this can be solved by a non-equi self join:
DT[, Bp20 := B + 20][
DT, on = .(A, B >= B, B < Bp20), mult = "last"][
, .(B, C = i.C, D = sum(i.C)), by = .(A, Bp20)][
, Bp20 := NULL][]
A B C D
1: 1 500 10 30
2: 1 510 20 30
3: 1 540 10 10
4: 2 500 20 20
5: 2 540 10 10
6: 3 500 50 120
7: 3 510 20 120
8: 3 519 50 120
9: 3 540 20 20
10: 4 500 10 10
# logic for B
DT[, g := B >= shift(B) & B < shift(B, 1) + 20, by = A]
# creating index column
DT[, gi := !g]
DT[is.na(gi), gi := T]
DT[, gi := cumsum(gi)]
DT[, D := sum(C), by = gi] # summing by new groups
DT
# A B C g gi D
# 1: 1 500 10 NA 1 30
# 2: 1 510 20 TRUE 1 30
# 3: 1 540 10 FALSE 2 10
# 4: 2 500 20 NA 3 20
# 5: 2 540 10 FALSE 4 10
# 6: 3 500 50 NA 5 120
# 7: 3 510 20 TRUE 5 120
# 8: 3 519 50 TRUE 5 120
# 9: 3 540 20 FALSE 6 20
# 10: 4 500 10 NA 7 10
You might need to adjust logic for B, as all edge cases isn't clear from the question... if for one A value we have c(30, 40, 50, 60), all of those rows are in one group?
I would like to turn the first table into the second by selecting the last observation of a group for a and b, the first observation for c, sum each observation for the group for d and e, and for f, check if a valid date exists and use that date.
Table 1:
ID a b c d e f
1 10 100 1000 10000 100000 ?
1 10 100 1001 10010 100100 5/07/1977
1 11 111 1002 10020 100200 5/07/1977
2 22 222 2000 20000 200000 6/02/1980
3 33 333 3000 30000 300000 20/12/1978
3 33 333 3001 30010 300100 ?
4 40 400 4000 40000 400000 ?
4 40 400 4001 40010 400100 ?
4 40 400 4002 40020 400200 7/06/1944
4 44 444 4003 40030 400300 ?
4 44 444 4004 40040 400400 ?
4 44 444 4005 40050 400500 ?
5 55 555 5000 50000 500000 31/05/1976
5 55 555 5001 50010 500100 31/05/1976
Table 2:
ID a b c d e f
1 11 111 1000 30030 300300 5/07/1977
2 22 222 2000 20000 200000 6/02/1980
3 33 333 3000 60010 600100 20/12/1978
4 44 444 4000 240150 2401500 7/06/1944
5 55 555 5000 100010 1000100 31/05/1976
I have looked up StackOverflow questions and I have only seen elements of this. I can do a through to e in the following steps.
library(data.table)
setwd('D:/Work/BRB/StackOverflow')
DT = data.table(fread('datatable.csv', header=TRUE))
AB = DT[ , .SD[.N], ID ]
AB = AB[ , c('a', 'b') ]
C = DT[ , .SD[1], ID ]
C = C[ , 'c' ]
DE = DT[ , .(d = sum(d), e = sum(e)) , by = ID ]
Final = cbind(AB, C, DE)
Final
My question is, can I do the operations on variables a, b, c, d, e in one transformation without having to split it into 3?
Also, I have no idea how to do f. Any suggestions?
Finally, I am new to R. Anything else I can improve about my code?
There are several things you can improve:
fread will return a data.table, so no need to wrap it in data.table. You can check with class(DT).
Use the na.strings parameter when reading in the data. See below for an example.
Summarise with:
DT[, .(a = a[.N],
b = b[.N],
c = c[1],
d = sum(d),
e = sum(e),
f = unique(na.omit(f)))
, by = ID]
you will then get:
ID a b c d e f
1: 1 11 111 1000 30030 300300 5/07/1977
2: 2 22 222 2000 20000 200000 6/02/1980
3: 3 33 333 3000 60010 600100 20/12/1978
4: 4 44 444 4000 240150 2401500 7/06/1944
5: 5 55 555 5000 100010 1000100 31/05/1976
Some explanations & other notes:
Subsetting with [1] will give you the first value of a group. You could also use the first-function which is optimized in data.table, and thus faster.
Subsetting with [.N] will give you the last value of a group. You could also use the last-function which is optimized in data.table, and thus faster.
Don't use variable names that are also functions in R (in this case, don't use c as a variable name). See also ?c for an explanation of what the c-function does.
For summarising the f-variable, I used unique in combination with na.omit. If there is more than one unique date by ID, you could also use for example na.omit(f)[1].
If speed is an issue, you could optimize the above to (thx to #Frank):
DT[order(f)
, .(a = last(a),
b = last(b),
c = first(c),
d = sum(d),
e = sum(e),
f = first(f))
, by = ID]
Ordering by f will put NA-values last. As a result now the internal GForce-optimization is used for all calculations.
Used data:
DT <- fread("ID a b c d e f
1 10 100 1000 10000 100000 ?
1 10 100 1001 10010 100100 5/07/1977
1 11 111 1002 10020 100200 5/07/1977
2 22 222 2000 20000 200000 6/02/1980
3 33 333 3000 30000 300000 20/12/1978
3 33 333 3001 30010 300100 ?
4 40 400 4000 40000 400000 ?
4 40 400 4001 40010 400100 ?
4 40 400 4002 40020 400200 7/06/1944
4 44 444 4003 40030 400300 ?
4 44 444 4004 40040 400400 ?
4 44 444 4005 40050 400500 ?
5 55 555 5000 50000 500000 31/05/1976
5 55 555 5001 50010 500100 31/05/1976", na.strings='?')
We can use tidyverse. After grouping by 'ID', we summarise the columns based on the first or last observation
library(dplyr)
DT %>%
group_by(ID) %>%
summarise(a = last(a),
b = last(b),
c = first(c),
d = sum(d),
e = sum(e),
f = f[f!="?"][1])
# A tibble: 5 × 7
# ID a b c d e f
# <int> <int> <int> <int> <int> <int> <chr>
#1 1 11 111 1000 30030 300300 5/07/1977
#2 2 22 222 2000 20000 200000 6/02/1980
#3 3 33 333 3000 60010 600100 20/12/1978
#4 4 44 444 4000 240150 2401500 7/06/1944
#5 5 55 555 5000 100010 1000100 31/05/1976
I have a data frame df1:
chr = c( 1,1,1,1,2,2,2,2)
point = c (257,752,135,1650,252,756,1230,1710)
df1 = data.frame(chr, point)
chr point
1 1 257
2 1 752
3 1 135
4 1 1650
5 2 252
6 2 756
7 2 1230
8 2 1710
I would like to add a new column to this called name. The name to be allocated comes from a reference data frame df2:
chrB = c( 1,1,1,1,2,2,2,2)
txstart = c(0,501,1001,1501,0,501,1001,1501)
txstop = c(500,1000,1500,2000,500,1000,1500,2000)
name2 = c("F","W","Q","G","V","S","L","Y")
chrB txstart txstop name2
1 2 0 500 F
2 2 501 1000 W
3 2 1001 1500 Q
4 2 1501 2000 G
5 1 0 500 V
6 1 501 1000 S
7 1 1001 1500 L
8 1 1501 2000 Y
Where chr in df1 is the same as chrB in df2 AND point in df1 lies between values txstart and txstop the name2 in df2 should be added to df1. result I would like is below:
chr point name
1 1 257 V
2 1 752 S
3 1 135 L
4 1 1650 Y
5 2 252 F
6 2 756 W
7 2 1230 Q
8 2 1710 G
Any help much appreciated!!!
With the updated dataset only the foverlaps method works:
dt1 <- data.table(chr, mp1 = point, mp2 = point,
key = c("chr","mp1", "mp2"))
dt2 <- data.table(chrB, txstart, txstop, name2,
key = c("chrB","txstart", "txstop"))
foverlaps(dt1, dt2, type="within")[, .(chr, midpoint=mp1, name=name2)][]
which gives:
chr midpoint name
1: 1 135 F
2: 1 257 F
3: 1 752 W
4: 1 1650 G
5: 2 252 V
6: 2 756 S
7: 2 1230 L
8: 2 1710 Y
Old answer:
When you want to look whether the midpoint is between the start and stop point of df2, you could use:
df1$name <- df2$name2[match(df1$chr,df2$chrB) &
df1$midpoint > df2$txstart &
df1$midpoint < df2$txstop]
which gives:
> df1
chr midpoint name
1 1 250 F
2 1 750 W
3 1 1250 Q
4 1 1750 G
5 2 250 V
6 2 750 S
7 2 1250 L
8 2 1750 Y
As an alternative approach, you could use the foverlaps function from the data.table package:
library(data.table)
dt1 <- data.table(chr, mp1 = midpoint, mp2 = midpoint, key = c("chr","mp1", "mp2"))
dt2 <- data.table(chrB, txstart, txstop, name2, key = c("chrB","txstart", "txstop"))
foverlaps(dt1, dt2, type="within", nomatch=0L)[, .(chr, midpoint=mp1, name=name2)][]
which gives the same result:
chr midpoint name
1: 1 250 F
2: 1 750 W
3: 1 1250 Q
4: 1 1750 G
5: 2 250 V
6: 2 750 S
7: 2 1250 L
8: 2 1750 Y
I have a data.table that looks like this:
DT <- data.table(A=1:20, B=1:20*10, C=1:20*100)
DT
A B C
1: 1 10 100
2: 2 20 200
3: 3 30 300
4: 4 40 400
5: 5 50 500
...
20: 20 200 2000
I want to be able to calculate a new column "D" that has the first value as the average of the first 20 rows in column B as the first value, and then I want to use the first row of column D to help calculate the next row value of D.
Say the Average of the first 20 rows of column B is 105. and the formula for the next row in column D is this : DT$D[1]+DT$C[2]
where I take the previous row value of D and add the row value of C.
The third row will then look like this: DT$D[2]+DT$C[3]
A B C D
1: 1 10 100 105
2: 2 20 200 305
3: 3 30 300 605
4: 4 40 400 1005
5: 5 50 500 1505
...
20: 20 200 2000 21005
Any ideas on this would be made?
I think shift would be a great help to lag, but dont know how to get rid of the NA that it produces at the first instance?
We can take the mean of the first 20 rows of column B and add the cumulative sum of C. The cumulative sum has one special consideration that we want to add a concatenation of 0 and column C without the first value.
DT[, D := mean(B[1:20]) + cumsum(c(0, C[-1]))][]
# A B C D
# 1: 1 10 100 105
# 2: 2 20 200 305
# 3: 3 30 300 605
# 4: 4 40 400 1005
# 5: 5 50 500 1505
# 6: 6 60 600 2105
# 7: 7 70 700 2805
# 8: 8 80 800 3605
# 9: 9 90 900 4505
# 10: 10 100 1000 5505
# 11: 11 110 1100 6605
# 12: 12 120 1200 7805
# 13: 13 130 1300 9105
# 14: 14 140 1400 10505
# 15: 15 150 1500 12005
# 16: 16 160 1600 13605
# 17: 17 170 1700 15305
# 18: 18 180 1800 17105
# 19: 19 190 1900 19005
# 20: 20 200 2000 21005
I have a data.table as below:
order products value
1000 A|B 10
2000 B|C 20
3000 A|C 30
4000 B|C|D 5
5000 C|D 15
And I need to break the column products and transform/normalize to be used like this:
order prod.seq prod.name value
1000 1 A 10
1000 2 B 10
2000 1 B 20
2000 2 C 20
3000 1 A 30
3000 2 C 30
4000 1 B 5
4000 2 C 5
4000 3 D 5
5000 1 C 15
5000 2 D 15
I guess I can do it using a custom FOR/LOOP but I'd like to know a more advanced way to do that using apply,ddply methods. Any suggestions?
First, convert to a character/string:
DT[,products:=as.character(products)]
Then you can split the string:
DT[,{
x = strsplit(products,"\\|")[[1]]
list( prod.seq = seq_along(x), prod_name = x )
}, by=.(order,value)]
which gives
order value prod.seq prod_name
1: 1000 10 1 A
2: 1000 10 2 B
3: 2000 20 1 B
4: 2000 20 2 C
5: 3000 30 1 A
6: 3000 30 2 C
7: 4000 5 1 B
8: 4000 5 2 C
9: 4000 5 3 D
10: 5000 15 1 C
11: 5000 15 2 D
Here is the another option
library(splitstackshape)
out = cSplit(dat, "products", "|", direction = "long")
out[, prod.seq := seq_len(.N), by = value]
#> out
# order products value prod.seq
# 1: 1000 A 10 1
# 2: 1000 B 10 2
# 3: 2000 B 20 1
# 4: 2000 C 20 2
# 5: 3000 A 30 1
# 6: 3000 C 30 2
# 7: 4000 B 5 1
# 8: 4000 C 5 2
# 9: 4000 D 5 3
#10: 5000 C 15 1
#11: 5000 D 15 2
After cSplit step, using ddply
library(plyr)
ddply(out, .(value), mutate, prod.seq = seq_len(length(order)))
using dplyr
library(dplyr)
out %>% group_by(value) %>% mutate(prod.seq = row_number(order))
using lapply
rbindlist(lapply(split(out, out$value),
function(x){x$prod.seq = seq_len(length(x$order));x}))