I have data stored in a data frame as follows:
CST M QS
501 1204315 1
501 1204324 1
501 6041541 2
501 7508226 1
501 7509677 1
501 7514622 1
503 7511601 2
507 6961200 2
507 7514100 2
507 7522029 1
509 7512374 1
509 7516799 2
511 419110 0.5
511 6000832 5
511 6960800 3.33
511 7010000 2
511 7508229 2.5
511 7508307 2
511 7515126 2
Now I would like to reshape this data so that, for each CST, the different M values are stored in separate columns. These columns should be dynamic: a CST can have just 1 M, or 10, or 20, or any number, and that many columns (M1, M2, M3 and so on) have to be generated. QS has to be displayed as the sum of its values per CST (the Total column).
Sample output is as below
CST M1 M2 M3 M4 M5 M6 M7 Total
501 1204315 1204324 6041541 7508226 7509677 7514622 7
503 7511601 2
507 6961200 7514100 7522029 5
509 7512374 7516799 3
511 419110 6000832 6960800 7010000 7508229 7508307 7515126 17.33
I have used transpose(), but that just converts rows to columns and does not produce the expected output.
I have also tried the data.table approach dt[, sum(QS), by = "CST"], but I am not able to display the M values alongside it in separate columns.
Thanks in advance.
We can use data.table. After converting the 'data.frame' to a 'data.table' (setDT(df1)), we group by 'CST', get the sequence of rows (paste0("M", 1:.N)) and the sum of 'QS', and assign (:=) the output to two new columns ("N" and "Total"). Then we use dcast to convert from long to wide format.
library(data.table)
setDT(df1)[, c("N", "Total") := list(paste0("M", 1:.N), sum(QS)), CST]
dcast(df1, CST+Total~N, value.var='M')
# CST Total M1 M2 M3 M4 M5 M6 M7
#1: 501 7.00 1204315 1204324 6041541 7508226 7509677 7514622 NA
#2: 503 2.00 7511601 NA NA NA NA NA NA
#3: 507 5.00 6961200 7514100 7522029 NA NA NA NA
#4: 509 3.00 7512374 7516799 NA NA NA NA NA
#5: 511 17.33 419110 6000832 6960800 7010000 7508229 7508307 7515126
Or we use the same methodology with dplyr/tidyr
library(dplyr)
library(tidyr)
df1 %>%
  group_by(CST) %>%
  mutate(Total = sum(QS), N = paste0("M", row_number())) %>%
  select(-QS) %>%
  spread(N, M)
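As a side note, in current tidyr (1.0.0 and later) spread() has been superseded by pivot_wider(); a roughly equivalent sketch of the same approach:
library(dplyr)
library(tidyr)
# same reshaping with the newer pivot_wider() interface
df1 %>%
  group_by(CST) %>%
  mutate(Total = sum(QS), N = paste0("M", row_number())) %>%
  select(-QS) %>%
  pivot_wider(names_from = N, values_from = M)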
Update
If we need the columns in their original order, we can convert "N" to a factor with the levels specified:
setDT(df2)[, c("N", "Total") := list(paste0("M", 1:.N), sum(QS)), CST]
df2[, N:= factor(N, levels=unique(N))]
dcast(df2, CST+Total~N, value.var="M")
data
df2 <- structure(list(CST = c(501L, 501L, 501L, 501L, 501L,
501L, 501L,
501L, 501L, 501L, 501L, 501L, 503L, 507L, 507L, 507L, 509L, 509L,
511L, 511L, 511L, 511L, 511L, 511L, 511L), M = c(1204315L, 1204324L,
6041541L, 7508226L, 7509677L, 7434399L, 7843392L, 7834393L, 8343999L,
3439242L, 3434323L, 7514622L, 7511601L, 6961200L, 7514100L, 7522029L,
7512374L, 7516799L, 419110L, 6000832L, 6960800L, 7010000L, 7508229L,
7508307L, 7515126L), QS = c(1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,
1, 2, 2, 2, 1, 1, 2, 0.5, 5, 3.33, 2, 2.5, 2, 2)),
.Names = c("CST",
"M", "QS"), class = "data.frame", row.names = c(NA, -25L))
Related
I've been trying to combine the two for loops below into a single loop.
Loop 1:
Unique.Order.Comment <- unique(df2$Rebuilt.Order.Comment)
length(Unique.Order.Comment)

# loop for the calculations
for (i in 1:length(Unique.Order.Comment)) {
  #a <- i-11
  #c[i] <- print(sum(n.Cases.per.month$nCases[a:i]))
  a <- subset.data.frame(Rebuilt.Data, Rebuilt.Order.Comment == Unique.Order.Comment[i])
  assign(Unique.Order.Comment[i], a)
}
Loop 2:
# loop for the calculations
c <- rep(0, nrow(BR))
for (ii in 1:nrow(BR)) {
  if (ii < 12) {
    print(0)
  } else {
    a <- ii - 11
    c[ii] <- print(sum(BR$Number.Cases.Authorised[a:ii]))
  }
}
c <- data.frame(c)
c <- c %>%
  rename(n.Seen.Cum = c)
#View(c)
BR <- cbind(BR, c)
In Loop 2, BR needs to become Unique.Order.Comment[i].
What I believe/hope it should look like is shown below.
But I get the error message Error in rep(0, nrow(Unique.Order.Comment[i])) : invalid 'times' argument
(What I think it should look like)
Unique.Order.Comment <- unique(df2$Rebuilt.Order.Comment)
length(Unique.Order.Comment)

# loop for the calculations
for (i in 1:length(Unique.Order.Comment)) {
  #a <- i-11
  #c[i] <- print(sum(n.Cases.per.month$nCases[a:i]))
  a <- subset.data.frame(Rebuilt.Data, Rebuilt.Order.Comment == Unique.Order.Comment[i])
  assign(Unique.Order.Comment[i], a)

  # loop for the calculations
  c <- rep(0, nrow(Unique.Order.Comment[i]))
  for (ii in 1:nrow(Unique.Order.Comment[i])) {
    if (ii < 12) {
      print(0)
    } else {
      a <- ii - 11
      c[ii] <- print(sum(Unique.Order.Comment[i]$Number.Cases.Authorised[a:ii]))
    }
  }
  c <- data.frame(c)
  c <- c %>%
    rename(n.Seen.Cum = c)
  #View(c)
  Unique.Order.Comment[i] <- cbind(Unique.Order.Comment[i], c)
}
Edit example data:
dput(Unique.Order.Comment)
c("CN", "DM", "DR", "FF", "PG", "HN", "SK", "GI", "GYN", "BR",
"UR", "LYMPH", "HPB", "BST", "ENDOC", "PAEDGI", "CT", "PERI",
"NEURO", "MOHS", "ICC", "RE", "PAED", "MN", "EMR", "PR", "LBX",
"HAEM", "CTT", "UGI", "NEUR", "URGI", "GYNAE")
dput(head(Rebuilt.Data))
structure(list(Rebuilt.Order.Comment = c("BR", "BR", "BR", "BR",
"BR", "BR"), Period.Received = c("2019-01", "2019-02", "2019-03",
"2019-04", "2019-05", "2019-06"), Number.Cases.Received = c(838L,
730L, 778L, 832L, 574L, 626L), Number.Cases.Authorised = c(680L,
587L, 896L, 715L, 761L, 554L), Number.Cases.Authorised.Less7Days = c(550L,
343L, 520L, 389L, 393L, 374L), Number.Cases.Authorised.Less10.Days = c(628L,
475L, 723L, 595L, 555L, 474L), Percentage.Authorsied.Less7Days = c(0.808823529411765,
0.584327086882453, 0.580357142857143, 0.544055944055944, 0.516425755584757,
0.675090252707581), Percentage.Authorsied.Less10Days = c(0.923529411764706,
0.809199318568995, 0.806919642857143, 0.832167832167832, 0.729303547963206,
0.855595667870036), Avg.TaT.for.Authorised.Cases = structure(c(5.26470588235294,
8.74616695059625, 8.34709821428571, 8.09370629370629, 12.826544021025,
6.22021660649819), class = "difftime", units = "days"), MDM.Received = c(2L,
13L, 2L, NA, NA, 5L), MDM.Received.Avg.TAT = structure(c(5, 29.2307692307692,
0.5, NA, NA, 5.4), class = "difftime", units = "days"), So.Received = c(NA,
1L, NA, 1L, NA, 2L), So.Received.Avg.TAT = structure(c(NA, 14,
NA, 9, NA, 54), class = "difftime", units = "days")), row.names = c(NA,
6L), class = "data.frame")
If I place print(Unique.Order.Comment[i]) before the second, separate loop I get:
"CN"
In theory, the first loop subsets the data based upon a unique list of Order.Comment values (which it does).
Then it calculates a cumulative sum, and this gets cbind()-ed onto the subsetted data.
First, it is easier to help if you provide a small example along with your expected output. You can share your original data with the columns that are not necessary to the question removed, or create a fake dataset that resembles your original data.
Second, I think you are overcomplicating this. It is never a good idea to create multiple datasets in your global environment: they are difficult to manage and unnecessarily pollute the global environment. You can use lists instead.
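(Incidentally, this is also where your error comes from: Unique.Order.Comment[i] is a character string, not a data frame, so nrow() returns NULL and rep(0, NULL) fails with "invalid 'times' argument".) A minimal sketch of the list-based approach, assuming Rebuilt.Data as shared above:
# keep one data frame per comment together in a named list,
# instead of assign()-ing each subset into the global environment
data.list <- split(Rebuilt.Data, Rebuilt.Data$Rebuilt.Order.Comment)
# individual subsets are then accessed by name, e.g.
data.list[["BR"]]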
In this case, I don't think we even need to split the dataset into a list, as there are several packages that can perform rolling calculations by group. For example, below I have used the zoo package, which has the rollsumr function.
library(dplyr)
library(zoo)
df <- df %>%
  group_by(Rebuilt.Order.Comment) %>%
  mutate(n.Seen.Cum = rollsumr(Number.Cases.Authorised, 12, fill = 0)) %>%
  ungroup()
df
# Rebuilt.Order.Comment Period.Received Number.Cases.Authorised n.Seen.Cum
# <chr> <chr> <int> <int>
# 1 BR 2019-01 680 0
# 2 BR 2019-02 587 0
# 3 BR 2019-03 896 0
# 4 BR 2019-04 715 0
# 5 BR 2019-05 761 0
# 6 BR 2019-06 554 0
# 7 BR 2019-07 843 0
# 8 BR 2019-08 815 0
# 9 BR 2019-09 704 0
#10 BR 2019-10 939 0
#11 BR 2019-11 834 0
#12 BR 2019-12 880 9208
#13 BR 2020-01 801 9329
#14 BR 2020-02 610 9352
#15 BR 2020-03 853 9309
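If the data is very large, data.table's frollsum() can do the same grouped rolling sum; a minimal sketch, assuming the same df as above:
library(data.table)
# 12-month rolling sum of authorised cases within each comment group
setDT(df)[, n.Seen.Cum := frollsum(Number.Cases.Authorised, 12, fill = 0),
          by = Rebuilt.Order.Comment]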
I think I see what you are aiming for, but I may have missed something. Let me know, and I can edit.
From what I can tell, you only need one loop, and instead of assign()ing a bunch of dataframes, you can iteratively build a summary table.
edit
The other answer here is quite elegant! I'm updating my answer based on your new comments just for fun. Not sure why we have different n.Seen.Cum values...
df2 <- structure(list(
Rebuilt.Order.Comment = c("BR", "BR", "BR", "BR", "BR", "BR", "BR", "BR", "BR", "BR", "BR", "BR", "BR", "BR", "BR" ),
Period.Received = c("2019-01", "2019-02", "2019-03", "2019-04", "2019-05", "2019-06", "2019-07", "2019-08", "2019-09", "2019-10", "2019-11", "2019-12", "2020-01", "2020-02", "2020-03"),
Number.Cases.Authorised = c(680L, 587L, 896L, 715L, 761L, 554L, 843L, 815L, 704L, 939L, 834L, 880L, 801L, 610L, 853L),
n.Seen.Cum = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9208, 9329, 9352, 9309)),
row.names = c(NA, 15L), class = "data.frame")
# This will hold results
output <- list()
# Loop over this vector
Unique.Order.Comment <- unique(df2$Rebuilt.Order.Comment)
for(comment in Unique.Order.Comment){
# Temporary dataframe that is subset of 'df2'
temp <- df2[df2$Rebuilt.Order.Comment == comment,]
# We can do arithmetic with dates that have days
temp$Period.Received2 <- as.Date(paste(temp$Period.Received, "-01", sep=""))
# Calculate cumsum after 333 days have passed
temp$n.Seen.cum2 <- ifelse(
test = temp$Period.Received2 - min(temp$Period.Received2) > 333,
yes = cumsum(temp$Number.Cases.Authorised),
no = NA)
# better
output[[comment]] <- temp
# quick and dirty
# assign(x = comment, value = temp)
}
output[[1]]
#> Rebuilt.Order.Comment Period.Received Number.Cases.Authorised n.Seen.Cum
#> 1 BR 2019-01 680 0
#> 2 BR 2019-02 587 0
#> 3 BR 2019-03 896 0
#> 4 BR 2019-04 715 0
#> 5 BR 2019-05 761 0
#> 6 BR 2019-06 554 0
#> 7 BR 2019-07 843 0
#> 8 BR 2019-08 815 0
#> 9 BR 2019-09 704 0
#> 10 BR 2019-10 939 0
#> 11 BR 2019-11 834 0
#> 12 BR 2019-12 880 9208
#> 13 BR 2020-01 801 9329
#> 14 BR 2020-02 610 9352
#> 15 BR 2020-03 853 9309
#> Period.Received2 n.Seen.cum2
#> 1 2019-01-01 NA
#> 2 2019-02-01 NA
#> 3 2019-03-01 NA
#> 4 2019-04-01 NA
#> 5 2019-05-01 NA
#> 6 2019-06-01 NA
#> 7 2019-07-01 NA
#> 8 2019-08-01 NA
#> 9 2019-09-01 NA
#> 10 2019-10-01 NA
#> 11 2019-11-01 NA
#> 12 2019-12-01 9208
#> 13 2020-01-01 10009
#> 14 2020-02-01 10619
#> 15 2020-03-01 11472
If you have multiple years and want the cumulative sum to reset, update the test parameter in ifelse() to include some max number of days.
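For instance, a hypothetical yearly reset could group the running total by calendar year instead (illustrative only; temp and the column names come from the loop above):
# hypothetical: restart the cumulative sum at the start of each calendar year
temp$year <- format(temp$Period.Received2, "%Y")
temp$n.Seen.cum2 <- ave(temp$Number.Cases.Authorised, temp$year, FUN = cumsum)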
I need to delete some rows in my dataset based on the given condition.
Kindly go through the sample data for reference:
ID Date Dur
123 01/05/2000 3
123 08/04/2002 6
564 04/04/2012 2
741 01/08/2011 5
789 02/03/2009 1
789 08/01/2010 NA
789 05/05/2011 NA
852 06/06/2015 3
852 03/02/2016 NA
155 03/02/2008 NA
155 01/01/2009 NA
159 07/07/2008 NA
My main concern is the Dur column. For IDs that have more than one row, I have to delete the rows where Dur is not NA.
That is, IDs 123, 789 and 852 have more than one record, so I need to remove their rows that carry a Dur value: all rows of ID 123, and the first record of IDs 789 and 852.
I don't want to delete IDs that have a Dur value in a single record (564, 741), or any other IDs with NA in Dur.
Expected Output:
ID Date Dur
564 04/04/2012 2
741 01/08/2011 5
789 08/01/2010 NA
789 05/05/2011 NA
852 03/02/2016 NA
155 03/02/2008 NA
155 01/01/2009 NA
159 07/07/2008 NA
Kindly suggest code to solve the issue.
Thanks in advance!
One way would be to select rows where the number of rows in the group is 1, or where Dur is NA.
This can be written in dplyr as:
library(dplyr)
df %>% group_by(ID) %>% filter(n() == 1 | is.na(Dur))
# ID Date Dur
# <int> <chr> <int>
#1 564 04/04/2012 2
#2 741 01/08/2011 5
#3 789 08/01/2010 NA
#4 789 05/05/2011 NA
#5 852 03/02/2016 NA
#6 155 03/02/2008 NA
#7 155 01/01/2009 NA
#8 159 07/07/2008 NA
Using data.table:
library(data.table)
setDT(df)[, .SD[.N == 1 | is.na(Dur)], ID]
and base R:
subset(df, ave(is.na(Dur), ID, FUN = function(x) length(x) == 1 | x))
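In the base R version, ave() applies the function within each ID group: a group of length 1 is kept in full, and otherwise only its NA rows are kept.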
data
df <- structure(list(ID = c(123L, 123L, 564L, 741L, 789L, 789L, 789L,
852L, 852L, 155L, 155L, 159L), Date = c("01/05/2000", "08/04/2002",
"04/04/2012", "01/08/2011", "02/03/2009", "08/01/2010", "05/05/2011",
"06/06/2015", "03/02/2016", "03/02/2008", "01/01/2009", "07/07/2008"
), Dur = c(3L, 6L, 2L, 5L, 1L, NA, NA, 3L, NA, NA, NA, NA)),
class = "data.frame", row.names = c(NA, -12L))
We can use .I in data.table
library(data.table)
setDT(df1)[df1[, .I[.N == 1| is.na(Dur)], ID]$V1]
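Here the inner expression collects the qualifying row indices per ID into V1, and the outer df1[...] subsets by those indices in a single step, which avoids materialising .SD for every group.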
I have two data frames, one consisting of numerical values called 'esame':
media id_poll fin
1 5.330000e+00 360 1
2 6.833333e-02 361 0
3 0.000000e+00 362 0
4 NA 363 0
5 8.200000e-01 364 0
6 3.416667e-01 365 0
7 0.000000e+00 366 0
8 0.000000e+00 367 0
9 0.000000e+00 368 0
10 NA 369 0
11 6.150000e-01 370 0
12 0.000000e+00 371 0
13 0.000000e+00 372 0
14 NA 373 0
15 0.000000e+00 374 0
16 0.000000e+00 375 0
17 0.000000e+00 376 0
18 1.298333e+00 377 0
And the second one consists of numerical ranges, which I would like to use to check which range the 'media' field of the first data.frame falls in.
If it falls in the first range I would like to assign "1" to the field "fin" of the first data.frame, if in the second I would like to assign "2", and so on.
So here it is the second data.frame with some of the conditions I'll need:
Range1 Range2 Range3 Range4 ID
0.5 9.9 29.9 >30 360
0.5 15.9 49.9 >50 361
0 4.9 24.9 >25 362
First of all, I suppose I won't need to declare Range4, as that information is already implied by Range3. I removed the initial value of each numerical range, as I need just a single number to check against (or so I think). The row for ID 360 could equally be written as:
Range1 Range2 Range3 Range4 ID
0.5 0.6-9.9 10-29.9 >30 360
So my guess is to do something like this:
esame$fin <- ifelse(esame$media < 0.6 & datofinale$id_poll == "360", "1", "0")
I could substitute the "0" value with another 'ifelse' statement and go on manually.
Is there any faster way to do that? (The list containing all the conditions is actually much larger than the example.)
Thank you for any advice.
Not too nice, but this should work:
require(dplyr)
inner_join(esame, df1, by = c("id_poll" = "ID")) %>%
  rowwise() %>%
  mutate(fin = findInterval(media, c(-Inf, Range1, Range2, Range3), left.open = TRUE))
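findInterval() returns the index of the interval each media value falls in: with the breakpoints c(-Inf, Range1, Range2, Range3) it yields 1 for values up to Range1, 2 up to Range2, 3 up to Range3, and 4 above that, while left.open = TRUE makes each interval open on the left and closed on the right.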
Reproducible data
esame <- structure(list(media = c(5.33, 0.06833333, 0, NA, 0.82, 0.3416667,
  0, 0, 0, NA, 0.615, 0, 0, NA, 0, 0, 0, 1.298333), id_poll = 360:377,
  fin = c(1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
  0L, 0L, 0L, 0L, 0L)), .Names = c("media", "id_poll", "fin"),
  row.names = c(NA, -18L), class = c("data.table", "data.frame"))
df1 <- structure(list(Range1 = c(0.5, 0.5, 0), Range2 = c(9.9, 15.9,
  4.9), Range3 = c(29.9, 49.9, 24.9), Range4 = c(">30", ">50",
  ">25"), ID = 360:362), .Names = c("Range1", "Range2", "Range3",
  "Range4", "ID"), row.names = c(NA, -3L), class = c("data.table",
  "data.frame"))
dplyr solution
Using case_when
library(dplyr)
df2 <- left_join(esame, df1, by = c("id_poll" = "ID")) %>%
  mutate(fin = case_when(media > Range3 ~ 4,
                         media > Range2 ~ 3,
                         media > Range1 ~ 2,
                         media <= Range1 ~ 1,
                         is.na(Range1) ~ 0))  # else case
Output
media ID fin Range1 Range2 Range3 Range4
1 5.33000000 360 2 0.5 9.9 29.9 >30
2 0.06833333 361 1 0.5 15.9 49.9 >50
3 0.00000000 362 1 0.0 4.9 24.9 >25
4 NA 363 0 NA NA NA <NA>
5 0.82000000 364 0 NA NA NA <NA>
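case_when() evaluates its conditions in order and returns the value of the first one that is TRUE, which is why the checks run from the widest range down; the is.na(Range1) clause catches the rows that found no match in df1 after the join.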
We can treat each row of the range data.frame as a vector and ask whether the current media value is greater than each value in that vector.
For simplicity, I'm assuming that every value in the first data.frame has a correspondent in the second, and that both are ordered the same way.
for (i in 1:nrow(esame)) {
  # returns a vector of TRUE (greater than this boundary) and FALSE (within);
  # we want the position of the first FALSE
  greater.than <- esame[i, 1] > range[i, 1:3]
  # position of the last TRUE + 1, i.e. the position of the first FALSE
  esame$fin[i] <- max(which(greater.than)) + 1
}
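One caveat: when media is NA, or not greater than any of the boundaries, which(greater.than) is empty and max() returns -Inf with a warning, so that case may need a guard.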
dat is the first df, tad the second. This puts 0 if NA, uses nested ifelse(), and assumes the first range runs from 0 to the given value. However, please show some expected results so we can check that it works properly.
dat$fin <- sapply(1:nrow(dat), function(x)
  ifelse(dat[x, 1] >= tad[x, 1] & !is.na(dat[x, 1]), 1,
         ifelse(dat[x, 1] >= tad[x, 2] & !is.na(dat[x, 1]), 2,
                ifelse(dat[x, 1] >= tad[x, 3] & !is.na(dat[x, 1]), 3, 0))))
>dat
media id_poll fin
1 5.33000000 360 1
2 0.06833333 361 0
3 0.00000000 362 1
I have a dataset that looks something like this:
df <- structure(list(Claim.Num = c(500L, 500L, 600L, 600L, 700L, 700L,
100L, 200L, 300L), Amount = c(NA, 1000L, NA, 564L, 0L, 200L,
NA, 0L, NA), Company = structure(c(NA, 1L, NA, 4L, 2L, 3L, NA,
3L, NA), .Label = c("ATT", "Boeing", "Petco", "T Mobile"), class = "factor")), .Names =
c("Claim.Num", "Amount", "Company"), class = "data.frame", row.names = c(NA,
-9L))
I want to remove duplicate rows based on Claim.Num values, but only remove duplicates that meet the following criteria: is.na(df$Company) | df$Amount == 0.
In other words, remove records 1, 3, and 5.
I've gotten this far: df <- df[!duplicated(df$Claim.Num[which(df$Amount == 0 | is.na(df$Company))]), ]
The code runs without errors, but doesn't actually remove the duplicate rows matching the criteria. I think that's because I'm telling it to remove duplicated Claim.Nums among the rows matching those criteria, rather than to remove duplicated Claim.Num rows while treating certain Amounts and Companies preferentially for removal. Please note that I can't simply filter the dataset on those values, as there are other records with 0 or NA values that require inclusion (e.g. records 8 & 9 shouldn't be excluded, because their Claim.Nums are not duplicated).
If you order your data frame first, then you can make sure duplicated keeps the ones you want:
df.tmp <- with(df, df[order(ifelse(is.na(Company) | Amount == 0, 1, 0)), ])
df.tmp[!duplicated(df.tmp$Claim.Num), ]
# Claim.Num Amount Company
# 2 500 1000 ATT
# 4 600 564 T Mobile
# 6 700 200 Petco
# 7 100 NA <NA>
# 8 200 0 Petco
# 9 300 NA <NA>
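The trick is that order() sorts the preferred rows (non-NA Company, non-zero Amount) to the top within each Claim.Num, so duplicated() keeps one of those and flags the rest for removal.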
Slightly different approach
r <- merge(df,
aggregate(df$Amount,by=list(Claim.Num=df$Claim.Num),length),
by="Claim.Num")
result <-r[!(r$x>1 & (is.na(r$Company) | (r$Amount==0))),-ncol(r)]
result
# Claim.Num Amount Company
# 1 100 NA <NA>
# 2 200 0 Petco
# 3 300 NA <NA>
# 5 500 1000 ATT
# 7 600 564 T Mobile
# 9 700 200 Petco
This adds a column x to indicate which rows have Claim.Num present more than once, then filters the result based on your criteria. The use of -ncol(r) just removes the column x at the end.
Another way based on subset and logical indices:
subset(df, !(duplicated(Claim.Num) | duplicated(Claim.Num, fromLast = TRUE)) |
  (!is.na(Amount) & Amount))
Claim.Num Amount Company
2 500 1000 ATT
4 600 564 T Mobile
6 700 200 Petco
7 100 NA <NA>
8 200 0 Petco
9 300 NA <NA>
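Here the first condition keeps Claim.Num values that occur only once (flagged neither from the front nor from the back by duplicated()), while the second keeps any row with a non-missing, non-zero Amount.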
I have a huge data file in long format, parts of which are supplied below. Each ID can have several rows, where STATUS is the final status.

However, I need to do the analysis with time-varying covariates, and hence need to create two new time variables and update the STATUS variable. I've been struggling with this for some time now and cannot figure out how to do it efficiently, as there can be as many as four rows per ID.

The time-varying variable is NUM.AFTER.DIAG. If NUM.AFTER.DIAG == 0 it is easy: time1 = 0 and time2 = STATUSDATE. However, when NUM.AFTER.DIAG == 1, I need to make a new row where time1 = 0, time2 = DOB - DATE.DIAG and NUM.AFTER.DIAG = 0, and also make sure STATUS = "B". The second row would then have time1 = time2 from the previous row, and time2 = STATUSDATE - DATE.DIAG - time1 from this row. Equally, if there are more rows, the successive rows need to be subtracted from each other. Also, if NUM.AFTER.DIAG == 0 but there are multiple rows, all the extra rows can be deleted.
Any ideas for an efficient solution to this?
I've looked at John Fox's unfold command, but it assumes that all the intervals are in wide format to begin with.
Edit: The table as requested. As for the censor variable, "D" = event (death):
structure(list(ID = c(187L, 258L, 265L, 278L, 281L, 281L, 283L,
283L, 284L, 291L, 292L, 292L, 297L, 299L, 305L, 305L, 311L, 311L,
319L, 319L, 319L, 322L, 322L, 329L, 329L, 333L, 333L, 333L, 334L,
334L), STATUS = c("D", "B", "B", "B", "B", "B", "D", "D", "B",
"B", "B", "B", "D", "D", "D", "D", "B", "B", "B", "B", "B", "D",
"D", "B", "B", "D", "D", "D", "D", "D"), STATUSDATE = structure(c(11153,
15034, 15034, 15034, 15034, 15034, 5005, 5005, 15034, 15034,
15034, 15034, 6374, 5005, 7562, 7562, 15034, 15034, 15034, 15034,
15034, 7743, 7743, 15034, 15034, 4670, 4670, 4670, 5218, 5218
), class = "Date"), DATE.DIAG = structure(c(4578, 4609, 4578,
4487, 4670, 4670, 4517, 4517, 4640, 4213, 4397, 4397, 4397, 4487,
4213, 4213, 4731, 4731, 4701, 4701, 4701, 4397, 4397, 4578, 4578,
4275, 4275, 4275, 4456, 4456), class = "Date"), DOB = structure(c(NA,
13010, NA, NA, -1082, -626, 73, 1353, 13679, NA, 1626, 3087,
-626, -200, 2814, 3757, 1930, 3787, 6740, 13528, 14167, 5462,
6557, 7865, 9235, -901, -504, -108, -535, -78), class = "Date"),
NUM.AFTER.DIAG = c(0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 2, 3, 1, 2, 1, 2, 0, 0, 0, 0, 0)), .Names = c("ID",
"STATUS", "STATUSDATE", "DATE.DIAG", "DOB", "NUM.AFTER.DIAG"), row.names = c(NA,
30L), class = "data.frame")
EDIT: I did come up with a solution, although probably not a very efficient one.
u1 <- ddply(p, .(ID), function(x) {
  if (all(x$NUM.AFTER.DIAG == 0)) {
    x$time1 <- 0
    x$time2 <- x$STATUSDATE - x$DATE.DIAG
    x <- x[1, ]
  } else {
    x <- rbind(x, x[1, ])
    x <- x[order(x$DOB), ]
    u <- max(x$NUM.AFTER.DIAG)
    x$NUM.AFTER.DIAG <- 0:u
    x$time1[1] <- 0
    x$time2[1:u] <- x$DOB[2:(u + 1)] - x$DATE.DIAG[2:(u + 1)]
    x$time2[u + 1] <- x$STATUSDATE[u] - x$DATE.DIAG[u]
    x$time1[2:(u + 1)] <- x$time2[1:u]
    x$STATUS[1:u] <- "B"
  }
  x
})
Ok, I've tried something, but I'm not sure I understand your transformation process entirely, so let me know if there are mistakes. In general, ddply will be slow (even when .parallel = TRUE) when there are many individuals, mainly because at the end it has to bring all the individuals' data sets together and rbind (or rbind.fill) them, which takes forever for a multitude of data.frame objects.
So here's a suggestion, where dat.orig is your toy data set:
I would first split the task in two:
1) NUM.AFTER.DIAG == 0
2) NUM.AFTER.DIAG != 0
1) It seems that if NUM.AFTER.DIAG == 0, apart from calculating time2 and extracting the first row when an ID occurs more than once (like ID 333), there is not much to do in part 1):
## erase multiple occurences
dat <- dat.orig[!(duplicated(dat.orig$ID) & dat.orig$NUM.AFTER.DIAG == 0), ]
dat0 <- dat[dat$NUM.AFTER.DIAG == 0, ]
dat0$time1 <- 0
dat0$time2 <- difftime(dat0$STATUSDATE, dat0$DATE.DIAG, unit = "days")
time.na <- is.na(dat0$DOB)
dat0$time1[time.na] <- dat0$time2[time.na] <- NA
> dat0
ID STATUS STATUSDATE DATE.DIAG DOB NUM.AFTER.DIAG time1 time2
1 187 D 2000-07-15 1982-07-15 <NA> 0 NA NA days
3 265 B 2011-03-01 1982-07-15 <NA> 0 NA NA days
4 278 B 2011-03-01 1982-04-15 <NA> 0 NA NA days
5 281 B 2011-03-01 1982-10-15 1967-01-15 0 0 10364 days
7 283 D 1983-09-15 1982-05-15 1970-03-15 0 0 488 days
10 291 B 2011-03-01 1981-07-15 <NA> 0 NA NA days
11 292 B 2011-03-01 1982-01-15 1974-06-15 0 0 10637 days
13 297 D 1987-06-15 1982-01-15 1968-04-15 0 0 1977 days
14 299 D 1983-09-15 1982-04-15 1969-06-15 0 0 518 days
15 305 D 1990-09-15 1981-07-15 1977-09-15 0 0 3349 days
17 311 B 2011-03-01 1982-12-15 1975-04-15 0 0 10303 days
26 333 D 1982-10-15 1981-09-15 1967-07-15 0 0 395 days
29 334 D 1984-04-15 1982-03-15 1968-07-15 0 0 762 days
2) is a little trickier, but all you actually have to do is insert one more row and calculate the time variables:
## create subset with relevant observations
dat.unfold <- dat[dat$NUM.AFTER.DIAG != 0, ]
## compute time differences
time1 <- difftime(dat.unfold$DOB, dat.unfold$DATE.DIAG, unit = "days")
time1[time1 < 0] <- 0
time2 <- difftime(dat.unfold$STATUSDATE, dat.unfold$DATE.DIAG, unit = "days")
## calculate indices for individuals
n.obs <- daply(dat.unfold, .(ID), function(z) max(z$NUM.AFTER.DIAG) + 1)
df.new <- data.frame(ID = rep(unique(dat.unfold$ID), times = n.obs))
rle.new <- rle(df.new$ID)
ind.last <- cumsum(rle.new$lengths)
ind.first <- !duplicated(df.new$ID)
ind.first.w <- which(ind.first)
ind.second <- ind.first.w + 1
ind2.to.last <- unlist(sapply(seq_along(ind.second),
function(z) ind.second[z]:ind.last[z]))
## insert time variables
df.new$time2 <- df.new$time1 <- NA
df.new$time1[ind.first] <- 0
df.new$time1[!ind.first] <- time1
df.new$time2[!ind.first] <- time2
df.new$time2[ind2.to.last - 1] <- time1
this gives me:
> df.new
ID time1 time2
1 258 0 8401
2 258 8401 10425
3 284 0 9039
4 284 9039 10394
5 319 0 2039
6 319 2039 8827
7 319 8827 9466
8 319 9466 10333
9 322 0 1065
10 322 1065 2160
11 322 2160 3346
12 329 0 3287
13 329 3287 4657
14 329 4657 10456
This should work for the STATUS variable and the other variables in a similar fashion.
When both steps are working separately, you just have to do one rbind step at the end.
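That last step could look roughly like this (a sketch, assuming df.new has by then been given the same columns as dat0, including STATUS):
## hypothetical final combine: both parts now share the same columns
dat.final <- rbind(dat0, df.new)
dat.final <- dat.final[order(dat.final$ID), ]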