How to Matching conditions in R Dataframe - r

I am having dataframe which looks like:
Count_ID Stats Date
123 A 10-01-2017
123 A 12-01-2017
123 B 15-01-2017
456 B 18-01-2017
456 C 17-01-2017
789 A 20-01-2017
486 A 25-01-2017
486 A 28-01-2017
I want to add a Status & Count column in Dataframe which give me below mention status.
Match oldest Count_ID as per date having Stats as "A" compare if any Count_ID with same value (i.e 123) is having date > than that Previous same Count_ID having Stats as "A", than show it "False" in status column.
If there are multiple Count_ID with same value (i.e 123) than check Stats "A" than match any same Count_ID with Stats other than "A" or "A" are having date > than of those having Stats "A", than show status as "False"
If there are multiple same Count_ID (i.e 123) having Stats as "A" with date difference <30 days (w.r.t the previous Count_ID as per Date) show status as "False-B".
In count column, show difference of days between same Count_ID created from previous Count_ID.
Where no condition show it as "-".
Required Output:
Count_ID Stats Date Status Count
123 A 10-01-2017 False-B 0
123 A 12-01-2017 False-B 2
123 B 15-01-2017 False 3
456 B 18-01-2017 - 0
456 C 17-01-2017 False 1
789 A 20-01-2017 - 0
486 A 25-01-2017 False-B 0
486 A 28-01-2017 False-B 3
Dput:
structure(list(Count_ID = c(123L, 123L, 123L, 456L, 456L, 789L,
486L, 486L), Stats = c("A", "A", "B", "B", "C", "A", "A", "A"
), Date = c("10/01/2017", "12/01/2017", "15/01/2017", "18/01/2017",
"17/01/2017", "20/01/2017", "25/01/2017", "28/01/2017")), .Names = c("Count_ID",
"Stats", "Date"), class = "data.frame", row.names = c(NA, -8L
))

If I understood the question correctly then you can try this
library(dplyr)
df %>%
group_by(Count_ID) %>%
mutate(Count = c(0, abs(as.numeric(diff(Date)))),
Status = ifelse((Date==min(Date[Stats=='A']) | Date>min(Date[Stats=='A'])) & (n()>1), "FALSE", "-")) %>%
mutate(Status = ifelse(Stats=='A' & Count < 30 & Status=='FALSE', 'FALSE-B', Status)) %>%
data.frame()
Note that the condition for "row item 5" is not clear so I have left it as -. I am not sure how you want to go about this row as there is no Stats = A for Count_ID = 456.
Output is:
Count_ID Stats Date Count Status
1 123 A 2017-01-10 0 FALSE-B
2 123 A 2017-01-12 2 FALSE-B
3 123 B 2017-01-15 3 FALSE
4 456 B 2017-01-18 0 -
5 456 C 2017-01-17 1 -
6 789 A 2017-01-20 0 -
7 486 A 2017-01-25 0 FALSE-B
8 486 A 2017-01-28 3 FALSE-B
Sample data:
df <- structure(list(Count_ID = c(123L, 123L, 123L, 456L, 456L, 789L,
486L, 486L), Stats = c("A", "A", "B", "B", "C", "A", "A", "A"
), Date = structure(c(17176, 17178, 17181, 17184, 17183, 17186,
17191, 17194), class = "Date")), .Names = c("Count_ID", "Stats",
"Date"), row.names = c(NA, -8L), class = "data.frame")

Related

R Combinig two For Loops (Nested For Loop)

I've been trying to combine the two For Loops into a single loop.
Loop 1:
Unique.Order.Comment <- unique(df2$Rebuilt.Order.Comment)
length(Unique.Order.Comment)
#loop for the calculations
for (i in 1:length(Unique.Order.Comment)) {
#a <- i-11
#c[i] <- print(sum(n.Cases.per.month$nCases[a:i]))
a <- subset.data.frame(Rebuilt.Data, Rebuilt.Order.Comment == Unique.Order.Comment[i])
assign(Unique.Order.Comment[i],a)
}
Loop 2:
#loop for the calculations
c <- rep(0, nrow(BR))
for (ii in 1:nrow(BR)) {
if (ii < 12){
print(0)
}else {
a <- ii-11
c[ii] <- print(sum(BR$Number.Cases.Authorised[a:ii]))
}
}
c <- data.frame(c)
c <- c %>%
rename(
n.Seen.Cum = c
)
#View(c)
BR <- cbind(BR,c)
The BR need to be Unique.Order.Comment[i] in Loop 2.
What I believe/hope it would look like should be the below.
But I get the error message Error in rep(0, nrow(Unique.Order.Comment[i])) : invalid 'times' argument
(What I think it should look like)
Unique.Order.Comment <- unique(df2$Rebuilt.Order.Comment)
length(Unique.Order.Comment)
#loop for the calculations
for (i in 1:length(Unique.Order.Comment)) {
#a <- i-11
#c[i] <- print(sum(n.Cases.per.month$nCases[a:i]))
a <- subset.data.frame(Rebuilt.Data, Rebuilt.Order.Comment == Unique.Order.Comment[i])
assign(Unique.Order.Comment[i],a)
#loop for the calculations
c <- rep(0, nrow(Unique.Order.Comment[i]))
for (ii in 1:nrow(Unique.Order.Comment[i])) {
if (ii < 12){
print(0)
}else {
a <- ii-11
c[ii] <- print(sum(Unique.Order.Comment[i]$Number.Cases.Authorised[a:ii]))
}
}
c <- data.frame(c)
c <- c %>%
rename(
n.Seen.Cum = c
)
#View(c)
Unique.Order.Comment[i] <- cbind(Unique.Order.Comment[i],c)
}
Edit example data:
dput(Unique.Order.Comment)
c("CN", "DM", "DR", "FF", "PG", "HN", "SK", "GI", "GYN", "BR",
"UR", "LYMPH", "HPB", "BST", "ENDOC", "PAEDGI", "CT", "PERI",
"NEURO", "MOHS", "ICC", "RE", "PAED", "MN", "EMR", "PR", "LBX",
"HAEM", "CTT", "UGI", "NEUR", "URGI", "GYNAE")
dput(head(Rebuilt.Data))
structure(list(Rebuilt.Order.Comment = c("BR", "BR", "BR", "BR",
"BR", "BR"), Period.Received = c("2019-01", "2019-02", "2019-03",
"2019-04", "2019-05", "2019-06"), Number.Cases.Received = c(838L,
730L, 778L, 832L, 574L, 626L), Number.Cases.Authorised = c(680L,
587L, 896L, 715L, 761L, 554L), Number.Cases.Authorised.Less7Days = c(550L,
343L, 520L, 389L, 393L, 374L), Number.Cases.Authorised.Less10.Days = c(628L,
475L, 723L, 595L, 555L, 474L), Percentage.Authorsied.Less7Days = c(0.808823529411765,
0.584327086882453, 0.580357142857143, 0.544055944055944, 0.516425755584757,
0.675090252707581), Percentage.Authorsied.Less10Days = c(0.923529411764706,
0.809199318568995, 0.806919642857143, 0.832167832167832, 0.729303547963206,
0.855595667870036), Avg.TaT.for.Authorised.Cases = structure(c(5.26470588235294,
8.74616695059625, 8.34709821428571, 8.09370629370629, 12.826544021025,
6.22021660649819), class = "difftime", units = "days"), MDM.Received = c(2L,
13L, 2L, NA, NA, 5L), MDM.Received.Avg.TAT = structure(c(5, 29.2307692307692,
0.5, NA, NA, 5.4), class = "difftime", units = "days"), So.Received = c(NA,
1L, NA, 1L, NA, 2L), So.Received.Avg.TAT = structure(c(NA, 14,
NA, 9, NA, 54), class = "difftime", units = "days")), row.names = c(NA,
6L), class = "data.frame")
if I place print(Unique.Order.Comment[i]) before the second seperate loop I get:
"CN"
In theory the first loop subsets data based upon a unique list of Order.Comment (which it can do).
Then it does a cumlative sum and this gets cbind onto the subsetted data.
First, it is easier to help if you provide a small example along with the your expected output. You can share your original data removing the columns which are not necessary to the question or create a fake dataset which is similar to your original data.
Second, I think you are overcomplicating this. It is never a good idea to create multiple datasets in your global environment. They are very difficult to manage and unnecessary pollute the global environment. You can use lists instead.
In this case I don't think we need to split the datasets in different lists as we have different packages that can perform rolling calculations. For example, below I have used zoo package which has rollsumr function.
library(dplyr)
library(zoo)
df <- df %>%
group_by(Rebuilt.Order.Comment) %>%
mutate(n.Seen.Cum = rollsumr(Number.Cases.Authorised, 12, fill = 0)) %>%
ungroup
df
# Rebuilt.Order.Comment Period.Received Number.Cases.Authorised n.Seen.Cum
# <chr> <chr> <int> <int>
# 1 BR 2019-01 680 0
# 2 BR 2019-02 587 0
# 3 BR 2019-03 896 0
# 4 BR 2019-04 715 0
# 5 BR 2019-05 761 0
# 6 BR 2019-06 554 0
# 7 BR 2019-07 843 0
# 8 BR 2019-08 815 0
# 9 BR 2019-09 704 0
#10 BR 2019-10 939 0
#11 BR 2019-11 834 0
#12 BR 2019-12 880 9208
#13 BR 2020-01 801 9329
#14 BR 2020-02 610 9352
#15 BR 2020-03 853 9309
I think I see what you are aiming for, but I may have missed something. Let me know, and I can edit.
From what I can tell, you only need one loop, and instead of assign()ing a bunch of dataframes, you can iteratively build a summary table.
edit
The other answer here is quite elegant! I'm updating my answer based on your new comments just for fun. Not sure why we have different n.Seen.Cum values...
df2 <- structure(list(
Rebuilt.Order.Comment = c("BR", "BR", "BR", "BR", "BR", "BR", "BR", "BR", "BR", "BR", "BR", "BR", "BR", "BR", "BR" ),
Period.Received = c("2019-01", "2019-02", "2019-03", "2019-04", "2019-05", "2019-06", "2019-07", "2019-08", "2019-09", "2019-10", "2019-11", "2019-12", "2020-01", "2020-02", "2020-03"),
Number.Cases.Authorised = c(680L, 587L, 896L, 715L, 761L, 554L, 843L, 815L, 704L, 939L, 834L, 880L, 801L, 610L, 853L),
n.Seen.Cum = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9208, 9329, 9352, 9309)),
row.names = c(NA, 15L), class = "data.frame")
# This will hold results
output <- list()
# Loop over this vector
Unique.Order.Comment <- unique(df2$Rebuilt.Order.Comment)
for(comment in Unique.Order.Comment){
# Temporary dataframe that is subset of 'df2'
temp <- df2[df2$Rebuilt.Order.Comment == comment,]
# We can do arithmetic with dates that have days
temp$Period.Received2 <- as.Date(paste(temp$Period.Received, "-01", sep=""))
# Calculate cumsum after 333 days have passed
temp$n.Seen.cum2 <- ifelse(
test = temp$Period.Received2 - min(temp$Period.Received2) > 333,
yes = cumsum(temp$Number.Cases.Authorised),
no = NA)
# better
output[[comment]] <- temp
# quick and dirty
# assign(x = comment, value = temp)
}
output[[1]]
#> Rebuilt.Order.Comment Period.Received Number.Cases.Authorised n.Seen.Cum
#> 1 BR 2019-01 680 0
#> 2 BR 2019-02 587 0
#> 3 BR 2019-03 896 0
#> 4 BR 2019-04 715 0
#> 5 BR 2019-05 761 0
#> 6 BR 2019-06 554 0
#> 7 BR 2019-07 843 0
#> 8 BR 2019-08 815 0
#> 9 BR 2019-09 704 0
#> 10 BR 2019-10 939 0
#> 11 BR 2019-11 834 0
#> 12 BR 2019-12 880 9208
#> 13 BR 2020-01 801 9329
#> 14 BR 2020-02 610 9352
#> 15 BR 2020-03 853 9309
#> Period.Received2 n.Seen.cum2
#> 1 2019-01-01 NA
#> 2 2019-02-01 NA
#> 3 2019-03-01 NA
#> 4 2019-04-01 NA
#> 5 2019-05-01 NA
#> 6 2019-06-01 NA
#> 7 2019-07-01 NA
#> 8 2019-08-01 NA
#> 9 2019-09-01 NA
#> 10 2019-10-01 NA
#> 11 2019-11-01 NA
#> 12 2019-12-01 9208
#> 13 2020-01-01 10009
#> 14 2020-02-01 10619
#> 15 2020-03-01 11472
If you have multiple years and want the cumulative sum to reset, update the test parameter in ifelse() to include some max number of days.

R, Cumulative Sum in Reverse

Let's say we have two tables:
A table of budgets:
Item Budget
A 900
B 350
C 100
D 0
bDT = structure(list(Item = c("A", "B", "C", "D"), Budget = c(900L,
350L, 100L, 0L)), .Names = c("Item", "Budget"), row.names = c(NA,
-4L), class = "data.frame")
and a table of expected expenses by item per date.
Item Date Expense
A 2017-08-24 850
B 2017-08-18 300
B 2017-08-11 50
C 2017-08-18 50
C 2017-08-11 100
D 2017-08-01 500
expDF = structure(list(Item = c("A", "B", "B", "C", "C", "D"), Date = structure(c(17402,
17396, 17389, 17396, 17389, 17379), class = "Date"), Expense = c(850L,
300L, 50L, 50L, 100L, 500L)), .Names = c("Item", "Date", "Expense"
), row.names = c(NA, -6L), class = "data.frame")
I'm looking to summarize the amount we can spend per item per date like this:
Item Date Spend
A 8/24/2017 850
B 8/18/2017 300
B 8/11/2017 50
C 8/18/2017 50
C 8/11/2017 50
D 8/1/2017 0
This works:
library(data.table)
setDT(bDF); setDT(expDF)
expDF[bDF, on=.(Item), Spending :=
pmin(
Expense,
pmax(
0,
Budget - cumsum(shift(Expense, fill=0))
)
)
, by=.EACHI]
Item Date Expense Spending
1: A 2017-08-24 850 850
2: B 2017-08-18 300 300
3: B 2017-08-11 50 50
4: C 2017-08-18 50 50
5: C 2017-08-11 100 50
6: D 2017-08-01 500 0
How it works
cumsum(shift(Expense, fill = 0)) is prior spending**
max(0, Budget - prior spending) is remaining budget
min(Expense, remaining budget) is current spending
The data.table syntax x[i, on=, j, by=.EACHI] is a join. In this case j takes the form v := expr, which adds a new column to x. See ?data.table for details.
** Well, "prior" in ordering of the table. I'll ignore the OP's weird reversed dates.

R conditional filter

I have two dataframes in R.
Release dataframe
Date Product
2011-01-13 A
2011-02-15 A
2011-01-14 B
2011-02-15 B
Casedata dataframe
Date Product Numberofcases
2011-01-13 A 50
2011-01-12 A 20
2011-01-11 A 100
2011-01-10 A 120
2011-01-09 A 150
2011-01-08 A 180
2011-01-07 A 200
2011-01-06 A 220
2011-01-23 A 500
2011-01-31 A 450
2011-02-08 A 50
2011-02-09 A 1000
2011-02-10 A 1200
2011-02-11 A 1500
2011-02-12 A 1800
2011-02-13 A 2000
2011-02-14 A 2200
2011-02-15 A 5000
2011-01-31 A 4500
:::
:::
2011-01-15 B 1000
My requirement is for every product release date(from release dataframe), I should obtain the corresponding sum(numberofcases) one week prior to the release date(in the casedata dataframe). ie., for product A and release date 2011-01-13, it should be sum of all cases in the previous week (from 2011-01-06 to 2011-01-13) ie., (50+20+100+120+150+180+200+220)
Releasedate Product Numberofcasesoneweekpriorrelease
2011-01-13 A 1040
2011-02-15 A 19250
2011-01-14 B ...
2011-02-15 B ...
What I have tried :
beforerelease <- sqldf("select product,release.date_release,sum(numberofcasescreated) as numberofcasesbeforerelease from release left join casedata using (product) where date_case>=weekbeforerelease and date_case<=date_release group by product,date_release")
finaldf <- merge(beforerelease,afterelease,by=c("monthyear","product"))
I am struck and it is not giving me the expected outcome. Can somebody help me ?
Using the recently implemented non-equi joins feature in the current development version of data.table, v1.9.7, this can be done simply as (assuming all Date columns are of class Date):
require(data.table)
setDT(release)[, Date2 := Date-7L]
setDT(casedata)[release, on = .(Product, Date >= Date2, Date <= Date),
.(count = sum(Numberofcases)), by = .EACHI]
# Product Date Date count
# 1: A 2011-01-06 2011-01-13 1040
# 2: A 2011-02-08 2011-02-15 14750
# 3: B 2011-01-07 2011-01-14 NA
# 4: B 2011-02-08 2011-02-15 NA
With the data.table package you could follow two approaches:
1) Using the foverlaps functionality:
library(data.table)
# convert to a 'data.table' with 'setDT()'
# and create a release window
setDT(release)[, `:=` (bdat = as.Date(Date)-7, edat = as.Date(Date))][, Date := NULL]
# convert to a 'data.table' and create a 2nd date column for use with 'foverlaps
setDT(casedata)[, `:=` (bdat = as.Date(Date), edat = as.Date(Date))][, Date := NULL]
# set the key for use in 'foverlaps'
setkey(release, Product, bdat, edat)
setkey(casedata, Product, bdat, edat)
# do an overlap join ('foverlaps') and summarise
foverlaps(casedata, release, type = 'within', nomatch = 0L)[, .(cases.prior.release = sum(Numberofcases)), by = .(Product, release.date = edat)]
which gives:
Product release.date cases.prior.release
1: A 2011-01-13 1040
2: A 2011-02-15 14750
2) Using the standard join functionality of data.table:
setDT(release)
setDT(casedata)
casedata[, Date := as.Date(Date)
][release[, `:=` (Date = as.Date(Date), idx = .I)
][, .(dates = seq(Date-7,Date,'day')), by = .(Product,idx)],
on = c('Product', Date = 'dates'), nomatch = 0L
][, .(releasedate = Date[.N], cases.prior.release = sum(Numberofcases)), by = .(Product,idx)
][, idx := NULL]
which will get you the same result.
Used data:
release <- structure(list(Date = c("2011-01-13", "2011-02-15", "2011-01-14", "2011-02-15"),
Product = c("A", "A", "B", "B")),
.Names = c("Date", "Product"), class = "data.frame", row.names = c(NA, -4L))
casedata <- structure(list(Date = c("2011-01-13", "2011-01-12", "2011-01-11", "2011-01-10", "2011-01-09", "2011-01-08", "2011-01-07", "2011-01-06", "2011-01-23", "2011-01-31", "2011-02-08", "2011-02-09", "2011-02-10", "2011-02-11", "2011-02-12", "2011-02-13", "2011-02-14", "2011-02-15", "2011-01-31"),
Product = c("A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A"),
Numberofcases = c(50L, 20L, 100L, 120L, 150L, 180L, 200L, 220L, 500L, 450L, 50L, 1000L, 1200L, 1500L, 1800L, 2000L, 2200L, 5000L, 4500L)),
.Names = c("Date", "Product", "Numberofcases"), class = "data.frame", row.names = c(NA, -19L))

Aggregating Dataset to "ignore" categorical variable

I have this dataset wich is structured like this
Neighborhood, var1, var2, COUNTRY, DAY, categ 1, categ 2
1 700 724 AL 0 YES YES
1 500 200 FR 0 YES NO
....
1 701 659 IT 1 NO YES
1 791 669 IT 1 NO YES
....
2 239 222 GE 0 YES NO
and so on...
So that the hyerarchy is "Neighborhood > DAY > COUNTRY" and for every neighborhood,for every day, for every country I have the observation of var1,var2,categ1 and categ2
I'm not interested for the moment in analyzing the country, so what I want to do is to aggregate that (by summing "over" the country field var1 and var2, the categorical variables categ1 and categ2 are not influenced by the country), and have a dataset that for each Neighborhood and for each Day gives me the infos on var1, var2, categ1 and categ2
I'm quite new to R-programming and basically don't know a lot of packages (I would write a program in c++, but I'm forcing myself to learn R)...
So do you have any idea on how to do this?
Data
df1 <- structure(list(Neighborhood = c(1L, 1L, 1L, 1L, 2L),
var1 = c(700L, 500L, 701L, 791L, 239L),
var2 = c(724L, 200L, 659L, 669L, 222L),
COUNTRY = c("AL", "FR", "IT", "IT", "GE"),
DAY = c(0L, 0L, 1L, 1L, 0L),
`categ 1` = c("YES", "YES", "NO", "NO", "YES"),
`categ 2` = c("YES", "NO", "YES", "YES", "NO")),
.Names = c("Neighborhood", "var1", "var2", "COUNTRY", "DAY", "categ 1", "categ 2"),
class = "data.frame", row.names = c(NA, -5L))
EDIT: #akrun
when I try your command, the result is:
aggregate(.~Neighborhood+DAY+COUNTRY, data= df1[!grepl("^categ", names(df1))], mean)
Neighborhood, DAY, COUNTRY, var1, var2
1 1 0 AL 700 724
2 1 0 FR 500 200
3 2 0 GE 239 222
4 1 1 IT 746 664
But (in this example) what I would like to have is:
Neighborhood, DAY, var1, var2
1 1 0 1200 924 //wher var1=700+500....
2 1 1 1492 1328
3 2 0 239 222
If we are not interested in the 'categ' columns, we can grep them out and use aggregate
aggregate(.~Neighborhood+DAY, data= df1[!grepl("^(categ|COUNTRY)", names(df1))], sum)
# Neighborhood DAY var1 var2
#1 1 0 1200 924
#2 2 0 239 222
#3 1 1 1492 1328
Or using dplyr
library(dplyr)
df1 %>%
group_by(Neighborhood, DAY) %>%
summarise_each(funs(sum), matches("^var"))
# Neighborhood DAY var1 var2
# (int) (int) (int) (int)
#1 1 0 1200 924
#2 1 1 1492 1328
#3 2 0 239 222

workflow for creating timevarying covariates in r

I have a huge data file in long format-parts of it supplied below. Each ID can have several rows, where status is the final status. However I need to do the analysis with time varying covariates and hence need to create two new time variables and update the status variable. I´ve been struggling with this for some time now and I cannot figure out how to do this efficiently as there can be as many as four rows per ID. The time varying variable is NUM.AFTER.DIAG. If NUM.AFTER.DIAG==0 then it is easy, where time1=0 and time2=STATUSDATE. However when NUM.AFTER.DIAG==1 then I need to make a new row where time1=0, time2=DOB-DATE.DIAG and NUM.AFTER.DIAG=0 and also make sure STATUS="B". The second row would then be time1=time2 from the previous row and time2=STATUSDATE-DATE.DIAG-time1 from this row. Equally if there are more rows then the different rows needs to be subtracted from each other. Also if NUM.AFTER.DIAG==0 but there are multiple rows then all extra rows can be deleted.
Any ideas for an efficient solution to this?
I´ve looked at john Fox unfold command, but it assumes that all the intervals are in wide format to begin with.
Edit: The table as requested. As for the censor variable: "D"=event (death)
structure(list(ID = c(187L, 258L, 265L, 278L, 281L, 281L, 283L,
283L, 284L, 291L, 292L, 292L, 297L, 299L, 305L, 305L, 311L, 311L,
319L, 319L, 319L, 322L, 322L, 329L, 329L, 333L, 333L, 333L, 334L,
334L), STATUS = c("D", "B", "B", "B", "B", "B", "D", "D", "B",
"B", "B", "B", "D", "D", "D", "D", "B", "B", "B", "B", "B", "D",
"D", "B", "B", "D", "D", "D", "D", "D"), STATUSDATE = structure(c(11153,
15034, 15034, 15034, 15034, 15034, 5005, 5005, 15034, 15034,
15034, 15034, 6374, 5005, 7562, 7562, 15034, 15034, 15034, 15034,
15034, 7743, 7743, 15034, 15034, 4670, 4670, 4670, 5218, 5218
), class = "Date"), DATE.DIAG = structure(c(4578, 4609, 4578,
4487, 4670, 4670, 4517, 4517, 4640, 4213, 4397, 4397, 4397, 4487,
4213, 4213, 4731, 4731, 4701, 4701, 4701, 4397, 4397, 4578, 4578,
4275, 4275, 4275, 4456, 4456), class = "Date"), DOB = structure(c(NA,
13010, NA, NA, -1082, -626, 73, 1353, 13679, NA, 1626, 3087,
-626, -200, 2814, 3757, 1930, 3787, 6740, 13528, 14167, 5462,
6557, 7865, 9235, -901, -504, -108, -535, -78), class = "Date"),
NUM.AFTER.DIAG = c(0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 2, 3, 1, 2, 1, 2, 0, 0, 0, 0, 0)), .Names = c("ID",
"STATUS", "STATUSDATE", "DATE.DIAG", "DOB", "NUM.AFTER.DIAG"), row.names = c(NA,
30L), class = "data.frame")
EDIT: I did come up with a solution, although probably not very efficient.
u1<-ddply(p,.(ID),function(x) {
if (x$NUM.AFTER.DIAG==0){
x$time1<-0
x$time2<-x$STATUSDATE-x$DATE.DIAG
x<-x[1,]
}
else {
x<-rbind(x,x[1,])
x<-x[order(x$DOB),]
u<-max(x$NUM.AFTER.DIAG)
x$NUM.AFTER.DIAG<-0:u
x$time1[1]<-0
x$time2[1:(u)]<-x$DOB[2:(u+1)]-x$DATE.DIAG[2:(u+1)]
x$time2[u+1]<-x$STATUSDATE[u]-x$DATE.DIAG[u]
x$time1[2:(u+1)]<-x$time2[1:u]
x$STATUS[1:u]<-"B"
}
x
}
)
Ok, I've tried something, but I'm not sure I understand your transformation process entirely, so let me know if there are some mistakes. In general ddply will be slow (even when .parallel = TRUE), when there are many individuals, mainly because at the end it has to bring all the data sets of all individuals together and rbind (or rbind.fill) them, which takes forever for a multitude of data.frame objects.
So here's a suggestion, where dat.orig is your toy data set:
I would first split the task in two:
1) NUM.AFTER.DIAG == 0
2) NUM.AFTER.DIAG == 1
1) It seems that if NUM.AFTER.DIAG == 0, except of calculating time2 and extract first row if an ID occurs more than once (like ID 333), there is not much to do in part 1):
## erase multiple occurences
dat <- dat.orig[!(duplicated(dat.orig$ID) & dat.orig$NUM.AFTER.DIAG == 0), ]
dat0 <- dat[dat$NUM.AFTER.DIAG == 0, ]
dat0$time1 <- 0
dat0$time2 <- difftime(dat0$STATUSDATE, dat0$DATE.DIAG, unit = "days")
time.na <- is.na(dat0$DOB)
dat0$time1[time.na] <- dat0$time2[time.na] <- NA
> dat0
ID STATUS STATUSDATE DATE.DIAG DOB NUM.AFTER.DIAG time1 time2
1 187 D 2000-07-15 1982-07-15 <NA> 0 NA NA days
3 265 B 2011-03-01 1982-07-15 <NA> 0 NA NA days
4 278 B 2011-03-01 1982-04-15 <NA> 0 NA NA days
5 281 B 2011-03-01 1982-10-15 1967-01-15 0 0 10364 days
7 283 D 1983-09-15 1982-05-15 1970-03-15 0 0 488 days
10 291 B 2011-03-01 1981-07-15 <NA> 0 NA NA days
11 292 B 2011-03-01 1982-01-15 1974-06-15 0 0 10637 days
13 297 D 1987-06-15 1982-01-15 1968-04-15 0 0 1977 days
14 299 D 1983-09-15 1982-04-15 1969-06-15 0 0 518 days
15 305 D 1990-09-15 1981-07-15 1977-09-15 0 0 3349 days
17 311 B 2011-03-01 1982-12-15 1975-04-15 0 0 10303 days
26 333 D 1982-10-15 1981-09-15 1967-07-15 0 0 395 days
29 334 D 1984-04-15 1982-03-15 1968-07-15 0 0 762 days
2) is a little trickier, but all you actually have to do is insert one more row and calculate the time variables:
## create subset with relevant observations
dat.unfold <- dat[dat$NUM.AFTER.DIAG != 0, ]
## compute time differences
time1 <- difftime(dat.unfold$DOB, dat.unfold$DATE.DIAG, unit = "days")
time1[time1 < 0] <- 0
time2 <- difftime(dat.unfold$STATUSDATE, dat.unfold$DATE.DIAG, unit = "days")
## calculate indices for individuals
n.obs <- daply(dat.unfold, .(ID), function(z) max(z$NUM.AFTER.DIAG) + 1)
df.new <- data.frame(ID = rep(unique(dat.unfold$ID), times = n.obs))
rle.new <- rle(df.new$ID)
ind.last <- cumsum(rle.new$lengths)
ind.first <- !duplicated(df.new$ID)
ind.first.w <- which(ind.first)
ind.second <- ind.first.w + 1
ind2.to.last <- unlist(sapply(seq_along(ind.second),
function(z) ind.second[z]:ind.last[z]))
## insert time variables
df.new$time2 <- df.new$time1 <- NA
df.new$time1[ind.first] <- 0
df.new$time1[!ind.first] <- time1
df.new$time2[!ind.first] <- time2
df.new$time2[ind2.to.last - 1] <- time1
this gives me:
> df.new
ID time1 time2
1 258 0 8401
2 258 8401 10425
3 284 0 9039
4 284 9039 10394
5 319 0 2039
6 319 2039 8827
7 319 8827 9466
8 319 9466 10333
9 322 0 1065
10 322 1065 2160
11 322 2160 3346
12 329 0 3287
13 329 3287 4657
14 329 4657 10456
This should work for the STATUS variable and the other variables in similar fashion.
When both steps are working separately, you just have to do one rbind step at the end.

Resources