Related
Closed. This question needs details or clarity. It is not currently accepting answers.
Want to improve this question? Add details and clarify the problem by editing this post.
Closed 5 years ago.
Improve this question
I am referencing another post that appears to provide the exact solution I'm looking for:
Creating new column based on earliest date value in other column in R
Here is my sample data:
structure(list(ID = structure(c(1L, 1L, 1L, 2L, 2L, 2L), .Label = c("a1", "b1"), class = "factor"), Begin = structure(list(sec = c(0, 0, 0, 0, 0, 0), min = c(0L, 0L, 0L, 0L, 0L, 0L), hour = c(0L, 0L, 0L, 0L, 0L, 0L), mday = c(28L, 4L, 10L, 10L, 12L, 13L), mon = c(11L, 11L, 11L, 11L, 11L, 11L), year = c(115L, 115L,115L, 115L, 115L, 115L), wday = c(1L, 5L, 4L, 4L, 6L, 0L), yday = c(361L, 337L, 343L, 343L, 345L, 346L), isdst = c(0L, 0L, 0L, 0L, 0L, 0L), zone = c("PST", "PST", "PST", "PST", "PST", "PST"), gmtoff = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_)), .Names = c("sec", "min", "hour", "mday", "mon", "year", "wday", "yday", "isdst", "zone", "gmtoff"), class = c("POSIXlt", "POSIXt"))), .Names = c("ID", "Begin"), row.names = c(NA, -6L), class = "data.frame")
Here is what I am looking for:
structure(list(ID = structure(c(1L, 1L, 1L, 2L, 2L, 2L), .Label = c("a1", "b1"), class = "factor"), Begin = structure(list(sec = c(0, 0, 0, 0, 0, 0), min = c(0L, 0L, 0L, 0L, 0L, 0L), hour = c(0L, 0L, 0L, 0L, 0L, 0L), mday = c(28L, 4L, 10L, 10L, 12L, 13L), mon = c(11L, 11L, 11L, 11L, 11L, 11L), year = c(115L, 115L, 115L, 115L, 115L, 115L), wday = c(1L, 5L, 4L, 4L, 6L, 0L), yday = c(361L, 337L, 343L, 343L, 345L, 346L), isdst = c(0L, 0L, 0L, 0L, 0L, 0L), zone = c("PST", "PST", "PST", "PST", "PST", "PST"), gmtoff = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_)), .Names = c("sec", "min", "hour", "mday", "mon", "year", "wday", "yday", "isdst", "zone", "gmtoff"), class = c("POSIXlt", "POSIXt")), BeginE = structure(list(
sec = c(0, 0, 0, 0, 0, 0), min = c(0L, 0L, 0L, 0L, 0L, 0L
), hour = c(0L, 0L, 0L, 0L, 0L, 0L), mday = c(4L, 4L, 4L,
10L, 10L, 10L), mon = c(11L, 11L, 11L, 11L, 11L, 11L), year = c(115L,
115L, 115L, 115L, 115L, 115L), wday = c(5L, 5L, 5L, 4L, 4L,
4L), yday = c(337L, 337L, 337L, 343L, 343L, 343L), isdst = c(0L,
0L, 0L, 0L, 0L, 0L), zone = c("PST", "PST", "PST", "PST",
"PST", "PST"), gmtoff = c(NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_)), .Names = c("sec", "min", "hour", "mday", "mon", "year", "wday", "yday", "isdst", "zone", "gmtoff"), class = c("POSIXlt", "POSIXt"))), .Names = c("ID", "Begin", "BeginE"), row.names = c(NA, -6L), class = "data.frame")
In response to good comment about providing all code, I attempted the following:
df2 <- as.data.frame(data.table(df)[, BeginE:= min(Begin), by = ID])
This was the error:
`Error in as.POSIXct.POSIXlt(X[[i]], ...) : invalid 'x' argument`
I fixed the issue with a simple conversion:
df$Begin<-as.POSIXct(df$Begin)
Works on my huge dataset as well.
I have a data frame that looks like this:
date time id datetime
1 2015-01-02 14:27:22.130 999000000007628 2015-01-02 14:27:22
2 2015-01-02 14:41:27.720 989001002807730 2015-01-02 14:41:27
3 2015-01-02 14:41:27.940 989001002807730 2015-01-02 14:41:27
4 2015-01-02 14:41:28.140 989001002807730 2015-01-02 14:41:28
5 2015-01-02 14:41:28.170 989001002807730 2015-01-02 14:41:28
6 2015-01-02 14:41:28.350 989001002807730 2015-01-02 14:41:28
I need to find the number of unique "id"s for each "date" in that data frame.
I tried this:
sums<-data.frame(date=unique(data$date), numIDs=0)
for(i in unique(data$date)){
sums[sums$date==i,]$numIDs<-length(unique(data[data$date==i,]$id))
}
and I got the following error:
Error in `$<-.data.frame`(`*tmp*`, "numIDs", value = 0L) :
replacement has 1 row, data has 0
In addition: Warning message:
In `==.default`(data$date, i) :
longer object length is not a multiple of shorter object length
Any ideas?? Thank you!
Hopefully this helps!
data <- structure(list(date = structure(list(sec = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0), min = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
hour = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), mday = c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), mon = c(0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L), year = c(115L, 115L, 115L, 115L,
115L, 115L, 115L, 115L, 115L, 115L), wday = c(5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L), yday = c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), isdst = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), zone = c("PST", "PST", "PST", "PST", "PST",
"PST", "PST", "PST", "PST", "PST"), gmtoff = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_)), .Names = c("sec",
"min", "hour", "mday", "mon", "year", "wday", "yday", "isdst",
"zone", "gmtoff"), class = c("POSIXlt", "POSIXt")), time = c("14:27:22.130",
"14:41:27.720", "14:41:27.940", "14:41:28.140", "14:41:28.170",
"14:41:28.350", "14:41:28.390", "14:41:28.520", "14:41:28.630",
"14:41:28.740"), id = c("999000000007628", "989001002807730",
"989001002807730", "989001002807730", "989001002807730", "989001002807730",
"989001002807730", "989001002807730", "989001002807730", "989001002807730"
), datetime = structure(list(sec = c(22.13, 27.72, 27.94, 28.14,
28.17, 28.35, 28.39, 28.52, 28.63, 28.74), min = c(27L, 41L,
41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L), hour = c(14L, 14L, 14L,
14L, 14L, 14L, 14L, 14L, 14L, 14L), mday = c(2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L), mon = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), year = c(115L, 115L, 115L, 115L, 115L, 115L, 115L,
115L, 115L, 115L), wday = c(5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L), yday = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), isdst = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), zone = c("PST", "PST", "PST",
"PST", "PST", "PST", "PST", "PST", "PST", "PST"), gmtoff = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_)), .Names = c("sec",
"min", "hour", "mday", "mon", "year", "wday", "yday", "isdst",
"zone", "gmtoff"), class = c("POSIXlt", "POSIXt")), site = c("Chivato",
"Chivato", "Chivato", "Chivato", "Chivato", "Chivato", "Chivato",
"Chivato", "Chivato", "Chivato")), .Names = c("date", "time",
"id", "datetime", "site"), row.names = c(NA, 10L), class = "data.frame")
You can use the uniqueN function from data.table:
library(data.table)
setDT(df)[, uniqueN(id), by = date]
or (as per the comment of @Richard Scriven):
aggregate(id ~ date, df, function(x) length(unique(x)))
Or we could use n_distinct from library(dplyr)
library(dplyr)
df %>%
group_by(date) %>%
summarise(id=n_distinct(id))
This answer is in response to this post: group by and then count unique observations which was marked as duplicate as I was writing this draft. This is not in response to the question for the duplicate basis here: How to find number of unique ids corresponding to each date in a data drame which asks about finding unique ID's. I'm not sure the second post actually answers the OP's question which is,
"I want to create a table with the number of unique id for each
combination of group1 and group2."
The keyword here is 'combination'. The interpretation is each id has a particular value for group1 and a particular value for group2 so that the set of data of interest is the particular set of values c(id, group1, group2).
Here is the data.frame the OP provided:
# Simulated data: 10,000 rows with a random letter id and two grouping columns.
# No seed is set, so the exact counts differ from run to run.
# `TRUE` is spelled out because `T` is an ordinary variable that can be reassigned.
df1 <- data.frame(
  id = sample(letters, 10000, replace = TRUE),
  group1 = sample(1:2, 10000, replace = TRUE),
  group2 = sample(100:101, 10000, replace = TRUE)
)
Using data.table inspired by this post -- https://stackoverflow.com/a/13017723/5220858:
>library(data.table)
>DT <- data.table(df1)
>DT[, .N, by = .(group1, group2)]
group1 group2 N
1: 1 100 2493
2: 1 101 2455
3: 2 100 2559
4: 2 101 2493
N is the count for the id that has a particular group1 value and a particular group2 value. Expanding to include the id also returns a table of 104 unique id, group1, group2 combinations.
>DT[, .N, by = .(id, group1, group2)]
id group1 group2 N
1: t 1 100 107
2: g 1 101 85
3: l 1 101 98
4: a 1 100 83
5: j 1 101 98
---
100: p 1 101 96
101: r 2 101 91
102: y 1 101 104
103: g 1 100 83
104: r 2 100 77
I would like to calculate the pairwise average and median number of days between multiple date variables.
My raw data df might look as following:
id invitation account_date first_order second_order third_order
1 1/1/2016 1/7/2016 1/20/2016 1/22/2016 NA
2 1/1/2016 1/8/2016 1/22/2016 1/23/2016 1/25/2016
3 1/1/2016 1/5/2016 1/20/2016 2/1/2016 NA
4 1/1/2016 1/2/2016 1/18/2016 2/4/2016 2/6/2016
Given that my data are already properly formatted as dates, it's pretty easy to manually calculate the average and median difference for the combinations of dates by first calculating the pairwise differences, e.g.:
id inv_to_act act_to_first act_to_sec act_to_third
1 6 13 2 NA
2 7 14 1 2
3 4 15 12 NA
4 1 16 17 2
And then using base R: mean(df$act_to_first,na.rm=T).
But, I'd like to compute these calculations on several datasets or subsets of the same dataset, so it's not scalable to do each step over and over again. Plus, I'm pretty sure there must be a melt or plyr solution that I haven't figured out.
You could compute the date differences between each pair of dates by looping through the pairs and using difftime:
# Every unordered pair of column names, skipping the first column of `df`.
combos <- combn(names(df)[-1], 2)
# For each pair, the difference in days: second column of the pair minus the
# first. apply() binds the per-pair vectors into one numeric matrix with a
# column per pair.
diffs <- apply(combos, 2, function(pair) {
  from <- pair[1]
  to <- pair[2]
  difftime(df[, to], df[, from], units = "days")
})
# Label each column "<first>_TO_<second>".
colnames(diffs) <- paste0(combos[1, ], "_TO_", combos[2, ])
diffs
# invitation_TO_account_date invitation_TO_first_order invitation_TO_second_order invitation_TO_third_order account_date_TO_first_order
# [1,] 6 19 21 NA 13
# [2,] 7 21 22 24 14
# [3,] 4 19 31 NA 15
# [4,] 1 17 34 36 16
# account_date_TO_second_order account_date_TO_third_order first_order_TO_second_order first_order_TO_third_order second_order_TO_third_order
# [1,] 15 NA 2 NA NA
# [2,] 15 17 1 3 2
# [3,] 27 NA 12 NA NA
# [4,] 33 35 17 19 2
After you do that step, you should be able to easily compute the average of each column:
colMeans(diffs, na.rm=TRUE)
# invitation_TO_account_date invitation_TO_first_order invitation_TO_second_order invitation_TO_third_order account_date_TO_first_order
# 4.5 19.0 27.0 30.0 14.5
# account_date_TO_second_order account_date_TO_third_order first_order_TO_second_order first_order_TO_third_order second_order_TO_third_order
# 22.5 26.0 8.0 11.0 2.0
Once you have these steps, you can put them together in a function and apply that function to any input df:
#' Mean number of days between every pair of date columns
#'
#' Skips the first column of `df` and treats every remaining column as a
#' date/time column. For each unordered pair of those columns (in the order
#' produced by `combn()`), computes the per-row difference in days, second
#' column of the pair minus the first, and averages it over rows, ignoring
#' missing values.
#'
#' @param df A data frame whose first column is skipped and whose remaining
#'   columns (at least two) can be passed to `difftime()`.
#' @return A named numeric vector with one element per column pair, named
#'   `"<first>_TO_<second>"`. A pair with no non-missing differences yields
#'   `NaN`.
meanDateRanges <- function(df) {
  combos <- combn(tail(names(df), -1), 2)
  # Fill an explicit matrix rather than relying on apply(): apply() collapses
  # its result to a plain vector when `df` has a single row, which made the
  # colnames<- assignment below fail.
  diffs <- matrix(NA_real_, nrow = nrow(df), ncol = ncol(combos))
  for (j in seq_len(ncol(combos))) {
    # `[[` always extracts the column itself, whatever data-frame flavour
    # `df` is.
    later <- df[[combos[2, j]]]
    earlier <- df[[combos[1, j]]]
    diffs[, j] <- as.numeric(difftime(later, earlier, units = "days"))
  }
  colnames(diffs) <- paste0(combos[1, ], "_TO_", combos[2, ])
  colMeans(diffs, na.rm = TRUE)
}
You could run this function on an input data frame with meanDateRanges(df) or on a list of them with lapply(df.list, meanDateRanges).
Data:
df <- structure(list(id = 1:4, invitation = structure(list(sec = c(0,
0, 0, 0), min = c(0L, 0L, 0L, 0L), hour = c(0L, 0L, 0L, 0L),
mday = c(1L, 1L, 1L, 1L), mon = c(0L, 0L, 0L, 0L), year = c(116L,
116L, 116L, 116L), wday = c(5L, 5L, 5L, 5L), yday = c(0L,
0L, 0L, 0L), isdst = c(0L, 0L, 0L, 0L), zone = c("EST", "EST",
"EST", "EST"), gmtoff = c(NA_integer_, NA_integer_, NA_integer_,
NA_integer_)), .Names = c("sec", "min", "hour", "mday", "mon",
"year", "wday", "yday", "isdst", "zone", "gmtoff"), class = c("POSIXlt",
"POSIXt")), account_date = structure(list(sec = c(0, 0, 0, 0),
min = c(0L, 0L, 0L, 0L), hour = c(0L, 0L, 0L, 0L), mday = c(7L,
8L, 5L, 2L), mon = c(0L, 0L, 0L, 0L), year = c(116L, 116L,
116L, 116L), wday = c(4L, 5L, 2L, 6L), yday = c(6L, 7L, 4L,
1L), isdst = c(0L, 0L, 0L, 0L), zone = c("EST", "EST", "EST",
"EST"), gmtoff = c(NA_integer_, NA_integer_, NA_integer_,
NA_integer_)), .Names = c("sec", "min", "hour", "mday", "mon",
"year", "wday", "yday", "isdst", "zone", "gmtoff"), class = c("POSIXlt",
"POSIXt")), first_order = structure(list(sec = c(0, 0, 0, 0),
min = c(0L, 0L, 0L, 0L), hour = c(0L, 0L, 0L, 0L), mday = c(20L,
22L, 20L, 18L), mon = c(0L, 0L, 0L, 0L), year = c(116L, 116L,
116L, 116L), wday = c(3L, 5L, 3L, 1L), yday = c(19L, 21L,
19L, 17L), isdst = c(0L, 0L, 0L, 0L), zone = c("EST", "EST",
"EST", "EST"), gmtoff = c(NA_integer_, NA_integer_, NA_integer_,
NA_integer_)), .Names = c("sec", "min", "hour", "mday", "mon",
"year", "wday", "yday", "isdst", "zone", "gmtoff"), class = c("POSIXlt",
"POSIXt")), second_order = structure(list(sec = c(0, 0, 0, 0),
min = c(0L, 0L, 0L, 0L), hour = c(0L, 0L, 0L, 0L), mday = c(22L,
23L, 1L, 4L), mon = c(0L, 0L, 1L, 1L), year = c(116L, 116L,
116L, 116L), wday = c(5L, 6L, 1L, 4L), yday = c(21L, 22L,
31L, 34L), isdst = c(0L, 0L, 0L, 0L), zone = c("EST", "EST",
"EST", "EST"), gmtoff = c(NA_integer_, NA_integer_, NA_integer_,
NA_integer_)), .Names = c("sec", "min", "hour", "mday", "mon",
"year", "wday", "yday", "isdst", "zone", "gmtoff"), class = c("POSIXlt",
"POSIXt")), third_order = structure(list(sec = c(NA, 0, NA, 0
), min = c(NA, 0L, NA, 0L), hour = c(NA, 0L, NA, 0L), mday = c(NA,
25L, NA, 6L), mon = c(NA, 0L, NA, 1L), year = c(NA, 116L, NA,
116L), wday = c(NA, 1L, NA, 6L), yday = c(NA, 24L, NA, 36L),
isdst = c(-1L, 0L, -1L, 0L), zone = c("", "EST", "", "EST"
), gmtoff = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
)), .Names = c("sec", "min", "hour", "mday", "mon", "year",
"wday", "yday", "isdst", "zone", "gmtoff"), class = c("POSIXlt",
"POSIXt"))), .Names = c("id", "invitation", "account_date", "first_order",
"second_order", "third_order"), row.names = c(NA, -4L), class = "data.frame")
I have a data frame that looks like this:
date time id datetime
1 2015-01-02 14:27:22.130 999000000007628 2015-01-02 14:27:22
2 2015-01-02 14:41:27.720 989001002807730 2015-01-02 14:41:27
3 2015-01-02 14:41:27.940 989001002807730 2015-01-02 14:41:27
4 2015-01-02 14:41:28.140 989001002807730 2015-01-02 14:41:28
5 2015-01-02 14:41:28.170 989001002807730 2015-01-02 14:41:28
6 2015-01-02 14:41:28.350 989001002807730 2015-01-02 14:41:28
I need to find the number of unique "id"s for each "date" in that data frame.
I tried this:
sums<-data.frame(date=unique(data$date), numIDs=0)
for(i in unique(data$date)){
sums[sums$date==i,]$numIDs<-length(unique(data[data$date==i,]$id))
}
and I got the following error:
Error in `$<-.data.frame`(`*tmp*`, "numIDs", value = 0L) :
replacement has 1 row, data has 0
In addition: Warning message:
In `==.default`(data$date, i) :
longer object length is not a multiple of shorter object length
Any ideas?? Thank you!
Hopefully this helps!
data <- structure(list(date = structure(list(sec = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0), min = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
hour = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), mday = c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), mon = c(0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L), year = c(115L, 115L, 115L, 115L,
115L, 115L, 115L, 115L, 115L, 115L), wday = c(5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L), yday = c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), isdst = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), zone = c("PST", "PST", "PST", "PST", "PST",
"PST", "PST", "PST", "PST", "PST"), gmtoff = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_)), .Names = c("sec",
"min", "hour", "mday", "mon", "year", "wday", "yday", "isdst",
"zone", "gmtoff"), class = c("POSIXlt", "POSIXt")), time = c("14:27:22.130",
"14:41:27.720", "14:41:27.940", "14:41:28.140", "14:41:28.170",
"14:41:28.350", "14:41:28.390", "14:41:28.520", "14:41:28.630",
"14:41:28.740"), id = c("999000000007628", "989001002807730",
"989001002807730", "989001002807730", "989001002807730", "989001002807730",
"989001002807730", "989001002807730", "989001002807730", "989001002807730"
), datetime = structure(list(sec = c(22.13, 27.72, 27.94, 28.14,
28.17, 28.35, 28.39, 28.52, 28.63, 28.74), min = c(27L, 41L,
41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L), hour = c(14L, 14L, 14L,
14L, 14L, 14L, 14L, 14L, 14L, 14L), mday = c(2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L), mon = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), year = c(115L, 115L, 115L, 115L, 115L, 115L, 115L,
115L, 115L, 115L), wday = c(5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L), yday = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), isdst = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), zone = c("PST", "PST", "PST",
"PST", "PST", "PST", "PST", "PST", "PST", "PST"), gmtoff = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_)), .Names = c("sec",
"min", "hour", "mday", "mon", "year", "wday", "yday", "isdst",
"zone", "gmtoff"), class = c("POSIXlt", "POSIXt")), site = c("Chivato",
"Chivato", "Chivato", "Chivato", "Chivato", "Chivato", "Chivato",
"Chivato", "Chivato", "Chivato")), .Names = c("date", "time",
"id", "datetime", "site"), row.names = c(NA, 10L), class = "data.frame")
You can use the uniqueN function from data.table:
library(data.table)
setDT(df)[, uniqueN(id), by = date]
or (as per the comment of @Richard Scriven):
aggregate(id ~ date, df, function(x) length(unique(x)))
Or we could use n_distinct from library(dplyr)
library(dplyr)
df %>%
group_by(date) %>%
summarise(id=n_distinct(id))
This answer is in response to this post: group by and then count unique observations which was marked as duplicate as I was writing this draft. This is not in response to the question for the duplicate basis here: How to find number of unique ids corresponding to each date in a data drame which asks about finding unique ID's. I'm not sure the second post actually answers the OP's question which is,
"I want to create a table with the number of unique id for each
combination of group1 and group2."
The keyword here is 'combination'. The interpretation is each id has a particular value for group1 and a particular value for group2 so that the set of data of interest is the particular set of values c(id, group1, group2).
Here is the data.frame the OP provided:
# Simulated data: 10,000 rows with a random letter id and two grouping columns.
# No seed is set, so the exact counts differ from run to run.
# `TRUE` is spelled out because `T` is an ordinary variable that can be reassigned.
df1 <- data.frame(
  id = sample(letters, 10000, replace = TRUE),
  group1 = sample(1:2, 10000, replace = TRUE),
  group2 = sample(100:101, 10000, replace = TRUE)
)
Using data.table inspired by this post -- https://stackoverflow.com/a/13017723/5220858:
>library(data.table)
>DT <- data.table(df1)
>DT[, .N, by = .(group1, group2)]
group1 group2 N
1: 1 100 2493
2: 1 101 2455
3: 2 100 2559
4: 2 101 2493
N is the count for the id that has a particular group1 value and a particular group2 value. Expanding to include the id also returns a table of 104 unique id, group1, group2 combinations.
>DT[, .N, by = .(id, group1, group2)]
id group1 group2 N
1: t 1 100 107
2: g 1 101 85
3: l 1 101 98
4: a 1 100 83
5: j 1 101 98
---
100: p 1 101 96
101: r 2 101 91
102: y 1 101 104
103: g 1 100 83
104: r 2 100 77
I am trying to calculate decimal ages based on dates of birth and a recent date. I have the following test dataframe.
df.1 <- structure(list(dob = structure(list(sec = c(0, 0, 0, NA, 0, 0
), min = c(0L, 0L, 0L, NA, 0L, 0L), hour = c(0L, 0L, 0L, NA,
0L, 0L), mday = c(18L, 24L, 25L, NA, 31L, 15L), mon = c(11L,
5L, 11L, NA, 11L, 11L), year = c(100L, 101L, 102L, NA, 99L, 101L
), wday = c(1L, 0L, 3L, NA, 5L, 6L), yday = c(352L, 174L, 358L,
NA, 364L, 348L), isdst = c(0L, 1L, 0L, -1L, 0L, 0L), zone = c("GMT",
"BST", "GMT", "", "GMT", "GMT"), gmtoff = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_)), .Names = c("sec",
"min", "hour", "mday", "mon", "year", "wday", "yday", "isdst",
"zone", "gmtoff"), class = c("POSIXlt", "POSIXt")), recent.date = structure(list(
sec = c(0, 0, 0, 0, 0, 0), min = c(0L, 0L, 0L, 0L, 0L, 0L
), hour = c(0L, 0L, 0L, 0L, 0L, 0L), mday = c(3L, 2L, 4L,
3L, 1L, 2L), mon = c(5L, 5L, 5L, 5L, 5L, 5L), year = c(114L,
114L, 114L, 114L, 114L, 114L), wday = c(2L, 1L, 3L, 2L, 0L,
1L), yday = c(153L, 152L, 154L, 153L, 151L, 152L), isdst = c(1L,
1L, 1L, 1L, 1L, 1L), zone = c("BST", "BST", "BST", "BST",
"BST", "BST"), gmtoff = c(NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_)), .Names = c("sec",
"min", "hour", "mday", "mon", "year", "wday", "yday", "isdst",
"zone", "gmtoff"), class = c("POSIXlt", "POSIXt"))), .Names = c("dob",
"recent.date"), row.names = c(NA, -6L), class = "data.frame")
Using the decimal_date function from the lubridate package, I am trying to convert the dates to decimal dates:
library(lubridate)
df.1$decimal.dob <- decimal_date(df.1$dob)
df.1$decimal.recent.date <- decimal_date(df.1$recent.date)
However, because some dates of birth are missing, I get an error when I try to convert the dates of birth to decimal dates. I therefore can't subtract the 'decimal.dob' column from the 'decimal.recent.date' column to give me a decimal age.
If anyone could help me getting this to work, I would very much appreciate it!
So search for and remove the NA entries. I'm assuming that if there's an NA in, say, dob$sec[j], then dob$min[j] is also NA and so on.
# Flag the rows whose date of birth is missing. The column lives inside
# `df.1`; a bare `dob` object is not defined anywhere in the example.
killit <- is.na(df.1$dob$sec)
# Keep the rows that are not flagged. A logical mask is used instead of
# negative indices because x[-which(cond)] returns an empty result when
# nothing matches.
foo <- decimal_date(df.1$dob[!killit])
#[1] 2000.962 2001.477 2002.981 1999.997 2001.953
You'll have to shrink your df.1$decimal.dob to have the same length as the reduced input set.