group by and then count unique observations [duplicate] - r

I have a data frame that looks like this:
date time id datetime
1 2015-01-02 14:27:22.130 999000000007628 2015-01-02 14:27:22
2 2015-01-02 14:41:27.720 989001002807730 2015-01-02 14:41:27
3 2015-01-02 14:41:27.940 989001002807730 2015-01-02 14:41:27
4 2015-01-02 14:41:28.140 989001002807730 2015-01-02 14:41:28
5 2015-01-02 14:41:28.170 989001002807730 2015-01-02 14:41:28
6 2015-01-02 14:41:28.350 989001002807730 2015-01-02 14:41:28
I need to find the number of unique "id"s for each "date" in that data frame.
I tried this:
sums<-data.frame(date=unique(data$date), numIDs=0)
for(i in unique(data$date)){
sums[sums$date==i,]$numIDs<-length(unique(data[data$date==i,]$id))
}
and I got the following error:
Error in `$<-.data.frame`(`*tmp*`, "numIDs", value = 0L) :
replacement has 1 row, data has 0
In addition: Warning message:
In `==.default`(data$date, i) :
longer object length is not a multiple of shorter object length
Any ideas?? Thank you!
Hopefully this helps!
data <- structure(list(date = structure(list(sec = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0), min = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
hour = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), mday = c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), mon = c(0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L), year = c(115L, 115L, 115L, 115L,
115L, 115L, 115L, 115L, 115L, 115L), wday = c(5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L), yday = c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), isdst = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), zone = c("PST", "PST", "PST", "PST", "PST",
"PST", "PST", "PST", "PST", "PST"), gmtoff = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_)), .Names = c("sec",
"min", "hour", "mday", "mon", "year", "wday", "yday", "isdst",
"zone", "gmtoff"), class = c("POSIXlt", "POSIXt")), time = c("14:27:22.130",
"14:41:27.720", "14:41:27.940", "14:41:28.140", "14:41:28.170",
"14:41:28.350", "14:41:28.390", "14:41:28.520", "14:41:28.630",
"14:41:28.740"), id = c("999000000007628", "989001002807730",
"989001002807730", "989001002807730", "989001002807730", "989001002807730",
"989001002807730", "989001002807730", "989001002807730", "989001002807730"
), datetime = structure(list(sec = c(22.13, 27.72, 27.94, 28.14,
28.17, 28.35, 28.39, 28.52, 28.63, 28.74), min = c(27L, 41L,
41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L), hour = c(14L, 14L, 14L,
14L, 14L, 14L, 14L, 14L, 14L, 14L), mday = c(2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L), mon = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), year = c(115L, 115L, 115L, 115L, 115L, 115L, 115L,
115L, 115L, 115L), wday = c(5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L), yday = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), isdst = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), zone = c("PST", "PST", "PST",
"PST", "PST", "PST", "PST", "PST", "PST", "PST"), gmtoff = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_)), .Names = c("sec",
"min", "hour", "mday", "mon", "year", "wday", "yday", "isdst",
"zone", "gmtoff"), class = c("POSIXlt", "POSIXt")), site = c("Chivato",
"Chivato", "Chivato", "Chivato", "Chivato", "Chivato", "Chivato",
"Chivato", "Chivato", "Chivato")), .Names = c("date", "time",
"id", "datetime", "site"), row.names = c(NA, 10L), class = "data.frame")

You can use the uniqueN function from data.table:
library(data.table)
setDT(df)[, uniqueN(id), by = date]
or (as per the comment of @Richard Scriven):
aggregate(id ~ date, df, function(x) length(unique(x)))

Or we could use n_distinct from library(dplyr)
library(dplyr)
df %>%
group_by(date) %>%
summarise(id=n_distinct(id))

This answer is in response to the post "group by and then count unique observations", which was marked as a duplicate while I was writing this draft. It is not a response to the question it was closed as a duplicate of, "How to find number of unique ids corresponding to each date in a data frame", which asks about finding unique IDs. I'm not sure the second post actually answers the OP's question, which is:
"I want to create a table with the number of unique id for each
combination of group1 and group2."
The keyword here is 'combination'. The interpretation is each id has a particular value for group1 and a particular value for group2 so that the set of data of interest is the particular set of values c(id, group1, group2).
Here is the data.frame the OP provided:
df1 <- data.frame(id=sample(letters, 10000, replace = T),
group1=sample(1:2, 10000, replace = T),
group2=sample(100:101, 10000, replace = T))
Using data.table inspired by this post -- https://stackoverflow.com/a/13017723/5220858:
>library(data.table)
>DT <- data.table(df1)
>DT[, .N, by = .(group1, group2)]
group1 group2 N
1: 1 100 2493
2: 1 101 2455
3: 2 100 2559
4: 2 101 2493
N is the number of rows (id occurrences, not distinct ids) that have a particular group1 value and a particular group2 value. Expanding the grouping to include id returns a table of the 104 unique (id, group1, group2) combinations, with N giving the number of rows in each.
>DT[, .N, by = .(id, group1, group2)]
id group1 group2 N
1: t 1 100 107
2: g 1 101 85
3: l 1 101 98
4: a 1 100 83
5: j 1 101 98
---
100: p 1 101 96
101: r 2 101 91
102: y 1 101 104
103: g 1 100 83
104: r 2 100 77

Related

R Creating vector based on earliest date from another column receive POSIX error [closed]

Closed. This question needs details or clarity. It is not currently accepting answers.
Want to improve this question? Add details and clarify the problem by editing this post.
Closed 5 years ago.
Improve this question
I am referencing another post that appears to provide the exact solution I'm looking for:
Creating new column based on earliest date value in other column in R
Here is my sample data:
structure(list(ID = structure(c(1L, 1L, 1L, 2L, 2L, 2L), .Label = c("a1", "b1"), class = "factor"), Begin = structure(list(sec = c(0, 0, 0, 0, 0, 0), min = c(0L, 0L, 0L, 0L, 0L, 0L), hour = c(0L, 0L, 0L, 0L, 0L, 0L), mday = c(28L, 4L, 10L, 10L, 12L, 13L), mon = c(11L, 11L, 11L, 11L, 11L, 11L), year = c(115L, 115L,115L, 115L, 115L, 115L), wday = c(1L, 5L, 4L, 4L, 6L, 0L), yday = c(361L, 337L, 343L, 343L, 345L, 346L), isdst = c(0L, 0L, 0L, 0L, 0L, 0L), zone = c("PST", "PST", "PST", "PST", "PST", "PST"), gmtoff = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_)), .Names = c("sec", "min", "hour", "mday", "mon", "year", "wday", "yday", "isdst", "zone", "gmtoff"), class = c("POSIXlt", "POSIXt"))), .Names = c("ID", "Begin"), row.names = c(NA, -6L), class = "data.frame")
Here is what I am looking for:
structure(list(ID = structure(c(1L, 1L, 1L, 2L, 2L, 2L), .Label = c("a1", "b1"), class = "factor"), Begin = structure(list(sec = c(0, 0, 0, 0, 0, 0), min = c(0L, 0L, 0L, 0L, 0L, 0L), hour = c(0L, 0L, 0L, 0L, 0L, 0L), mday = c(28L, 4L, 10L, 10L, 12L, 13L), mon = c(11L, 11L, 11L, 11L, 11L, 11L), year = c(115L, 115L, 115L, 115L, 115L, 115L), wday = c(1L, 5L, 4L, 4L, 6L, 0L), yday = c(361L, 337L, 343L, 343L, 345L, 346L), isdst = c(0L, 0L, 0L, 0L, 0L, 0L), zone = c("PST", "PST", "PST", "PST", "PST", "PST"), gmtoff = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_)), .Names = c("sec", "min", "hour", "mday", "mon", "year", "wday", "yday", "isdst", "zone", "gmtoff"), class = c("POSIXlt", "POSIXt")), BeginE = structure(list(
sec = c(0, 0, 0, 0, 0, 0), min = c(0L, 0L, 0L, 0L, 0L, 0L
), hour = c(0L, 0L, 0L, 0L, 0L, 0L), mday = c(4L, 4L, 4L,
10L, 10L, 10L), mon = c(11L, 11L, 11L, 11L, 11L, 11L), year = c(115L,
115L, 115L, 115L, 115L, 115L), wday = c(5L, 5L, 5L, 4L, 4L,
4L), yday = c(337L, 337L, 337L, 343L, 343L, 343L), isdst = c(0L,
0L, 0L, 0L, 0L, 0L), zone = c("PST", "PST", "PST", "PST",
"PST", "PST"), gmtoff = c(NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_)), .Names = c("sec", "min", "hour", "mday", "mon", "year", "wday", "yday", "isdst", "zone", "gmtoff"), class = c("POSIXlt", "POSIXt"))), .Names = c("ID", "Begin", "BeginE"), row.names = c(NA, -6L), class = "data.frame")
In response to good comment about providing all code, I attempted the following:
df2 <- as.data.frame(data.table(df)[, BeginE:= min(Begin), by = ID])
This was the error:
`Error in as.POSIXct.POSIXlt(X[[i]], ...) : invalid 'x' argument`
I fixed the issue with a simple conversion:
df$Begin<-as.POSIXct(df$Begin)
Works on my huge dataset as well.

Months displayed incorrectly when using ggplot2

Hi, I'm having a problem where March appears twice in my graph but not in my data.
My data looks like this (the data frame is called try1):
Month Year tcol
2016-01-01 00:00:00 06 1461.0
2016-02-01 00:00:00 06 259.5
2016-03-01 00:00:00 06 191.2
2016-04-01 01:00:00 06 151.5
2016-05-01 01:00:00 06 119.6
2016-06-01 01:00:00 06 1372.5
2016-07-01 01:00:00 06 954.0
2016-08-01 01:00:00 06 1784.0
2016-09-01 01:00:00 06 1369.0
2016-10-01 01:00:00 06 6077.0
2016-11-01 00:00:00 06 1638.0
2016-12-01 00:00:00 06 3308.0
And my code looks like this:
ggplot(try1, aes(Month,tcol)) +
geom_point(aes(colour = Year),size=2) +
geom_line(aes(colour = Year), size=0.73)+
theme_bw()+
guides(col = guide_legend(ncol = 2))+
scale_x_datetime(
breaks=date_breaks("1 months"),
labels=date_format("%B"))+
xlab("")+ #x axis label
ylab("Total Coliforms")
The problem is that when I plot my graph March appears twice. And October appears to be left out.
The resulting graph
Thanks for your help.
I suspect it is a timezone issue. E.g., with this data
structure(list(Month = structure(list(sec = c(0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0), min = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L), hour = c(0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 0L, 0L), mday = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), mon = 0:11, year = c(116L, 116L, 116L, 116L, 116L, 116L,
116L, 116L, 116L, 116L, 116L, 116L), wday = c(5L, 1L, 2L, 5L,
0L, 3L, 5L, 1L, 4L, 6L, 2L, 4L), yday = c(0L, 31L, 60L, 91L,
121L, 152L, 182L, 213L, 244L, 274L, 305L, 335L), isdst = c(0L,
0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L), zone = c("GMT",
"GMT", "GMT", "BST", "BST", "BST", "BST", "BST", "BST", "BST",
"GMT", "GMT"), gmtoff = c(NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_)), .Names = c("sec",
"min", "hour", "mday", "mon", "year", "wday", "yday", "isdst",
"zone", "gmtoff"), class = c("POSIXlt", "POSIXt"), tzone = c("Europe/London",
"GMT", "BST")), Year = c(6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L), tcol = c(1461, 259.5, 191.2, 151.5, 119.6, 1372.5,
954, 1784, 1369, 6077, 1638, 3308)), .Names = c("Month", "Year",
"tcol"), row.names = c(NA, -12L), class = "data.frame")
I can reproduce your chart. Try changing the timezone
attr(try1$Month, "tzone") <- "UTC"
and replot.
Update. I was wondering why changing the timezone to "UTC" works. It turns out that date_format() takes a tz argument that defaults to "UTC". See ?date_format. This means that instead of changing the timezone of Month to "UTC", you can also fix your problem by changing the tz argument in date_format() to whatever the original timezone of Month is, which you can inspect via attr(try1$Month, "tzone").

How to find number of unique ids corresponding to each date in a data frame

I have a data frame that looks like this:
date time id datetime
1 2015-01-02 14:27:22.130 999000000007628 2015-01-02 14:27:22
2 2015-01-02 14:41:27.720 989001002807730 2015-01-02 14:41:27
3 2015-01-02 14:41:27.940 989001002807730 2015-01-02 14:41:27
4 2015-01-02 14:41:28.140 989001002807730 2015-01-02 14:41:28
5 2015-01-02 14:41:28.170 989001002807730 2015-01-02 14:41:28
6 2015-01-02 14:41:28.350 989001002807730 2015-01-02 14:41:28
I need to find the number of unique "id"s for each "date" in that data frame.
I tried this:
sums<-data.frame(date=unique(data$date), numIDs=0)
for(i in unique(data$date)){
sums[sums$date==i,]$numIDs<-length(unique(data[data$date==i,]$id))
}
and I got the following error:
Error in `$<-.data.frame`(`*tmp*`, "numIDs", value = 0L) :
replacement has 1 row, data has 0
In addition: Warning message:
In `==.default`(data$date, i) :
longer object length is not a multiple of shorter object length
Any ideas?? Thank you!
Hopefully this helps!
data <- structure(list(date = structure(list(sec = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0), min = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
hour = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), mday = c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), mon = c(0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L), year = c(115L, 115L, 115L, 115L,
115L, 115L, 115L, 115L, 115L, 115L), wday = c(5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L), yday = c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), isdst = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), zone = c("PST", "PST", "PST", "PST", "PST",
"PST", "PST", "PST", "PST", "PST"), gmtoff = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_)), .Names = c("sec",
"min", "hour", "mday", "mon", "year", "wday", "yday", "isdst",
"zone", "gmtoff"), class = c("POSIXlt", "POSIXt")), time = c("14:27:22.130",
"14:41:27.720", "14:41:27.940", "14:41:28.140", "14:41:28.170",
"14:41:28.350", "14:41:28.390", "14:41:28.520", "14:41:28.630",
"14:41:28.740"), id = c("999000000007628", "989001002807730",
"989001002807730", "989001002807730", "989001002807730", "989001002807730",
"989001002807730", "989001002807730", "989001002807730", "989001002807730"
), datetime = structure(list(sec = c(22.13, 27.72, 27.94, 28.14,
28.17, 28.35, 28.39, 28.52, 28.63, 28.74), min = c(27L, 41L,
41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L), hour = c(14L, 14L, 14L,
14L, 14L, 14L, 14L, 14L, 14L, 14L), mday = c(2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L), mon = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), year = c(115L, 115L, 115L, 115L, 115L, 115L, 115L,
115L, 115L, 115L), wday = c(5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L), yday = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), isdst = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), zone = c("PST", "PST", "PST",
"PST", "PST", "PST", "PST", "PST", "PST", "PST"), gmtoff = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_)), .Names = c("sec",
"min", "hour", "mday", "mon", "year", "wday", "yday", "isdst",
"zone", "gmtoff"), class = c("POSIXlt", "POSIXt")), site = c("Chivato",
"Chivato", "Chivato", "Chivato", "Chivato", "Chivato", "Chivato",
"Chivato", "Chivato", "Chivato")), .Names = c("date", "time",
"id", "datetime", "site"), row.names = c(NA, 10L), class = "data.frame")
You can use the uniqueN function from data.table:
library(data.table)
setDT(df)[, uniqueN(id), by = date]
or (as per the comment of @Richard Scriven):
aggregate(id ~ date, df, function(x) length(unique(x)))
Or we could use n_distinct from library(dplyr)
library(dplyr)
df %>%
group_by(date) %>%
summarise(id=n_distinct(id))
This answer is in response to the post "group by and then count unique observations", which was marked as a duplicate while I was writing this draft. It is not a response to the question it was closed as a duplicate of, "How to find number of unique ids corresponding to each date in a data frame", which asks about finding unique IDs. I'm not sure the second post actually answers the OP's question, which is:
"I want to create a table with the number of unique id for each
combination of group1 and group2."
The keyword here is 'combination'. The interpretation is each id has a particular value for group1 and a particular value for group2 so that the set of data of interest is the particular set of values c(id, group1, group2).
Here is the data.frame the OP provided:
df1 <- data.frame(id=sample(letters, 10000, replace = T),
group1=sample(1:2, 10000, replace = T),
group2=sample(100:101, 10000, replace = T))
Using data.table inspired by this post -- https://stackoverflow.com/a/13017723/5220858:
>library(data.table)
>DT <- data.table(df1)
>DT[, .N, by = .(group1, group2)]
group1 group2 N
1: 1 100 2493
2: 1 101 2455
3: 2 100 2559
4: 2 101 2493
N is the number of rows (id occurrences, not distinct ids) that have a particular group1 value and a particular group2 value. Expanding the grouping to include id returns a table of the 104 unique (id, group1, group2) combinations, with N giving the number of rows in each.
>DT[, .N, by = .(id, group1, group2)]
id group1 group2 N
1: t 1 100 107
2: g 1 101 85
3: l 1 101 98
4: a 1 100 83
5: j 1 101 98
---
100: p 1 101 96
101: r 2 101 91
102: y 1 101 104
103: g 1 100 83
104: r 2 100 77

R: How to obtain difference in weeks between a “POSIXlt” date and the first occurrence of a "POSIXlt" date from the same vector.

I have a data frame with over a million rows of data (agents and call metrics aggregated by day). Each agent is listed multiple times because they handle calls (d1$Calls) across multiple queues each day. I want to identify the number of weeks an agent has been in the field. I would normally be able to do this using "difftime" to obtain the difference between an agent’s start date (d1$Start) and the interaction date (d1$Interaction) for any given day:
floor(difftime(d1$Interaction,d1$Start,units='weeks'))
However, my system’s start dates are unreliable, often resulting in negative weeks:
dput(d1)
structure(list(ID = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L), .Label = c("a123", "b123"), class = "factor"), QUEUE = structure(c(9L,
8L, 7L, 6L, 5L, 3L, 4L, 1L, 2L, 4L), .Label = c("MHEK", "MMED",
"MMEF", "MMEM", "MNEM", "MSED", "MSEE", "MSEK", "MSEP"), class = "factor"),
Calls = c(1L, 4L, 25L, 14L, 6L, 25L, 5L, 1L, 1L, 3L), Interaction = structure(list(
sec = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), min = c(0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), hour = c(0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L), mday = c(2L, 2L, 6L, 12L,
12L, 2L, 6L, 6L, 6L, 6L), mon = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L), year = c(115L, 115L, 115L, 115L,
115L, 115L, 115L, 115L, 115L, 115L), wday = c(5L, 5L,
2L, 1L, 1L, 5L, 2L, 2L, 2L, 2L), yday = c(1L, 1L, 5L,
11L, 11L, 1L, 5L, 5L, 5L, 5L), isdst = c(0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L), zone = c("PST", "PST", "PST",
"PST", "PST", "PST", "PST", "PST", "PST", "PST"), gmtoff = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_)), .Names = c("sec",
"min", "hour", "mday", "mon", "year", "wday", "yday", "isdst",
"zone", "gmtoff"), class = c("POSIXlt", "POSIXt")), Start = structure(list(
sec = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), min = c(0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), hour = c(0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L), mday = c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L), mon = c(2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L), year = c(115L, 115L, 115L, 115L,
115L, 115L, 115L, 115L, 115L, 115L), wday = c(0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), yday = c(59L, 59L, 59L,
59L, 59L, 59L, 59L, 59L, 59L, 59L), isdst = c(0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), zone = c("PST", "PST",
"PST", "PST", "PST", "PST", "PST", "PST", "PST", "PST"
), gmtoff = c(NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_)), .Names = c("sec", "min",
"hour", "mday", "mon", "year", "wday", "yday", "isdst", "zone",
"gmtoff"), class = c("POSIXlt", "POSIXt")), Weeks = structure(c(-9,
-9, -8, -7, -7, -9, -8, -8, -8, -8), units = "weeks", class = "difftime")), .Names = c("ID",
"QUEUE", "Calls", "Interaction", "Start", "Weeks"), row.names = c(NA,
-10L), class = "data.frame")
To get around this problem, I want to calculate the difference in weeks between any interaction date (d1$Interaction) and the first interaction date in the system for that agent (d1$ID). How is this possible?
This works for me (all in base R):
#split the data frame according to ID
mylist <- split(df, factor(df$ID))
#use do.call to combine lists elements to one data.frame
#instead of do call you can use data.table::rbindlist for speed
mydata <- do.call(rbind,
lapply(mylist, function(x) {
#order each group
x <- x[order(x$Interaction),]
#calculate time differences
#difftime of Interactions vector from the 2nd element to the last, minus
#the Interactions vector of the 1st element to the penultimate
#I use c(0, difftime.... to add a zero to the first difference
#so that I can add it as a column
x$weekdif <- c(0,difftime(x$Interaction[2:length(x$Interaction)],
x$Interaction[1:(length(x$Interaction)-1)],
units='weeks'))
x
}))
Output:
> mydata
ID QUEUE Calls Interaction Start Weeks weekdif
a123.1 a123 MSEP 1 2015-01-02 2015-03-01 -9 weeks 0.0000000
a123.2 a123 MSEK 4 2015-01-02 2015-03-01 -9 weeks 0.0000000
a123.3 a123 MSEE 25 2015-01-06 2015-03-01 -8 weeks 0.5714286
a123.4 a123 MSED 14 2015-01-12 2015-03-01 -7 weeks 0.8571429
a123.5 a123 MNEM 6 2015-01-12 2015-03-01 -7 weeks 0.0000000
b123.6 b123 MMEF 25 2015-01-02 2015-03-01 -9 weeks 0.0000000
b123.7 b123 MMEM 5 2015-01-06 2015-03-01 -8 weeks 0.5714286
b123.8 b123 MHEK 1 2015-01-06 2015-03-01 -8 weeks 0.0000000
b123.9 b123 MMED 1 2015-01-06 2015-03-01 -8 weeks 0.0000000
b123.10 b123 MMEM 3 2015-01-06 2015-03-01 -8 weeks 0.0000000
I changed the function to the following and now it works as you want it:
#you need to import this for the na.locf function
library(zoo)
mylist <- split(df, factor(df$ID))
mydata <- do.call(rbind,
lapply(mylist, function(x) {
x <- x[order(x$Interaction),]
x$weekdif <- c(0,difftime(x$Interaction[2:length(x$Interaction)], x$Interaction[1:(length(x$Interaction)-1)], units='weeks'))
#convert all zeros (apart from first to NAs)
x$weekdif[x$weekdif==0] <- NA
#create the rolling values minus the first NAs
#see the examples at ?na.locf for details on what it does
temp <- as.numeric(na.locf(zoo(x$weekdif)))
#add the first NAs
missing_length <- length(x$weekdif) - length(temp)
x$weekdif <- c(rep(0,missing_length), temp)
x
}))
Output:
ID QUEUE Calls Interaction Start Weeks weekdif
a123.1 a123 MSEP 1 2015-01-02 2015-03-01 -9 weeks 0.0000000
a123.2 a123 MSEK 4 2015-01-02 2015-03-01 -9 weeks 0.0000000
a123.3 a123 MSEE 25 2015-01-06 2015-03-01 -8 weeks 0.5714286
a123.4 a123 MSED 14 2015-01-12 2015-03-01 -7 weeks 0.8571429
a123.5 a123 MNEM 6 2015-01-12 2015-03-01 -7 weeks 0.8571429
b123.6 b123 MMEF 25 2015-01-02 2015-03-01 -9 weeks 0.0000000
b123.7 b123 MMEM 5 2015-01-06 2015-03-01 -8 weeks 0.5714286
b123.8 b123 MHEK 1 2015-01-06 2015-03-01 -8 weeks 0.5714286
b123.9 b123 MMED 1 2015-01-06 2015-03-01 -8 weeks 0.5714286
b123.10 b123 MMEM 3 2015-01-06 2015-03-01 -8 weeks 0.5714286
First values for each id are 0 because there is no previous interaction date.

Decimal ages in R

I am trying to calculate decimal ages based on dates of birth and a recent date. I have the following test dataframe.
df.1 <- structure(list(dob = structure(list(sec = c(0, 0, 0, NA, 0, 0
), min = c(0L, 0L, 0L, NA, 0L, 0L), hour = c(0L, 0L, 0L, NA,
0L, 0L), mday = c(18L, 24L, 25L, NA, 31L, 15L), mon = c(11L,
5L, 11L, NA, 11L, 11L), year = c(100L, 101L, 102L, NA, 99L, 101L
), wday = c(1L, 0L, 3L, NA, 5L, 6L), yday = c(352L, 174L, 358L,
NA, 364L, 348L), isdst = c(0L, 1L, 0L, -1L, 0L, 0L), zone = c("GMT",
"BST", "GMT", "", "GMT", "GMT"), gmtoff = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_)), .Names = c("sec",
"min", "hour", "mday", "mon", "year", "wday", "yday", "isdst",
"zone", "gmtoff"), class = c("POSIXlt", "POSIXt")), recent.date = structure(list(
sec = c(0, 0, 0, 0, 0, 0), min = c(0L, 0L, 0L, 0L, 0L, 0L
), hour = c(0L, 0L, 0L, 0L, 0L, 0L), mday = c(3L, 2L, 4L,
3L, 1L, 2L), mon = c(5L, 5L, 5L, 5L, 5L, 5L), year = c(114L,
114L, 114L, 114L, 114L, 114L), wday = c(2L, 1L, 3L, 2L, 0L,
1L), yday = c(153L, 152L, 154L, 153L, 151L, 152L), isdst = c(1L,
1L, 1L, 1L, 1L, 1L), zone = c("BST", "BST", "BST", "BST",
"BST", "BST"), gmtoff = c(NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_)), .Names = c("sec",
"min", "hour", "mday", "mon", "year", "wday", "yday", "isdst",
"zone", "gmtoff"), class = c("POSIXlt", "POSIXt"))), .Names = c("dob",
"recent.date"), row.names = c(NA, -6L), class = "data.frame")
Using the decimal_date() function from the lubridate package, I am trying to convert the dates to decimal dates:
library(lubridate)
df.1$decimal.dob <- decimal_date(df.1$dob)
df.1$decimal.recent.date <- decimal_date(df.1$recent.date)
However, because I have missing dates of birth, I get an error when I try to convert the dates of birth to decimal dates. I therefore can't subtract the 'decimal.dob' column from the 'decimal.recent.date' column to give me a decimal age.
If anyone could help me getting this to work, I would very much appreciate it!
So search for and remove the NA entries. I'm assuming that if there's an NA in, say, dob$sec[j], then dob$min[j] is also NA and so on.
killit <- which (is.na(dob$sec))
foo <- decimal_date(df.1$dob[-c(killit)])
#[1] 2000.962 2001.477 2002.981 1999.997 2001.953
You'll have to shrink your df.1$decimal.dob to have the same length as the reduced input set.

Resources