Getting counts grouped by hour - r

I would like to get the counts per hour for each type (version1 and version2).
Sample data:
type <- c('version1','version1','version1','version2','version2')
startdate <- as.POSIXct(c('2017-11-1 02:11:02.000','2018-3-25 02:13:02.000','2019-3-14 03:45:02.000',
'2017-3-14 02:55:02.000','2018-3-14 03:45:02.000'))
df <- data.frame(type, startdate)
df
type startdate
1 version1 2017-11-01 02:11:02
2 version1 2018-03-25 02:13:02
3 version1 2019-03-14 03:45:02
4 version2 2017-03-14 02:55:02
5 version2 2018-03-14 03:45:02
In this df we see that version1 has two counts for 02h and one count for 03h.
And version2 has one count for 02h and one count for 03h.
Desired output:
hour version1 version2
1 00:00 0 0
2 01:00 0 0
3 02:00 2 1
4 03:00 1 1

We can first get hours from startdate, count number of rows for each hour and type. complete missing hours and fill their count with 0 and use pivot_wider to get data in wide format.
library(dplyr)
library(tidyr)
df %>%
mutate(hr = lubridate::hour(startdate)) %>%
count(hr, type) %>%
complete(type, hr = seq(0, max(hr)), fill = list(n = 0)) %>%
pivot_wider(names_from = type, values_from = n)
# A tibble: 4 x 3
# hr version1 version2
# <int> <dbl> <dbl>
#1 0 0 0
#2 1 0 0
#3 2 2 1
#4 3 1 1

Something was wrong with your start date variable. Thus I set it up with the package lubridate
library(dplyr)
library(tidyr)
type = c('version1','version1','version1','version2','version2')
startdate = lubridate::ymd_hms(c('2017-11-1T02:11:02.000','2018-3-25T02:13:02.000',
'2019-3-14T03:45:02.000','2017-3-14T02:55:02.000',
'2018-3-14T03:45:02.000'))
tibble(type = type, startdate = startdate) %>%
count(type, hour = lubridate::hour(startdate)) %>%
spread(type, n)
# A tibble: 2 x 3
hour version1 version2
<int> <int> <int>
1 2 2 1
2 3 1 1

Base R solution:
# Extract the hour and store it as a vector:
df$hour <- gsub(".* ", "", trunc(df$startdate, units = "hours"))
# Count the number of observations of each type in each hour:
df$type_hour_cnt <- with(df,
ave(paste(type, hour, sep = " - "),
paste(type, hour, sep = " - "), FUN = seq_along))
# Reshape dataframe:
df <- as.data.frame(as.matrix(xtabs(type_hour_cnt ~ hour + type, df, sparse = T)))
# Extract rownames and store them as "hour" vector and then delete row.names:
df <- data.frame(cbind(hour = row.names(df), df), row.names = NULL)

Related

R calculate overlapping months between two start and end dates

I have a dataframe with two start and stop dates that looks like this:
ID G1_START G1_END G2_START G2_END LOCATION
1 1/1/2021 5/31/2021 2/1/2021 5/31/2021 A
2 12/1/2020 3/31/2021 10/1/2020 5/31/2021 B
What I would like to do is create one row per month per patient where the months overlap between the four dates. For example
ID MONTH ACTIVE LOCATION
1 2/1/2021 1 A
1 3/1/2021 1 A
1 4/1/2021 1 A
1 5/1/2021 1 A
2 12/1/2020 1 B
2 1/1/2021 1 B
2 2/1/2021 1 B
2 3/1/2021 1 B
Where active means the ID was on both G1 and G2 during these months.
Here is a method in tidyverse
Reshape data from wide to long format - pivot_longer
Convert the date columns 'START', 'END' to Date class (mdy)
Loop over the 'START', 'END' with map2, get the sequence by '1 month'
Floor the date by month - floor_date
grouped by ID, LOCATION, MONTH, filter the groups where 'Categ' distinct elements are 2
Create 'ACTIVE' column of 1 after returning the distinct rows
library(dplyr)
library(tidyr)
library(lubridate)
library(purrr)
pivot_longer(df1, cols = contains("_"),
names_to = c("Categ", ".value"), names_sep= "_") %>%
transmute(ID, LOCATION, Categ, MONTH = map2(mdy(START), mdy(END), ~
floor_date(seq(.x, .y, by = '1 month'), 'month'))) %>%
unnest(MONTH) %>%
group_by(ID, LOCATION, MONTH) %>%
filter(n_distinct(Categ) == 2) %>%
ungroup %>%
distinct(ID, LOCATION, MONTH) %>%
mutate(ACTIVE = 1) %>%
select(ID, MONTH, ACTIVE, LOCATION)
-output
# A tibble: 8 x 4
ID MONTH ACTIVE LOCATION
<int> <date> <dbl> <chr>
1 1 2021-02-01 1 A
2 1 2021-03-01 1 A
3 1 2021-04-01 1 A
4 1 2021-05-01 1 A
5 2 2020-12-01 1 B
6 2 2021-01-01 1 B
7 2 2021-02-01 1 B
8 2 2021-03-01 1 B
data
df1 <- structure(list(ID = 1:2, G1_START = c("1/1/2021", "12/1/2020"
), G1_END = c("5/31/2021", "3/31/2021"), G2_START = c("2/1/2021",
"10/1/2020"), G2_END = c("5/31/2021", "5/31/2021"), LOCATION = c("A",
"B")), class = "data.frame", row.names = c(NA, -2L))

How can I create a column that cumulatively adds the sum of two previous rows based on conditions?

I tried asking this question before but was it was poorly stated. This is a new attempt cause I haven't solved it yet.
I have a dataset with winners, losers, date, winner_points and loser_points.
For each row, I want two new columns, one for the winner and one for the loser that shows how many points they have scored so far (as both winners and losers).
Example data:
winner <- c(1,2,3,1,2,3,1,2,3)
loser <- c(3,1,1,2,1,1,3,1,2)
date <- c("2017-10-01","2017-10-02","2017-10-03","2017-10-04","2017-10-05","2017-10-06","2017-10-07","2017-10-08","2017-10-09")
winner_points <- c(2,1,2,1,2,1,2,1,2)
loser_points <- c(1,0,1,0,1,0,1,0,1)
test_data <- data.frame(winner, loser, date = as.Date(date), winner_points, loser_points)
I want the output to be:
winner_points_sum <- c(0, 0, 1, 3, 1, 3, 5, 3, 5)
loser_points_sum <- c(0, 2, 2, 1, 4, 5, 4, 7, 4)
test_data <- data.frame(winner, loser, date = as.Date(date), winner_points, loser_points, winner_points_sum, loser_points_sum)
How I've solved it thus far is to do a for loop such as:
library(dplyr)
test_data$winner_points_sum_loop <- 0
test_data$loser_points_sum_loop <- 0
for(i in row.names(test_data)) {
test_data[i,]$winner_points_sum_loop <-
(
test_data %>%
dplyr::filter(winner == test_data[i,]$winner & date < test_data[i,]$date) %>%
dplyr::summarise(points = sum(winner_points, na.rm = TRUE))
+
test_data %>%
dplyr::filter(loser == test_data[i,]$winner & date < test_data[i,]$date) %>%
dplyr::summarise(points = sum(loser_points, na.rm = TRUE))
)
}
test_data$winner_points_sum_loop <- unlist(test_data$winner_points_sum_loop)
Any suggestions how to tackle this problem? The queries take quite some time when the row numbers add up. I've tried elaborating with the AVE function, I can do it for one column to sum a players point as winner but can't figure out how to add their points as loser.
winner <- c(1,2,3,1,2,3,1,2,3)
loser <- c(3,1,1,2,1,1,3,1,2)
date <- c("2017-10-01","2017-10-02","2017-10-03","2017-10-04","2017-10-05","2017-10-06","2017-10-07","2017-10-08","2017-10-09")
winner_points <- c(2,1,2,1,2,1,2,1,2)
loser_points <- c(1,0,1,0,1,0,1,0,1)
test_data <- data.frame(winner, loser, date = as.Date(date), winner_points, loser_points)
library(dplyr)
library(tidyr)
test_data %>%
unite(winner, winner, winner_points) %>% # unite winner columns
unite(loser, loser, loser_points) %>% # unite loser columns
gather(type, pl_pts, winner, loser, -date) %>% # reshape
separate(pl_pts, c("player","points"), convert = T) %>% # separate columns
arrange(date) %>% # order dates (in case it's not)
group_by(player) %>% # for each player
mutate(sum_points = cumsum(points) - points) %>% # get points up to that date
ungroup() %>% # forget the grouping
unite(pl_pts_sumpts, player, points, sum_points) %>% # unite columns
spread(type, pl_pts_sumpts) %>% # reshape
separate(loser, c("loser", "loser_points", "loser_points_sum"), convert = T) %>% # separate columns and give appropriate names
separate(winner, c("winner", "winner_points", "winner_points_sum"), convert = T) %>%
select(winner, loser, date, winner_points, loser_points, winner_points_sum, loser_points_sum) # select the order you prefer
# # A tibble: 9 x 7
# winner loser date winner_points loser_points winner_points_sum loser_points_sum
# * <int> <int> <date> <int> <int> <int> <int>
# 1 1 3 2017-10-01 2 1 0 0
# 2 2 1 2017-10-02 1 0 0 2
# 3 3 1 2017-10-03 2 1 1 2
# 4 1 2 2017-10-04 1 0 3 1
# 5 2 1 2017-10-05 2 1 1 4
# 6 3 1 2017-10-06 1 0 3 5
# 7 1 3 2017-10-07 2 1 5 4
# 8 2 1 2017-10-08 1 0 3 7
# 9 3 2 2017-10-09 2 1 5 4
I finally understood what you want. And I took an approach of getting cumulative points of each player at each point in time and then joining it to the original test_data data frame.
winner <- c(1,2,3,1,2,3,1,2,3)
loser <- c(3,1,1,2,1,1,3,1,2)
date <- c("2017-10-01","2017-10-02","2017-10-03","2017-10-04","2017-10-05","2017-10-06","2017-10-07","2017-10-08","2017-10-09")
winner_points <- c(2,1,2,1,2,1,2,1,2)
loser_points <- c(1,0,1,0,1,0,1,0,1)
test_data <- data.frame(winner, loser, date = as.Date(date), winner_points, loser_points)
library(dplyr)
library(tidyr)
cum_points <- test_data %>%
gather(end_game_status, player_id, winner, loser) %>%
gather(which_point, how_many_points, winner_points, loser_points) %>%
filter(
(end_game_status == "winner" & which_point == "winner_points") |
(end_game_status == "loser" & which_point == "loser_points")) %>%
arrange(date = as.Date(date)) %>%
group_by(player_id) %>%
mutate(cumulative_points = cumsum(how_many_points)) %>%
mutate(cumulative_points_sofar = lag(cumulative_points, default = 0))
select(player_id, date, cumulative_points)
output <- test_data %>%
left_join(cum_points, by = c('date', 'winner' = 'player_id')) %>%
rename(winner_points_sum = cumulative_points_sofar) %>%
left_join(cum_points, by = c('date', 'loser' = 'player_id')) %>%
rename(loser_points_sum = cumulative_points_sofar)
output
The difference to the previous question of the OP is that the OP is now asking for the cumulative sum of points each player has scored so far, i.e., before the actual date. Furthermore, the sample data set now contains a date column which uniquely identifies each row.
So, my previous approach can be used here as well, with some modifications. The solution below reshapes the data from wide to long format whereby two value variables are reshaped simultaneously, computes the cumulative sums for each player id , and finally reshapes from long back to wide format, again. In order to sum only points scored before the actual date, the rows are lagged by one.
It is important to note that the winner and loser columns contain the respective player ids.
library(data.table)
cols <- c("winner", "loser")
setDT(test_data)[
# reshape multiple value variables simultaneously from wide to long format
, melt(.SD, id.vars = "date",
measure.vars = list(cols, paste0(cols, "_points")),
value.name = c("id", "points"))][
# rename variable column
, variable := forcats::lvls_revalue(variable, cols)][
# order by date and cumulate the lagged points by id
order(date), points_sum := cumsum(shift(points, fill = 0)), by = id][
# reshape multiple value variables simultaneously from long to wide format
, dcast(.SD, date ~ variable, value.var = c("id", "points", "points_sum"))]
date id_winner id_loser points_winner points_loser points_sum_winner points_sum_loser
1: 2017-10-01 1 3 2 1 0 0
2: 2017-10-02 2 1 1 0 0 2
3: 2017-10-03 3 1 2 1 1 2
4: 2017-10-04 1 2 1 0 3 1
5: 2017-10-05 2 1 2 1 1 4
6: 2017-10-06 3 1 1 0 3 5
7: 2017-10-07 1 3 2 1 5 4
8: 2017-10-08 2 1 1 0 3 7
9: 2017-10-09 3 2 2 1 5 4

Conditional sum with dates in column names

Want to calculate conditional sum based on specified dates in r. My sample df is
start_date = c("7/24/2017", "7/1/2017", "7/25/2017")
end_date = c("7/27/2017", "7/4/2017", "7/28/2017")
`7/23/2017` = c(1,5,1)
`7/24/2017` = c(2,0,2)
`7/25/2017` = c(0,0,10)
`7/26/2017` = c(2,2,2)
`7/27/2017` = c(0,0,0)
df = data.frame(start_date,end_date,`7/23/2017`,`7/24/2017`,`7/25/2017`,`7/26/2017`,`7/27/2017`)
In Excel it looks like:
I want to perform calculations as specified in Column H which is a conditional sum of columns C through G based on the dates specified in columns A and B.
Apparently, Excel allows columns to be dates but not R.
#wide to long format
dat <- reshape(df, direction="long", varying=list(names(df)[3:7]), v.names="Value",
idvar=c("start_date","end_date"), timevar="Date",
times=seq(as.Date("2017/07/23"),as.Date("2017/07/27"), "day"))
#convert from factor to date class
dat$end_date <- as.Date(dat$end_date, format = "%m/%d/%Y")
dat$start_date <- as.Date(dat$start_date, format = "%m/%d/%Y")
library(dplyr)
dat %>% group_by(start_date, end_date) %>%
mutate(mval = ifelse(between(Date, start_date, end_date), Value, 0)) %>%
summarise(conditional_sum=sum(mval))
# # A tibble: 3 x 3
# # Groups: start_date [?]
# start_date end_date conditional_sum
# <date> <date> <dbl>
# 1 2017-07-01 2017-07-04 0
# 2 2017-07-24 2017-07-27 4
# 3 2017-07-25 2017-07-28 12
You could achieve that as follows:
# number of trailing columns without numeric values
c = 2
# create a separate vector with the dates
dates = as.Date(gsub("X","",tail(colnames(df),-c)),format="%m.%d.%Y")
# convert date columns in dataframe
df$start_date = as.Date(df$start_date,format="%m/%d/%Y")
df$end_date = as.Date(df$end_date,format="%m/%d/%Y")
# calculate sum
sapply(1:nrow(df),function(x) {y = df[x,(c+1):ncol(df)][dates %in%
seq(df$start_date[x],df$end_date[x],by="day") ]; ifelse(length(y)>0,sum(y),0) })
returns:
[1] 4 0 12
Hope this helps!
Here's a solution all in one dplyr pipe:
library(dplyr)
library(lubridate)
library(tidyr)
df %>%
gather(date, value, -c(1, 2)) %>%
mutate(date = gsub('X', '', date)) %>%
mutate(date = gsub('\\.', '/', date)) %>%
mutate(date = mdy(date)) %>%
filter(date >= mdy(start_date) & date <=mdy(end_date)) %>%
group_by(start_date, end_date) %>%
summarize(Conditional_Sum = sum(value)) %>%
right_join(df) %>%
mutate(Conditional_Sum = ifelse(is.na(Conditional_Sum), 0, Conditional_Sum)) %>%
select(-one_of('Conditional_Sum'), one_of('Conditional_Sum'))
## start_date end_date X7.23.2017 X7.24.2017 X7.25.2017 X7.26.2017 X7.27.2017 Conditional_Sum
## <fctr> <fctr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7/24/2017 7/27/2017 1 2 0 2 0 4
## 2 7/1/2017 7/4/2017 5 0 0 2 0 0
## 3 7/25/2017 7/28/2017 1 2 10 2 0 12

Flag dates based on multiple columns

I have a df, this provides information about the create_date and delete_date(if any) for a given ID.
Structure:
ID create_date1 create_date2 delete_date1 delete_date2
1 01-01-2014 NA NA NA
2 01-04-2014 01-08-2014 01-05-2014 NA
the create_date and delete_date extends till 10, i.e. create_date10
and delete_date10 columns are present
Rules/Logic:
We charge a user on monthly basis, if a user was created on 30th of a month, even then it's treated as if the user was active for a month(very low cost)
If a user has a delete date (irrespective on which date) in this month, then from next month the user is not charged
If a user has only create_date and no delete_date then all dates including the create_month is charged
Output expected:
ID 2014-01 2014-02 2014-03 2014-04 2014-05 2014-06 2014-07 2014-08
1 1 1 1 1 1 1 1 1
2 0 0 0 1 1 0 0 1
so on till current date
1 indicates the user is charged/active for that month
Problem:
I have been struggling to do this, but can't even understand how to do this. My earlier method is a bit too slow
Previous Solution:
Make the dataset into tall
Insert sequence of dates for each ID as a new column
Use a for loop to check the status
for each ID, status is equal to 1,
if create_date is equal to sequence, and it's 0 if the lag(delete_date) is equal to sequence
else is same as lag(status)
ID create_date delete_date sequence status?
1 01-01-2014 NA 2014-01 1
1 01-01-2014 NA 2014-02 1
1 01-01-2014 NA 2014-03 1
may not be that efficient : assuming this is just for a single year(could be extended easily)
# convert all dates to Date format
df[,colnames(df[-1])] = lapply(colnames(df[-1]), function(x) as.Date(df[[x]], format = "%d-%m-%Y"))
# extract the month
library(lubridate)
df[,colnames(df[-1])] = lapply(colnames(df[-1]), function(x) month(df[[x]]))
# df
# ID create_date1 create_date2 delete_date1 delete_date2
#1 1 1 NA NA NA
#2 2 4 8 5 NA
# get the current month
current.month <- month(Sys.Date())
# assume for now current month is 9
current.month <- 9
flags <- rep(FALSE, current.month)
func <- function(x){
x[is.na(x)] <- current.month # replacing all NA with current month(9)
create.columns.indices <- x[grepl("create_date", colnames(df[-1]))] # extract the create_months
delete.columns.indices <- x[grepl("delete_date", colnames(df[-1]))] # extract the delete_months
flags <- pmin(1,colSums(t(sapply(seq_along(create.columns.indices),
function(x){
flags[create.columns.indices[x]:delete.columns.indices[x]] = TRUE;
flags
}))))
flags
}
df1 = cbind(df$ID , t(apply(df[-1], 1, func)))
colnames(df1) = c("ID", paste0("month",1:current.month))
# df1
# ID month1 month2 month3 month4 month5 month6 month7 month8 month9
#[1,] 1 1 1 1 1 1 1 1 1 1
#[2,] 2 0 0 0 1 1 0 0 1 1
Here's a still-pretty-long tidyverse approach:
library(tidyverse)
df %>% gather(var, date, -ID) %>% # reshape to long form
# separate date type from column set number
separate(var, c('action', 'number'), sep = '_date', convert = TRUE) %>%
mutate(date = as.Date(date, '%d-%m-%Y')) %>% # parse dates
spread(action, date) %>% # spread create and delete to two columns
mutate(min_date = min(create, delete, na.rm = TRUE), # add helper columns; use outside
max_date = max(create, delete, na.rm = TRUE)) %>% # variable to save memory if an issue
group_by(ID, number) %>%
mutate(month = list(seq(min_date, max_date, by = 'month')), # add month sequence list column
# boolean vector of whether range of months in whole range
active = ifelse(is.na(create),
list(rep(FALSE, length(month[[1]]))),
lapply(month, `%in%`,
seq.Date(create,
min(delete, max_date, na.rm = TRUE),
by = 'month')))) %>%
unnest() %>% # unnest list columns to long form
group_by(ID, month = format(month, '%Y-%m')) %>%
summarise(active = any(active) * 1L) %>% # combine muliple rows for one ID
spread(month, active) # reshape to wide form
## Source: local data frame [2 x 9]
## Groups: ID [2]
##
## ID `2014-01` `2014-02` `2014-03` `2014-04` `2014-05` `2014-06` `2014-07` `2014-08`
## * <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 1 1 1 1 1 1 1 1 1
## 2 2 0 0 0 1 1 0 0 1

Grouping time and counting instances by 12 hour bins in R

I have a dataframe df1 like this :
timestamp
01-12-2015 00:04
01-12-2015 02:20
01-12-2015 02:43
01-12-2015 04:31
01-12-2015 08:51
01-12-2015 11:28
01-12-2015 20:53
01-12-2015 21:28
02-12-2015 00:30
02-12-2015 20:22
Which contains time stamps. I would want to get count by binning hours in 12 hours interval i.e(01-12-2015[0-9],01-12-2015[9-21], and so on.
output Sample:
DayOfMonth Group count
1 1 5
1 2 2
2 1 2
2 2 1
The day of month can be replaced by Serial Number also, starting with 1. Any help to solve this is highly appreciated.
A possible solution in base R:
# convert the 'timestamp' column to a datetime format
df1$timestamp <- as.POSIXct(strptime(df1$timestamp, format = '%d-%m-%Y %H:%M'))
# create day.of.month variable
df1$day.of.month <- format(df1$timestamp, '%d')
# extract the 12 hour interval as am/pm values
df1$group <- gsub('[0-9: ]+','\\1',format(df1$timestamp, '%r'))
# aggregate
aggregate(. ~ group + day.of.month, df1, length)
which gives:
group day.of.month timestamp
1 am 01 6
2 pm 01 2
3 am 02 1
4 pm 02 1
Another solution using data.table and and the pm function of lubridate:
library(lubridate)
library(data.table)
setDT(df1)[, timestamp := dmy_hm(timestamp)
][, group := pm(timestamp)+1
][, .N, .(day.of.month = day(timestamp),group)]
which gives:
day.of.month group N
1: 1 1 6
2: 1 2 2
3: 2 1 1
4: 2 2 1
Used data:
df1 <- structure(list(timestamp = c("01-12-2015 00:04", "01-12-2015 02:20", "01-12-2015 02:43", "01-12-2015 04:31", "01-12-2015 08:51",
"01-12-2015 11:28", "01-12-2015 20:53", "01-12-2015 21:28", "02-12-2015 00:30", "02-12-2015 20:22")),
.Names = "timestamp", class = "data.frame", row.names = c(NA,-10L))
We can use lubridate functions to convert to 'Datetime' class easily and with dplyr to get the output efficiently compared to base R methods.
library(lubridate)
library(dplyr)
df1 %>%
mutate(timestamp = dmy_hm(timestamp)) %>%
group_by(DayOfMonth = day(timestamp)) %>%
group_by(Group = as.numeric(cut(timestamp, breaks = "12 hour")),
add=TRUE) %>%
summarise(GroupCount = n())
# DayOfMonth Group GroupCount
# <int> <dbl> <int>
#1 1 1 6
#2 1 2 2
#3 2 1 1
#4 2 2 1
Or we can use a compact option with data.table
library(data.table)
setDT(df1)[, {t1 <- dmy_hm(timestamp); .(DayOfMonth = day(t1),
Group = (hour(t1)>12)+1L)}][, .(GroupCount = .N), .(DayOfMonth, Group)]
# DayOfMonth Group GroupCount
#1: 1 1 6
#2: 1 2 2
#3: 2 1 1
#4: 2 2 1
NOTE: The data.table solution is done with just two steps...
data
df1 <- structure(list(timestamp = c("01-12-2015 00:04", "01-12-2015 02:20",
"01-12-2015 02:43", "01-12-2015 04:31", "01-12-2015 08:51", "01-12-2015 11:28",
"01-12-2015 20:53", "01-12-2015 21:28", "02-12-2015 00:30", "02-12-2015 20:22"
)), .Names = "timestamp", class = "data.frame", row.names = c(NA,-10L))
Another possible solution in base R :
timeStamp <- c("01-12-2015 00:04","01-12-2015 02:20","01-12-2015 02:43","01-12-2015 04:31",
"01-12-2015 08:51","01-12-2015 11:28","01-12-2015 20:53","01-12-2015 21:28",
"02-12-2015 00:30","02-12-2015 20:22")
times <- as.POSIXlt(timeStamp,format="%d-%m-%Y %H:%M",tz='GMT')
DF <- data.frame(Times=times)
DF$Group <- as.logical(times$hour > 12) + 1
DF$DayOfMonth <- times$mday
res <- aggregate(Times ~ DayOfMonth + Group,data=DF, FUN = length)
# res :
# DayOfMonth Group Times
# 1 1 1 6
# 2 2 1 1
# 3 1 2 2
# 4 2 2 1
Or if you want to include dates in hours range: [21-0] of previous day in the next day :
timeStamp <- c("01-12-2015 00:04","01-12-2015 02:20","01-12-2015 02:43","01-12-2015 04:31",
"01-12-2015 08:51","01-12-2015 11:28","01-12-2015 20:53","01-12-2015 21:28",
"02-12-2015 00:30","02-12-2015 20:22")
times <- as.POSIXlt(timeStamp,format="%d-%m-%Y %H:%M",tz='GMT')
h <- times$hour + times$min*1/60 + times$sec*1/3600
# here we add 3 hours to the dates in hours range [21-0] in this way we
# push them to the next day
times[h >= 21] <- times[h >= 21] + 3*3600
DF <- data.frame(Times=times)
DF$Group <- ifelse(h < 9,1,ifelse(h <= 21,2,NA))
DF$DayOfMonth <- times$mday
res <- aggregate(Times ~ DayOfMonth + Group,data=na.omit(DF), FUN = length)
# res :
# DayOfMonth Group Times
# 1 1 1 5
# 2 2 1 2
# 3 1 2 2
# 4 2 2 1
Adding to the several already presented options, the stringi package has some date parsing functions as well:
library(stringi)
df1$timestamp <- stri_datetime_parse(df1$timestamp, format = 'dd-mm-yyyy HH:mm')
df1$DayOfMonth <- stri_datetime_format(df1$timestamp, format = 'd')
df1$Group <- stri_datetime_format(df1$timestamp, format = 'a')
After that you can get a count with for example the following two options:
# option 1:
aggregate(. ~ Group + DayOfMonth, df1, length) # copied from #ProcrastinatusMaximus
# option 2a:
library(dplyr)
df1 %>%
group_by(DayOfMonth, Group) %>%
tally()
# option 2b:
count(df1, DayOfMonth, Group)
The output of the latter:
DayOfMonth Group n
(chr) (chr) (int)
1 1 a.m. 6
2 1 p.m. 2
3 2 a.m. 1
4 2 p.m. 1

Resources