Time data manipulation - r

I have my time and date as
tt <- structure(list(Date = structure(c(1L, 1L, 1L, 1L, 3L, 3L), .Label = c("2018-10-27",
"2018-10-28", "2018-11-15"), class = "factor"), Time = c("21:07:30.004",
"21:07:31.000", "21:07:32.998", "21:07:32.000", "21:07:33.989",
"21:08:33.989")), row.names = c(NA, 6L), class = "data.frame")
I would like to get the smallest to largest time range for each date as:
Date Time
2018-10-27 21:07:30 to 21:07:32
2018-11-15 21:07:33 to 21:08:33
Code I tried:
time.range <- function(field_or_seed){
paste0(gsub(".* ", "", range(strptime(field_or_seed$Time, format = "%H:%M:%S", tz = 'UTC'))[[1]]), " to ",
gsub(".* ", "", range(strptime(field_or_seed$Time, format = "%H:%M:%S", tz = 'UTC'))[[2]]))
}
aggregate(Time~Date, data = tt, FUN = time.range)
I would really appreciate any suggestion I could use to fix this. Thanks!

A more robust approach is to use appropriate objects, e.g. Date and hms:
library(tidyverse)
tt <- tibble(
Date = c("2018-10-27", "2018-10-27", "2018-10-27", "2018-10-27", "2018-11-15", "2018-11-15"),
Time = c("21:07:30.004", "21:07:31.000", "21:07:32.998", "21:07:32.000", "21:07:33.989", "21:08:33.989")
)
tt %>%
mutate(Date = as.Date(Date),
Time = hms::as_hms(Time)) %>%
group_by(Date) %>%
summarise(start = Time[which.min(Time)], # which.min instead of min to avoid dropping class
end = Time[which.max(Time)])
#> # A tibble: 2 x 3
#> Date start end
#> <date> <time> <time>
#> 1 2018-10-27 21:07:30.004 21:07:32.998
#> 2 2018-11-15 21:07:33.989 21:08:33.989
If you like you could use lubridate::interval to represent the interval, but you'd have to use POSIXct.

Try:
tt$date_time <- as.POSIXct(paste(tt$Date, tt$Time))
do.call(rbind, by(
tt,
tt$Date,
FUN = function(x) {
within(x, Time <-
paste(strftime(min(date_time), "%H:%M:%S"),
"to",
strftime(max(date_time), "%H:%M:%S")))[1, -3]
}
))

Also:
aggregate(
data.frame(Time = strptime(do.call(paste, tt), '%F %R:%OS', tz = 'GMT')),
by = list(Date = tt$Date),
function(Time){
paste(
format(min(Time), '%T'),
format(max(Time), '%T'),
sep = ' to '
)
}
)
# Date Time
#1 2018-10-27 21:07:30 to 21:07:32
#2 2018-11-15 21:07:33 to 21:08:33

Related

apply a custom function across certain columns in a dataframe in R

I have the following dataframe:
library(tidyverse)
library(lubridate)
date_data1 <- data.frame(
name = c('groupA'),
number = as.numeric(c(1:10)),
date1 = seq(from = ymd('2019-07-01'), to = ymd('2019-07-10'), by='days'),
date2 = seq(from = ymd('2019-07-02'), to = ymd('2019-07-11'), by='days'),
date3 = seq(from = ymd('2019-06-29'), to = ymd('2019-07-08'), by='days'),
date4 = seq(from = ymd('2019-07-03'), to = ymd('2019-07-12'), by='days'),
date5 = seq(from = ymd('2019-07-05'), to = ymd('2019-07-14'), by='days')
) %>%
mutate(yday = yday(date5))
date_data2 <- data.frame(
name = c('groupB'),
number = as.numeric(c(1:10)),
date1 = seq(from = ymd('2019-07-01'), to = ymd('2019-07-10'), by='days'),
date2 = seq(from = ymd('2019-07-02'), to = ymd('2019-07-11'), by='days'),
date3 = seq(from = ymd('2019-06-29'), to = ymd('2019-07-08'), by='days'),
date4 = seq(from = ymd('2019-07-03'), to = ymd('2019-07-12'), by='days'),
date5 = seq(from = ymd('2019-07-05'), to = ymd('2019-07-14'), by='days')
) %>%
mutate(yday = yday(date5))
date_data <- bind_rows(date_data1, date_data2)
I want to apply the following function to date1 through date4 columns:
mad <- function(x, y) abs(mean(x - y, na.rm = TRUE))
However, I want to retain the "name" identifier.
I have asked a similar question in the past and the solution worked. However, when attempting to adapt the code, I'm running into issues.
Here's what I thought should work, based on the previous post.
apply(date_data[, 3:6], function(x) mad(date_data[,7], x))
In other words, I'm attempting to find the mean absolute difference (the custom function, "mad") between column 7 ("date5") and columns 3 through 5 (i.e. "date1" through "date4") for each group. The goal is to have a new dataframe that gives the mean absolute difference for each of the date columns (1-4) with two rows, one for groupA and one for groupB.
I tried mapping the function, but I get an error that "arguments imply differeng number of rows."
Here's the code for the map() that does not work:
date_data_test <- date_data %>%
group_by(name) %>%
map_at(c(3:6), function(x) mad(date_data[,7], x)) %>%
data.frame()
Any suggestions are appreciated. Thank you.
Using the across function from dplyr:
library(tidyverse)
library(lubridate)
#>
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#>
#> date, intersect, setdiff, union
date_data1 <- data.frame(
name = c('groupA'),
number = as.numeric(c(1:10)),
date1 = seq(from = ymd('2019-07-01'), to = ymd('2019-07-10'), by='days'),
date2 = seq(from = ymd('2019-07-02'), to = ymd('2019-07-11'), by='days'),
date3 = seq(from = ymd('2019-06-29'), to = ymd('2019-07-08'), by='days'),
date4 = seq(from = ymd('2019-07-03'), to = ymd('2019-07-12'), by='days'),
date5 = seq(from = ymd('2019-07-05'), to = ymd('2019-07-14'), by='days')
) %>%
mutate(yday = yday(date5))
date_data2 <- data.frame(
name = c('groupB'),
number = as.numeric(c(1:10)),
date1 = seq(from = ymd('2019-07-01'), to = ymd('2019-07-10'), by='days'),
date2 = seq(from = ymd('2019-07-02'), to = ymd('2019-07-11'), by='days'),
date3 = seq(from = ymd('2019-06-29'), to = ymd('2019-07-08'), by='days'),
date4 = seq(from = ymd('2019-07-03'), to = ymd('2019-07-12'), by='days'),
date5 = seq(from = ymd('2019-07-05'), to = ymd('2019-07-14'), by='days')
) %>%
mutate(yday = yday(date5))
date_data <- bind_rows(date_data1, date_data2) %>%
as_tibble()
date_data %>%
group_by(name) %>%
summarise(across(
.cols = 2:5,
.fns = ~ abs(mean(interval(.x, date5) %/% days(1))),
.names = "diff_{.col}_date5"
))
#> # A tibble: 2 × 5
#> name diff_date1_date5 diff_date2_date5 diff_date3_date5 diff_date4_date5
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 groupA 4 3 6 2
#> 2 groupB 4 3 6 2
Created on 2021-11-11 by the reprex package (v2.0.1)

How to put/save all elements of a List into one Excel sheet in R?

I have a list (bbb) with 5 elements in it, i.e., each element for a year, like 2010, 2011, ... , 2014:
The first one in the list is this:
> bbb[1]
$`2010`
Date Average
X2010.01.01 2010-01-01 2.079090e-03
X2010.01.02 2010-01-02 5.147627e-04
X2010.01.03 2010-01-03 2.997464e-04
X2010.01.04 2010-01-04 1.375538e-04
X2010.01.05 2010-01-05 1.332109e-04
The second one in the list is this:
> bbb[2]
$`2011`
Date Average
X2011.01.01 2011-01-01 1.546253e-03
X2011.01.02 2011-01-02 1.152864e-03
X2011.01.03 2011-01-03 1.752446e-03
X2011.01.04 2011-01-04 2.639658e-03
X2011.01.05 2011-01-05 5.231150e-03
X2011.01.06 2011-01-06 8.909878e-04
And so on.
Here is my question:
How can I save all of these list's elements in 1 sheet of an Excel file to have something like this:
Your help would be highly appreciated.
You can do this using dcast.
bbb <- list(`2010` = data.frame(date = as.Date("2010-01-01") + 0:4,
avg = 1:5),
`2011` = data.frame(date = as.Date("2011-01-01") + 0:5,
avg = 11:16),
`2012` = data.frame(date = as.Date("2012-01-01") + 0:9,
avg = 21:30),
`2013` = data.frame(date = as.Date("2013-01-01") + 0:7,
avg = 21:28))
df <- do.call("rbind", bbb)
df$year <- format(df$date, format = "%Y")
df$month_date <- format(df$date, format = "%b-%d")
library(data.table)
library(openxlsx)
df_dcast <- dcast(df, month_date~year, value.var = "avg")
write.xlsx(df_dcast, "example1.xlsx")
Or using spread
library(dplyr)
library(tidyr)
df2 <- df %>%
select(-date) %>%
spread(key = year, value = avg)
write.xlsx(df2, "example2.xlsx")
This isn't very pretty, but it's the best I could think of right now. But you could take the dataframes and loop through the list, joining them by date like this:
library(tidyverse)
library(lubridate)
bbb <- list(`2010` = tibble(date = c('01-01-2010', '01-02-2010', '01-03-2010', '01-04-2010', '01-05-2010'),
average = 11:15),
`2011` = tibble(date = c('01-01-2011', '01-02-2011', '01-03-2011', '01-04-2011', '01-05-2011'),
average = 1:5),
`2012` = tibble(date = c('01-01-2012', '01-02-2012', '01-03-2012', '01-04-2012', '01-05-2012'),
average = 6:10))
for (i in seq_along(bbb)) {
if(i == 1){
df <- bbb[[i]] %>%
mutate(
date = paste(day(as.Date(date, format = '%m-%d-%Y')),
month(as.Date(date, format = '%m-%d-%Y'), label = TRUE),
sep = '-')
)
colnames(df) <- c('date', names(bbb[i])) # Assuming your list of dataframes has just 2 columns: date and average
} else {
join_df <- bbb[[i]] %>%
mutate(
date = paste(day(as.Date(date, format = '%m-%d-%Y')),
month(as.Date(date, format = '%m-%d-%Y'), label = TRUE),
sep = '-')
)
colnames(join_df) <- c('date', names(bbb[i]))
df <- full_join(df, join_df, by = 'date')
}
}
This loops through the list of dataframes and reformats the dates to Day-Month.
# A tibble: 5 x 4
date `2010` `2011` `2012`
<chr> <int> <int> <int>
1 1-Jan 11 1 6
2 2-Jan 12 2 7
3 3-Jan 13 3 8
4 4-Jan 14 4 9
5 5-Jan 15 5 10
You could then write that out with the writexl package function write_xlsx

Using the pipe in selfmade function with tidyeval (quo_name)

I have two functions: date_diff and group_stat. So I have read this article tidyverse and I try so create simple functions and use the pipe.
The first function creates a difftime and names them timex_minus_timey but when I pipe this result into the next function I have to look at the name so I can fill in summary_var. Is there a better way to do this?
library(tidyverse)
#
set.seed(42)
data <- dplyr::bind_rows(
tibble::tibble(Hosp = rep("A", 1000),
drg = sample(letters[1:5], 1000, replace = TRUE),
time1 = as.POSIXlt("2018-02-03 08:00:00", tz = "UTC") + rnorm(1000, 0, 60*60*60),
time2 = time1 + runif(1000, min = 10*60, max = 20*60)),
tibble::tibble(Hosp = rep("B", 1000),
drg = sample(letters[1:5], 1000, replace = TRUE),
time1 = as.POSIXlt("2018-02-03 08:00:00", tz = "UTC") + rnorm(1000, 0, 60*60*60),
time2 = time1 + runif(1000, min = 10*60, max = 20*60))
)
date_diff <- function(df, stamp1, stamp2, units = "mins"){
stamp1 <- rlang::enquo(stamp1)
stamp2 <- rlang::enquo(stamp2)
name <- paste0(rlang::quo_name(stamp1), "_minus_", rlang::quo_name(stamp2))
out <- df %>%
dplyr::mutate(!!name := as.numeric(difftime(!!stamp1, !!stamp2, units=units)))
out
}
group_stat <- function(df, group_var, summary_var, .f) {
func <- rlang::as_function(.f)
group_var <- rlang::enquo(group_var)
summary_var <-rlang::enquo(summary_var)
name <- paste0(rlang::quo_name(summary_var), "_", deparse(substitute(.f)))
df %>%
dplyr::group_by(!!group_var) %>%
dplyr::summarise(!!name := func(!!summary_var, na.rm = TRUE))
}
data %>%
date_diff(time2, time1) %>%
group_stat(Hosp, summary_var = time2_minus_time1, mean)
#> # A tibble: 2 x 2
#> Hosp time2_minus_time1_mean
#> <chr> <dbl>
#> 1 A 15.1
#> 2 B 14.9
Created on 2019-05-02 by the reprex package (v0.2.1)
If you intend to always use these functions one after another in this way you could add an attribute containing the new column's name with date_diff, and have group_stat use that attribute. With the if condition, the attribute is only used if it exists and the summary_var argument is not provided.
date_diff <- function(df, stamp1, stamp2, units = "mins"){
stamp1 <- rlang::enquo(stamp1)
stamp2 <- rlang::enquo(stamp2)
name <- paste0(rlang::quo_name(stamp1), "_minus_", rlang::quo_name(stamp2))
out <- df %>%
dplyr::mutate(!!name := as.numeric(difftime(!!stamp1, !!stamp2, units=units)))
attr(out, 'date_diff_nm') <- name
out
}
group_stat <- function(df, group_var, summary_var, .f) {
if(!is.null(attr(df, 'date_diff_nm')) & missing(summary_var))
summary_var <- attr(df, 'date_diff_nm')
group_var <- rlang::enquo(group_var)
name <- paste0(summary_var, "_", deparse(substitute(.f)))
df %>%
dplyr::group_by(!!group_var) %>%
dplyr::summarise_at(summary_var, funs(!!name := .f), na.rm = T)
}
data %>%
date_diff(time2, time1) %>%
group_stat(Hosp, .f = mean)
# # A tibble: 2 x 2
# Hosp time2_minus_time1_mean
# <chr> <dbl>
# 1 A 15.1
# 2 B 14.9

Rename columns of dataframe by days in R

I need to rename a dataframe by days in analysis.
names(dados) <- c("name", "day_1","Freq_1","Percent_1","day_2","Freq_2","Percent_2",
"day_3","Freq_3","Percent_3","day_4","Freq_4","Percent_4",
"day_5","Freq_5","Percent_5","day_6","Freq_6","Percent_6",
"day_7","Freq_7","Percent_7","day_8","Freq_8","Percent_8",
"day_9","Freq_9","Percent_9")
I'm doing an analysis that the data I get is in a list of dataframes, where each dataframe represents a day of analysis. I combine the dataframes and I have the columns 'name' unique and 'day_X', 'Freq_X' and 'Percent_X' for each dataframe as a return.
As return I need the columns to have the following names:
"name", "day_1","Freq_1","Percent_1","day_2","Freq_2","Percent_2","day_3","Freq_3","Percent_3"
How do I go about analyzing 50 days?
reproducible example:
day1 <- data.frame(name = c("jose", "mary", "julia"), freq = c(1,5,3), percent = c(40,30,20))
day2 <- data.frame(name = c("abner", "jose", "mary"), freq = c(3,5,4), percent = c(20,30,20))
day3 <- data.frame(name = c("abner", "jose", "mike"), freq = c(6,2,3), percent = c(40,30,70))
day4 <- data.frame(name = c("andre", "joseph", "ana"), freq = c(1,5,8), percent = c(40,30,20))
day5 <- data.frame(name = c("abner", "poli", "joseph"), freq = c(4,3,3), percent = c(10,30,10))
dates <- list(day1,day2,day4,day5)
data <- Reduce(function(x, y) merge(x, y, by = "name", all = TRUE), dates)
Here's a way to get what you want using the tidyverse suite of packages. We start by putting the data in the "long" format - but add a column with the date:
long_form <- dates %>%
imap_dfr(function(x, y) dplyr::mutate(x, day_num = y))
Now, to get the wide format you are after, we need to reformat things a bit, as done in the following code. I'm not sure what is supposed to go in the day_# variables, as #useR mentioned in the comments, so it's missing. If you have a variable called day, the code should automatically do the right thing as written.
wide_form <- long_form %>%
gather(key, value, -name,-day_num) %>%
dplyr::mutate(
key = paste(key, day_num, sep = "_")
) %>%
select(-day_num) %>%
spread(key, value)
One can use dplyr::bind_rows to merge all data frames form the list to a data frame. Please provide name to list so that day1, day2 etc can set beforehand. Finally, gather and spread is used to transform the data.
names(dates) <- paste("day", seq_along(dates), sep = "")
library(tidyverse)
bind_rows(dates,.id = "Name") %>%
group_by(Name) %>%
mutate(rn = row_number()) %>%
ungroup() %>%
gather(Key, value, -Name,-rn) %>%
unite("Key", c("Key", "Name")) %>%
spread(Key, value) %>%
select(-rn)
Result:
# # A tibble: 3 x 12
# freq_day1 freq_day2 freq_day3 freq_day4 name_day1 name_day2 name_day3 name_day4 percent_day1 percent_day2 percent~ percent~
# * <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
# 1 1 3 1 4 jose abner andre abner 40 20 40 10
# 2 5 5 5 3 mary jose joseph poli 30 30 30 30
# 3 3 4 8 3 julia mary ana joseph 20 20 20 10
#
Data:
Data is slightly modified from OP. I have included stringsAsFactors = FALSE argument as part of data.frame to avoid a mutate_at call to convert factor to character.
day1 <- data.frame(name = c("jose", "mary", "julia"), freq = c(1,5,3), percent = c(40,30,20), stringsAsFactors = FALSE)
day2 <- data.frame(name = c("abner", "jose", "mary"), freq = c(3,5,4), percent = c(20,30,20), stringsAsFactors = FALSE)
day3 <- data.frame(name = c("abner", "jose", "mike"), freq = c(6,2,3), percent = c(40,30,70), stringsAsFactors = FALSE)
day4 <- data.frame(name = c("andre", "joseph", "ana"), freq = c(1,5,8), percent = c(40,30,20), stringsAsFactors = FALSE)
day5 <- data.frame(name = c("abner", "poli", "joseph"), freq = c(4,3,3), percent = c(10,30,10), stringsAsFactors = FALSE)
dates <- list(day1,day2,day4,day5)

R Programming: Creating rows on basis of date difference

I am trying to create a dataset which is based on the difference in the days of start & end date. as an example
Name Start_Date End_Date
Alice 1-1-2017 3-1-2017
John 4-3-2017 5-3-2017
Peter 12-3-2017 12-3-2017
So, the final dataset will be inclusive of the start, end date and also the difference. And eventually it should look something like
Name Date
Alice 1-1-2017
Alice 2-1-2017
Alice 3-1-2017
John 4-3-2017
John 5-3-2017
Peter 12-3-2017
Every help is Great Help. Thanks !
We can use Map to get the sequence and melt the list to data.frame`
df1[-1] <- lapply(df1[-1], as.Date, format = "%d-%m-%Y")
lst <- setNames(Map(function(x, y) seq(x, y, by = "1 day"),
df1$Start_Date, df1$End_Date), df1$Name)
library(reshape2)
melt(lst)[2:1]
data
df1 <- structure(list(Name = c("Alice", "John", "Peter"), Start_Date = structure(c(17167,
17229, 17237), class = "Date"), End_Date = structure(c(17169,
17230, 17237), class = "Date")), .Names = c("Name", "Start_Date",
"End_Date"), row.names = c(NA, -3L), class = "data.frame")
This uses the expandRows function from the package splitstackshape:
df = df %>%
mutate(days_between = as.numeric(End_Date - Start_Date),
id = row_number(Name)) %>%
expandRows("days_between") %>%
group_by(id) %>%
mutate(Date = seq(first(Start_Date),
first(End_Date) - 1,
by = 1)) %>%
ungroup()
using a for loop:
library(data.table)
library(foreach)
library(lubridate)
setDT(df)
names = df[, unique(Name)]
l = foreach(i = 1:length(names)) %do% {
# make a date sequence per name
d = df[Name == names[i], ]
s = seq(from = dmy(d$Start_Date), to = dmy(d$End_Date), by = "days")
# bind the results in a data.table
dx = data.table(name = rep(names[i], length(s)))
dx = cbind(dx, date = s)
}
rbindlist(l)

Resources