How to format the date in r - r

I have a date format like this:
head(ergo_bike)
# A tibble: 6 × 8
hour date...2 time_bike distance calories power `Participant Code` date...8
<dbl> <dttm> <dbl> <dbl> <dbl> <dbl> <chr> <dttm>
1 12 2022-04-12 00:00:00 2 0 0 0.00613 AE1_01 2022-04-12 00:00:00
2 13 2022-04-12 00:00:00 2 0 0 0.00580 AE1_01 2022-04-12 00:00:00
3 14 2022-04-12 00:00:00 1 0 0 0.00258 AE1_01 2022-04-12 00:00:00
4 14 2022-04-13 00:00:00 2 0 0 0.00714 AE1_01 2022-04-13 00:00:00
5 14 2022-03-11 00:00:00 3 0.746 11.2 0.00868 AE1_02 2022-03-11 00:00:00
6 15 2022-03-11 00:00:00 1 0.250 3.75 0.00274 AE1_02 2022-03-11 00:00:00
structure(list(hour = c(12, 13, 14, 14, 14, 15), date...2 = structure(c(1649721600,
1649721600, 1649721600, 1649808000, 1646956800, 1646956800), tzone = "UTC", class = c("POSIXct",
"POSIXt")), time_bike = c(2, 2, 1, 2, 3, 1), distance = c(0,
0, 0, 0, 0.7463732, 0.24986416), calories = c(0, 0, 0, 0, 11.195598,
3.7479625), power = c(0.006130556, 0.005802778, 0.002577778,
0.007138889, 0.008683333, 0.002738889), `Participant Code` = c("AE1_01",
"AE1_01", "AE1_01", "AE1_01", "AE1_02", "AE1_02"), date...8 = structure(c(1649721600,
1649721600, 1649721600, 1649808000, 1646956800, 1646956800), tzone = "UTC", class = c("POSIXct",
"POSIXt"))), row.names = c(NA, -6L), class = c("tbl_df", "tbl",
"data.frame"))
How can I format the date into this form: yyyy-mm-dd? (I don't want the time included.)

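I believe you can use ergo_bike$`date...2` <- as.Date(ergo_bike$`date...2`) (the column is already a date-time, so no format string is needed; there is no column called Date in your data).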
See this for more info on as.Date

You can use
as.Date()
so that will make it:
# `date...2` is already POSIXct, so as.Date() takes a time zone here, not a
# format string. The column is stored with tzone "UTC", so converting in UTC
# simply drops the time part and yields yyyy-mm-dd dates.
ergo_bike$`date...2` <- as.Date(ergo_bike$`date...2`, tz = "UTC")
You can see the syntax on
https://www.statmethods.net/input/dates.html

Related

Calculate time intervals without any overlap

I have the following data:
# dput:
data <- structure(list(start = structure(c(1641193200, 1641189600, 1641218400,
1641189600, 1641222000, 1641222000, 1641222000), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), end = structure(c(1641218400, 1641218400,
1641241800, 1641218400, 1641241800, 1641241800, 1641232800), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), group = c("A", "B", "C", "D", "E",
"F", "G")), row.names = c(NA, -7L), class = c("tbl_df", "tbl",
"data.frame"))
data
# A tibble: 7 x 3
start end group
<dttm> <dttm> <chr>
1 2022-01-03 07:00:00 2022-01-03 14:00:00 A
2 2022-01-03 06:00:00 2022-01-03 14:00:00 B
3 2022-01-03 14:00:00 2022-01-03 20:30:00 C
4 2022-01-03 06:00:00 2022-01-03 14:00:00 D
5 2022-01-03 15:00:00 2022-01-03 20:30:00 E
6 2022-01-03 15:00:00 2022-01-03 20:30:00 F
7 2022-01-03 15:00:00 2022-01-03 18:00:00 G
I want to find the times at which only one group has an "active" time interval (start to end), i.e. the interval does not overlap with that of any other group.
I already experimented with lubridate and the interval function but had trouble comparing more than 2 Intervals with each other.
Desired Output
The output should give the result that the group C has the time interval from 14:00 to 15:00 that has no overlap with any other group.
You can check ivs::iv_locate_splits to see which time frame is occupied by which group:
library(ivs)
# Build an interval vector from the start/end columns, then list which rows
# of `data` cover each non-overlapping piece of the timeline.
ivv <- iv(data[["start"]], data[["end"]])
iv_locate_splits(ivv)
key loc
1 [2022-01-03 06:00:00, 2022-01-03 07:00:00) 2, 4
2 [2022-01-03 07:00:00, 2022-01-03 08:00:00) 1, 2, 4
3 [2022-01-03 08:00:00, 2022-01-03 14:00:00) 1, 2, 4, 7
4 [2022-01-03 14:00:00, 2022-01-03 15:00:00) 3, 7
5 [2022-01-03 15:00:00, 2022-01-03 18:00:00) 3, 5, 6, 7
6 [2022-01-03 18:00:00, 2022-01-03 20:30:00) 3, 5, 6
Updated framework to get the desired outcome:
library(ivs)
# Convert the start/end columns to an interval vector
ivv <- iv(data$start, data$end)
# Split the timeline into non-overlapping pieces; `key` is the piece and
# `loc` lists the rows of `data` that cover it
spl <- iv_locate_splits(ivv)
# Pieces that are covered by exactly one row of `data`
single <- lengths(spl$loc) == 1
# `spl$key` is indexed by split row, whereas `data$group` is indexed by the
# row numbers stored in `loc`. Using one index for both is only correct when
# the two happen to coincide, so they are kept separate here.
data.frame(
  frame = spl$key[single],
  group = data$group[unlist(spl$loc[single])]
)
# frame group
#1 [2022-01-03 14:00:00, 2022-01-03 15:00:00) C

how to filter all the elements of the list in R? but the filter will use the mean of each element so the condition will change for each element

I have a dataset with temperature data for each day, so I grouped it by date. In the end I have a list with one data frame per day. Now I want to filter all of these data frames by a range: the mean temperature of that day (data frame) ± 0.5 °C.
But the problem is that each dataframe in the list has a different mean value (I hope im clear).
So i want to filter by the mean values of a column but this mean changes for every dataframe.
How can i solve this problem.
I'm an amateur in R so anything is helpful. Thank you in advance
This is a short version of the my list
structure(list(structure(list(Date = structure(c(1646434800,
1646434800, 1646434800, 1646434800, 1646434800, 1646434800, 1646434800,
1646434800, 1646434800, 1646434800), tzone = "", class = c("POSIXct",
"POSIXt")), V4 = c(0.875, 0.5, 0.1875, -0.1875, -0.5, -0.8125,
-1.125, -1.375, -1.625, -1.875)), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame")), structure(list(Date = structure(c(1646521200,
1646521200, 1646521200, 1646521200, 1646521200, 1646521200, 1646521200,
1646521200, 1646521200, 1646521200, 1646521200), tzone = "", class = c("POSIXct",
"POSIXt")), V4 = c(3.75, 3.75, 3.6875, 3.6875, 3.6875, 3.6875,
3.6875, 3.625, 3.625, 3.625, 3.625)), row.names = c(NA, -11L), class = c("tbl_df",
"tbl", "data.frame")), structure(list(Date = structure(c(1646607600,
1646607600, 1646607600, 1646607600, 1646607600, 1646607600, 1646607600,
1646607600, 1646607600, 1646607600, 1646607600), tzone = "", class = c("POSIXct",
"POSIXt")), V4 = c(3.6875, 3.6875, 3.6875, 3.6875, 3.6875, 3.625,
3.625, 3.625, 3.625, 3.625, 3.625)), row.names = c(NA, -11L), class = c("tbl_df",
"tbl", "data.frame"))), ptype = structure(list(Date = structure(numeric(0), tzone = "", class = c("POSIXct",
"POSIXt")), V4 = numeric(0)), class = c("tbl_df", "tbl", "data.frame"
), row.names = integer(0)), class = c("vctrs_list_of", "vctrs_vctr",
"list"))
You can do this in several ways. Suppose mydata is the list that you provided in the question.
In dplyr you can bind the rows of all the data frames in mydata first to create a single data frame, and then group them by the Date, and then apply the filter to each group. The result is a data frame.
# Stack the per-day data frames into one, then keep, within each Date, the
# rows whose V4 lies no more than 0.5 from that Date's mean V4.
mydata %>%
  do.call(what = rbind) %>%
  group_by(Date) %>%
  filter(
    V4 >= mean(V4) - 0.5,
    V4 <= mean(V4) + 0.5
  )
# A tibble: 25 x 2
# Groups: Date [3]
# Date V4
# <dttm> <dbl>
# 1 2022-03-05 06:00:00 -0.188
# 2 2022-03-05 06:00:00 -0.5
# 3 2022-03-05 06:00:00 -0.812
# 4 2022-03-06 06:00:00 3.75
# 5 2022-03-06 06:00:00 3.75
# 6 2022-03-06 06:00:00 3.69
# 7 2022-03-06 06:00:00 3.69
# 8 2022-03-06 06:00:00 3.69
# 9 2022-03-06 06:00:00 3.69
# 10 2022-03-06 06:00:00 3.69
# ... with 15 more rows
In base R you can define a function that filters a single data frame and then apply that function to each element of mydata. The result is a list of data frames.
#' Keep the rows of a data frame whose V4 lies close to the mean of V4
#'
#' @param df A data frame (or tibble) with a numeric column `V4`.
#' @param tol Half-width of the accepted band around the mean. Defaults to
#'   0.5, the value that used to be hard-coded.
#' @return `df` restricted to the rows where
#'   `mean(V4) - tol <= V4 <= mean(V4) + tol`; always a data frame.
myfilter <- function(df, tol = 0.5) {
  # Compute the mean once and add/subtract the tolerance outside of mean(),
  # so both bounds are derived from the same centre value.
  center <- mean(df$V4)
  cond <- df$V4 >= center - tol & df$V4 <= center + tol
  # drop = FALSE keeps a one-column base data.frame from collapsing to a vector
  df[cond, , drop = FALSE]
}
lapply(mydata, myfilter)
# [[1]]
# # A tibble: 3 x 2
# Date V4
# <dttm> <dbl>
# 1 2022-03-05 06:00:00 -0.188
# 2 2022-03-05 06:00:00 -0.5
# 3 2022-03-05 06:00:00 -0.812
#
# [[2]]
# # A tibble: 11 x 2
# Date V4
# <dttm> <dbl>
# 1 2022-03-06 06:00:00 3.75
# 2 2022-03-06 06:00:00 3.75
# 3 2022-03-06 06:00:00 3.69
# 4 2022-03-06 06:00:00 3.69
# 5 2022-03-06 06:00:00 3.69
# 6 2022-03-06 06:00:00 3.69
# 7 2022-03-06 06:00:00 3.69
# 8 2022-03-06 06:00:00 3.62
# 9 2022-03-06 06:00:00 3.62
# 10 2022-03-06 06:00:00 3.62
# 11 2022-03-06 06:00:00 3.62
#
# [[3]]
# # A tibble: 11 x 2
# Date V4
# <dttm> <dbl>
# 1 2022-03-07 06:00:00 3.69
# 2 2022-03-07 06:00:00 3.69
# 3 2022-03-07 06:00:00 3.69
# 4 2022-03-07 06:00:00 3.69
# 5 2022-03-07 06:00:00 3.69
# 6 2022-03-07 06:00:00 3.62
# 7 2022-03-07 06:00:00 3.62
# 8 2022-03-07 06:00:00 3.62
# 9 2022-03-07 06:00:00 3.62
# 10 2022-03-07 06:00:00 3.62
# 11 2022-03-07 06:00:00 3.62

Adjust date format after using dput in R

As you can see, it misconfigured the date in my database. How can I fix this or is it that way??
library(readxl)
# Read the workbook and print a reproducible text representation of it
df <- read_excel("C:/Desktop/example.xlsx")
dput(df)
> dput(df)
structure(list(Date = structure(c(1629936000, 1629936000, 1629936000,
1629936000, 1629936000, 1629936000), class = c("POSIXct", "POSIXt"
), tzone = "UTC"), Upd = structure(c(1577836800, 1577836800,
1577836800, 1577836800, 1580601600, 1580601600), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), `Value 1` = c(12, 12, 3, 4, 5, 6)), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
You could reformat the data:
library(dplyr)
# Render both date-time columns as day/month/year text
df %>%
  mutate(
    across(
      c(Date, Upd),
      function(col) format(col, "%d/%m/%Y")
    )
  )
This returns
# A tibble: 6 x 3
Date Upd `Value 1`
<chr> <chr> <dbl>
1 26/08/2021 01/01/2020 12
2 26/08/2021 01/01/2020 12
3 26/08/2021 01/01/2020 3
4 26/08/2021 01/01/2020 4
5 26/08/2021 02/02/2020 5
6 26/08/2021 02/02/2020 6
Or with base R
# Overwrite each date-time column with its day/month/year text form
df[["Date"]] <- format(df[["Date"]], "%d/%m/%Y")
df[["Upd"]] <- format(df[["Upd"]], "%d/%m/%Y")
In base R
# Apply the same day/month/year formatting to both columns in one step
df[c("Date", "Upd")] <- lapply(
  df[c("Date", "Upd")],
  function(col) format(col, "%d/%m/%Y")
)
-output
df
# A tibble: 6 x 3
Date Upd `Value 1`
<chr> <chr> <dbl>
1 26/08/2021 01/01/2020 12
2 26/08/2021 01/01/2020 12
3 26/08/2021 01/01/2020 3
4 26/08/2021 01/01/2020 4
5 26/08/2021 02/02/2020 5
6 26/08/2021 02/02/2020 6
I'm not sure if this is exactly what you're after, but I deal a lot with Excel files where a column that should be a date gets imported as numeric. I made this little function to help me out with that:
#' Convert a day count imported from Excel into a Date
#'
#' @param number Numeric vector of day counts, measured from 1899-12-30.
#' @return A `Date` vector the same length as `number`.
excel_date_number_to_date <- function(number) {
  # Day zero used when interpreting the imported numbers
  excel_epoch <- "1899-12-30"
  as.Date(number, origin = excel_epoch)
}
For your dataframe you can use it like this:
# Run the conversion helper over both date columns
df %>%
  mutate(
    across(c(Date, Upd), function(col) excel_date_number_to_date(col))
  )

Check if dates are within a time frame r

I have two datasets, one with values at specific time points for different IDs and another one with several time frames for the IDs. Now I want to check if the timepoint in dataframe one is within any of the time frames from dataset 2 matching the ID.
For example:
df1:
ID date time
1 2020-04-14 11:00:00
1 2020-04-14 18:00:00
1 2020-04-15 10:00:00
1 2020-04-15 20:00:00
1 2020-04-16 11:00:00
1 ...
2 ...
df2:
ID start end
1 2020-04-14 16:00:00 2020-04-14 20:00:00
1 2020-04-15 18:00:00 2020-04-16 13:00:00
2 ...
2
what I want
df1_new:
ID date time mark
1 2020-04-14 11:00:00 0
1 2020-04-14 18:00:00 1
1 2020-04-15 10:00:00 0
1 2020-04-15 20:00:00 1
1 2020-04-16 11:00:00 1
1 ...
2 ...
Any help would be appreciated!
An option could be:
library(tidyverse)
library(lubridate)
#> date, intersect, setdiff, union
df_1 <- structure(list(ID = c(1L, 1L, 1L, 1L, 1L), date = c("14.04.2020",
"14.04.2020", "15.04.2020", "15.04.2020", "16.04.2020"), time = c("11:00:00",
"18:00:00", "10:00:00", "20:00:00", "11:00:00"), date_time = structure(c(1586862000,
1586887200, 1586944800, 1586980800, 1587034800), class = c("POSIXct",
"POSIXt"), tzone = "UTC")), class = "data.frame", row.names = c(NA,
-5L))
df_2 <- structure(list(ID = c(1L, 1L), start = c("14.04.2020 16:00",
"15.04.2020 18:00"), end = c("14.04.2020 20:00", "16.04.2020 13:00"
)), class = "data.frame", row.names = c(NA, -2L))
# Parse the frame bounds and nest them so that each ID carries its own table
# of start/end times (stored in the list column `data`)
df_22 <- df_2 %>%
  mutate(across(c("start", "end"), dmy_hm)) %>%
  group_nest(ID)
# Attach each ID's frames to its time points, then set mark to 1 when the
# time point falls inside at least one of those frames and 0 otherwise
left_join(x = df_1, y = df_22, by = "ID") %>%
  as_tibble() %>%
  mutate(
    mark = map2_dbl(
      date_time,
      data,
      function(stamp, frames) {
        hit <- any(stamp %within% interval(frames$start, frames$end))
        as.numeric(hit)
      }
    )
  ) %>%
  select(-data)
#> # A tibble: 5 x 5
#> ID date time date_time mark
#> <int> <chr> <chr> <dttm> <dbl>
#> 1 1 14.04.2020 11:00:00 2020-04-14 11:00:00 0
#> 2 1 14.04.2020 18:00:00 2020-04-14 18:00:00 1
#> 3 1 15.04.2020 10:00:00 2020-04-15 10:00:00 0
#> 4 1 15.04.2020 20:00:00 2020-04-15 20:00:00 1
#> 5 1 16.04.2020 11:00:00 2020-04-16 11:00:00 1
Created on 2021-05-25 by the reprex package (v2.0.0)

Calculating the number of buyers that are at every hour from 00:00 to 24:00 in the store with tidyverse

I need to calculate the number of buyers in a store at each hour of the day. I have reproduced the data from another similar problem, but that one did not seem to answer what I am looking for. I do not want to calculate the length of stay in the store; I want to calculate the occupancy of the store by counting all buyers in the store at each hour of the day. I need to do this only with tidyverse and lubridate.
df <- structure(list(ID = c(101, 102, 103, 104, 105, 106, 107),
Time_in = structure(c(1326309720, 1326309900, 1328990700,
1328997240, 1329000840, 1329004440,
1329004680),
class = c("POSIXct", "POSIXt"), tzone = ""),
Time_out = structure(c(1326313800, 1326317340, 1326317460,
1326324660, 1326328260, 1326335460,
1326335460),
class = c("POSIXct", "POSIXt"), tzone = "")), .Names =
c("ID", "Adm", "Disc"),
row.names = c(NA, -7L), class = "data.frame")
Assuming Adm and Disc are an action they perform in the shop.
Using the count on year month day hour here makes it possible to scale this to whatever year you want.
df <- structure(list(ID = c(101, 102, 103, 104, 105, 106, 107),
Adm = structure(c(1326309720, 1326309900, 1328990700,
1328997240, 1329000840, 1329004440,
1329004680),
class = c("POSIXct", "POSIXt"), tzone = ""),
Disc = structure(c(1326313800, 1326317340, 1326317460,
1326324660, 1326328260, 1326335460,
1326335460),
class = c("POSIXct", "POSIXt"), tzone = "")), .Names =
c("ID", "Adm", "Disc"),
row.names = c(NA, -7L), class = "data.frame")
library(tidyverse)
library(lubridate)
#>
#> Attachement du package : 'lubridate'
#> The following object is masked from 'package:base':
#>
#> date
# Stack the Adm and Disc columns into one long Time column, with Type
# recording which of the two columns each value came from
by_hours <- df %>%
  gather(key = "Type", value = "Time", Adm, Disc)
by_hours
#> ID Type Time
#> 1 101 Adm 2012-01-11 20:22:00
#> 2 102 Adm 2012-01-11 20:25:00
#> 3 103 Adm 2012-02-11 21:05:00
#> 4 104 Adm 2012-02-11 22:54:00
#> 5 105 Adm 2012-02-11 23:54:00
#> 6 106 Adm 2012-02-12 00:54:00
#> 7 107 Adm 2012-02-12 00:58:00
#> 8 101 Disc 2012-01-11 21:30:00
#> 9 102 Disc 2012-01-11 22:29:00
#> 10 103 Disc 2012-01-11 22:31:00
#> 11 104 Disc 2012-01-12 00:31:00
#> 12 105 Disc 2012-01-12 01:31:00
#> 13 106 Disc 2012-01-12 03:31:00
#> 14 107 Disc 2012-01-12 03:31:00
# Count the rows falling in each calendar hour (year, month, day, hour)
by_hours %>%
  mutate(Time = ymd_hms(Time)) %>%
  count(
    year = year(Time),
    month = month(Time),
    day = day(Time),
    hour = hour(Time)
  )
#> # A tibble: 10 x 5
#> year month day hour n
#> <dbl> <dbl> <int> <int> <int>
#> 1 2012 1 11 20 2
#> 2 2012 1 11 21 1
#> 3 2012 1 11 22 2
#> 4 2012 1 12 0 1
#> 5 2012 1 12 1 1
#> 6 2012 1 12 3 2
#> 7 2012 2 11 21 1
#> 8 2012 2 11 22 1
#> 9 2012 2 11 23 1
#> 10 2012 2 12 0 2
Created on 2018-07-17 by the reprex package (v0.2.0).

Resources