As you can see, the dates in my data frame were not imported the way I expected. How can I fix this, or is this how it is supposed to be?
library(readxl)
# Import the workbook, then print a copy-pasteable representation of the result.
df <- read_excel(path = "C:/Desktop/example.xlsx")
dput(df)
> dput(df)
structure(list(Date = structure(c(1629936000, 1629936000, 1629936000,
1629936000, 1629936000, 1629936000), class = c("POSIXct", "POSIXt"
), tzone = "UTC"), Upd = structure(c(1577836800, 1577836800,
1577836800, 1577836800, 1580601600, 1580601600), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), `Value 1` = c(12, 12, 3, 4, 5, 6)), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
You could reformat the data:
library(dplyr)
# Render both date-time columns as day/month/year text.
mutate(
  df,
  across(c(Date, Upd), function(col) format(col, "%d/%m/%Y"))
)
This returns
# A tibble: 6 x 3
Date Upd `Value 1`
<chr> <chr> <dbl>
1 26/08/2021 01/01/2020 12
2 26/08/2021 01/01/2020 12
3 26/08/2021 01/01/2020 3
4 26/08/2021 01/01/2020 4
5 26/08/2021 02/02/2020 5
6 26/08/2021 02/02/2020 6
Or with base R
# Overwrite each date-time column with its day/month/year text form.
df[["Date"]] <- format(df[["Date"]], format = "%d/%m/%Y")
df[["Upd"]] <- format(df[["Upd"]], format = "%d/%m/%Y")
In base R
# Format both date-time columns in one pass and assign them back in place.
df[c("Date", "Upd")] <- lapply(
  df[c("Date", "Upd")],
  function(col) format(col, "%d/%m/%Y")
)
-output
df
# A tibble: 6 x 3
Date Upd `Value 1`
<chr> <chr> <dbl>
1 26/08/2021 01/01/2020 12
2 26/08/2021 01/01/2020 12
3 26/08/2021 01/01/2020 3
4 26/08/2021 01/01/2020 4
5 26/08/2021 02/02/2020 5
6 26/08/2021 02/02/2020 6
I'm not sure if this is exactly what you're after, but I deal a lot with Excel files where a column that should be a date gets imported as numeric. I made this little function to help me out with that:
#' Convert Excel serial day numbers to `Date`
#'
#' @param number Numeric vector of Excel serial dates (days since `origin`).
#' @param origin Day that serial number 0 maps to. The default,
#'   `"1899-12-30"`, is the origin the R documentation for `as.Date()` gives
#'   for Excel's Windows (1900) date system; pass `"1904-01-01"` for workbooks
#'   that use the 1904 date system.
#' @return A `Date` vector the same length as `number`.
excel_date_number_to_date <- function(number, origin = "1899-12-30") {
  as.Date(number, origin = origin)
}
For your dataframe you can use it like this:
# Run the converter over both date columns.
mutate(df, across(c(Date, Upd), excel_date_number_to_date))
Related
I've been looking for answers and messing around with my code for a couple hours. I have a dataset that looks like the following for a specific ID:
# A tibble: 14 × 3
ID state orderDate
<dbl> <chr> <dttm>
1 4227631 1 2022-03-14 19:00:00
2 4227631 1 2022-03-14 20:00:00
3 4227631 1 2022-03-15 11:00:00
4 4227631 0 2022-03-15 11:00:00
5 4227631 1 2022-03-15 20:00:00
6 4227631 1 2022-03-16 04:00:00
7 4227631 0 2022-03-16 04:00:00
8 4227631 1 2022-03-16 05:00:00
9 4227631 0 2022-03-16 13:00:00
10 4227631 1 2022-03-16 15:00:00
This occurs for hundreds of IDs. For this example, I am using dplyr to group_by ID. I only care when status changes between values, not if it stays the same.
I want to calculate the cumulative time each ID remains in status 1. The instances where status 1 is repeated multiple times before it changes should be ignored. I have been planning to use lubridate and dplyr to perform the analysis.
Tibble I am using for this example:
structure(list(ID = c(4227631, 4227631, 4227631, 4227631, 4227631,
4227631, 4227631, 4227631, 4227631, 4227631), state = c("1",
"1", "1", "0", "1", "1", "0", "1", "0", "1"), orderDate = structure(c(1647284400,
1647288000, 1647342000, 1647342000, 1647374400, 1647403200, 1647403200,
1647406800, 1647435600, 1647442800), tzone = "UTC", class = c("POSIXct",
"POSIXt"))), row.names = c(NA, -10L), class = c("tbl_df", "tbl",
"data.frame"))
I've tried various solutions such as Cumulative time with reset however I'm having trouble with lag and incorporating it into this specific analysis.
The expected output would maybe look something like this:
And then I would plan to sum all statusOne together to figure out cumulative time spent in this state.
Invite all more elegant solutions or if someone has a link to a prior question.
EDIT
Using solution below I figured it out!
The solution didn't look at the situations where state 0 immediately followed state 1 and we wanted to look at the total time elapsed between these states.
# Total hours each ID spends in state "1".
# An interval between two consecutive rows counts when it *starts* in state "1":
# that covers both repeated "1" rows and the "1" -> "0" transition that ends a
# run. The posted example stores the status in the `state` column.
df %>%
  group_by(ID) %>%
  mutate(
    hours = if_else(
      lag(state) == "1",
      as.numeric(difftime(orderDate, lag(orderDate), units = "hours")),
      0,
      missing = 0 # first row of each ID has no previous row
    )
  ) %>%
  summarise(total_hours = sum(hours, na.rm = TRUE)) %>%
  # Drop IDs that never spent any time in state "1".
  filter(total_hours != 0)
This is far from elegant, but at least it appears to provide the correct answer:
library(tidyverse)
# Example data: one ID, ten status readings with UTC timestamps.
df <- structure(
  list(
    ID = rep(4227631, 10L),
    state = c("1", "1", "1", "0", "1", "1", "0", "1", "0", "1"),
    orderDate = .POSIXct(
      c(
        1647284400, 1647288000, 1647342000, 1647342000, 1647374400,
        1647403200, 1647403200, 1647406800, 1647435600, 1647442800
      ),
      tz = "UTC"
    )
  ),
  row.names = c(NA, -10L),
  class = c("tbl_df", "tbl", "data.frame")
)
# Step 1: within each ID, number every uninterrupted run of identical `state`
# values (`max` is the run counter) and, for state "1" rows only, measure the
# hours elapsed since the previous row of the same run.
df2 <- df %>%
  group_by(ID) %>%
  mutate(
    max = cumsum(state != lag(state, default = "1")),
    hours = ifelse(
      max == lag(max) & state == "1",
      as.numeric(difftime(orderDate, lag(orderDate), units = "hours")),
      NA_real_
    )
  )

# Step 2: one total per run. Grouping by ID as well as `max` keeps runs that
# belong to different IDs apart (run counters restart at 0 for every ID).
df3 <- df2 %>%
  group_by(ID, max) %>%
  summarise(statusOne = sum(hours, na.rm = TRUE), .groups = "drop")

# Step 3: attach each run total to the first row of its run and show 0 on the
# remaining rows, then drop the helper columns. The result stays grouped by ID.
df4 <- df2 %>%
  left_join(df3, by = c("ID", "max")) %>%
  group_by(ID, max) %>%
  mutate(statusOne = ifelse(row_number() == 1, statusOne, 0)) %>%
  group_by(ID) %>%
  select(-c(max, hours))
df4
#> # A tibble: 10 × 4
#> # Groups: ID [1]
#> ID state orderDate statusOne
#> <dbl> <chr> <dttm> <dbl>
#> 1 4227631 1 2022-03-14 19:00:00 16
#> 2 4227631 1 2022-03-14 20:00:00 0
#> 3 4227631 1 2022-03-15 11:00:00 0
#> 4 4227631 0 2022-03-15 11:00:00 0
#> 5 4227631 1 2022-03-15 20:00:00 8
#> 6 4227631 1 2022-03-16 04:00:00 0
#> 7 4227631 0 2022-03-16 04:00:00 0
#> 8 4227631 1 2022-03-16 05:00:00 0
#> 9 4227631 0 2022-03-16 13:00:00 0
#> 10 4227631 1 2022-03-16 15:00:00 0
Created on 2022-04-04 by the reprex package (v2.0.1)
Edit
It's a lot more straightforward to get the total_hours state=1 for each ID:
# Total hours per ID spent inside runs of repeated state "1" rows.
df %>%
  group_by(ID) %>%
  # `max` numbers each uninterrupted run of identical states within the ID.
  mutate(max = cumsum(state != lag(state, default = "1"))) %>%
  # Only rows that continue a state "1" run contribute; time between repeated
  # state "0" rows must not be counted as time in state "1".
  mutate(
    hours = ifelse(
      max == lag(max) & state == "1",
      as.numeric(difftime(orderDate, lag(orderDate), units = "hours")),
      NA_real_
    )
  ) %>%
  summarise(total_hours = sum(hours, na.rm = TRUE))
#> # A tibble: 1 × 2
#> ID total_hours
#> <dbl> <dbl>
#> 1 4227631 24
Created on 2022-04-04 by the reprex package (v2.0.1)
I have a dataframe as so
# Example data: clock times stored as text, including one missing value.
df <- structure(
  list(
    TIME = c(
      "11:15:00", NA, "15:15:00", "12:00:00", "18:40:00",
      "18:15:00", "7:10:00", "15:58:00", rep("10:00:00", 2L)
    )
  ),
  class = c("tbl_df", "tbl", "data.frame"),
  row.names = c(NA, -10L)
)
And I basically want to create a new variable which tells me if the time is in a certain group.
I wrote the following but it's not correct; I tried changing to as.POSIXct but no dice.
# Asker's non-working attempt at assigning each TIME to one of six groups.
# Why it cannot work as written:
#   * `00:00`, `5:59`, `06:00`, ... are not time literals. `:` is R's sequence
#     operator, so e.g. `5:59` is the integers 5, 6, ..., 59.
#   * TIME holds clock times without a date ("11:15:00"); as.POSIXct() is
#     called on them without a format, which generally cannot be parsed.
#   * The nested ifelse() calls are not closed properly: the parentheses are
#     unbalanced as posted.
df <- df %>%
mutate(time_groups = ifelse(between(as.POSIXct(TIME),00:00, 5:59), 1,
ifelse(between(as.POSIXct(TIME),06:00, 8:59), 2,
ifelse(between(as.POSIXct(TIME),09:00,11:59), 3,
ifelse(between(as.POSIXct(TIME),12:00,14:59), 4,
ifelse(between(as.POSIXct(TIME),15:00,17:59), 5,
ifelse(between(as.POSIXct(TIME),18:00,23:59), 6,
), NA)
You could use the findInterval function:
library(tidyverse)
library(lubridate)
# Lower bound of each time group: 00:00-05:59 -> 1, 06:00-08:59 -> 2,
# 09:00-11:59 -> 3, 12:00-14:59 -> 4, 15:00-17:59 -> 5, 18:00-23:59 -> 6.
# findInterval() assigns x to interval i when b[i] <= x < b[i + 1], so the
# breaks must be where each group starts, not where it ends.
a <- c("00:00", "06:00", "09:00", "12:00", "15:00", "18:00")
# Use one reference date for both the breaks and the data so they stay
# comparable even if the code runs across midnight.
today <- Sys.Date()
b <- ymd_hm(paste(today, a))
df %>%
  mutate(Interval = findInterval(ymd_hms(paste(today, TIME)), b))
TIME Interval
<chr> <int>
1 11:15:00 3
2 NA NA
3 15:15:00 5
4 12:00:00 4
5 18:40:00 6
6 18:15:00 6
7 7:10:00 2
8 15:58:00 5
9 10:00:00 3
10 10:00:00 3
I have the following data frame:
# Example data in wide form: three (date, result) pairs per ID, dates in UTC.
df <- structure(
  list(
    ID = 1:4,
    col1.date = .POSIXct(
      c(1546188000, 1272294300, 1087908540, 1512241620),
      tz = "UTC"
    ),
    col2.date = .POSIXct(
      c(1546237740, 1272928800, 1087966800, 1512277200),
      tz = "UTC"
    ),
    col3.date = .POSIXct(
      c(1546323000, 1272949200, 1088049600, 1512396000),
      tz = "UTC"
    ),
    col1.result = c(1.31, 0.95, 3.3, 0.55),
    col2.result = c(1.19, 1.57, 1.6, 0.59),
    col3.result = c(0.97, 2.13, 1.1, 0.57)
  ),
  row.names = c(NA, -4L),
  class = c("tbl_df", "tbl", "data.frame")
)
I would like to have for each ID three rows and two columns: result and date.
This is what I have tried:
# Asker's attempt at reshaping to long format (the final closing parenthesis is
# missing as posted).
# gather() stacks the date-time columns and the numeric result columns into the
# single `value` column; as the asker reports, the dates then come out as
# plain numbers.
df_long <- df %>%
gather(v, value, col1.date:col3.result) %>%
separate(v, c("var", "col")
however I am getting the date transformed to numeric.
What am I doing wrong?
Since you ultimately want to reshape multiple columns (and it is the "new way" of tidyr-1.0.0), then try pivot_longer. This answer is adapted directly from the example in the help page at ?pivot_longer:
# One row per (ID, colN): the part of each column name before the dot becomes
# `set`, the part after it (".value") names the output column (date / result).
pivot_longer(
  data = df,
  cols = col1.date:col3.result,
  names_pattern = "(.*)\\.(.*)",
  names_to = c("set", ".value")
)
# # A tibble: 12 x 4
# ID set date result
# <int> <chr> <dttm> <dbl>
# 1 1 col1 2018-12-30 16:40:00 1.31
# 2 1 col2 2018-12-31 06:29:00 1.19
# 3 1 col3 2019-01-01 06:10:00 0.97
# 4 2 col1 2010-04-26 15:05:00 0.95
# 5 2 col2 2010-05-03 23:20:00 1.57
# 6 2 col3 2010-05-04 05:00:00 2.13
# 7 3 col1 2004-06-22 12:49:00 3.3
# 8 3 col2 2004-06-23 05:00:00 1.6
# 9 3 col3 2004-06-24 04:00:00 1.1
# 10 4 col1 2017-12-02 19:07:00 0.55
# 11 4 col2 2017-12-03 05:00:00 0.59
# 12 4 col3 2017-12-04 14:00:00 0.570
I was wondering if someone here can help me with a lapply question.
Every month, data are extracted and the data frames are named according to the date extracted (01-08-2019,01-09-2019,01-10-2019 etc). The contents of each data frame are similar to the example below:
01-09-2019
ID DOB
3 01-07-2019
5 01-06-2019
7 01-05-2019
8 01-09-2019
01-10-2019
ID DOB
2 01-10-2019
5 01-06-2019
8 01-09-2019
9 01-02-2019
As the months roll on, there are more data sets being downloaded.
I am wanting to calculate the ages of people in each of the data sets based on the date the data was extracted - so in essence, the age would be the date difference between the data frame name and the DOB variable.
01-09-2019
ID DOB AGE(months)
3 01-07-2019 2
5 01-06-2019 3
7 01-05-2019 4
8 01-09-2019 0
01-10-2019
ID DOB AGE(months)
2 01-10-2019 0
5 01-06-2019 4
8 01-09-2019 1
9 01-02-2019 8
I was thinking of putting all of the data frames together in a list (as there are a lot) and then using lapply to calculate age across all data frames. How do I go about calculating the difference between a data frame name and a column?
If I may suggest a slightly different approach: It might make more sense to compress your list into a single data frame before calculating the ages. Given your data looks something like this, i.e. it is a list of data frames, where the list element names are the dates of access:
$`01-09-2019`
# A tibble: 4 x 2
ID DOB
<dbl> <date>
1 3 2019-07-01
2 5 2019-06-01
3 7 2019-05-01
4 8 2019-09-01
$`01-10-2019`
# A tibble: 4 x 2
ID DOB
<dbl> <date>
1 2 2019-10-01
2 5 2019-06-01
3 8 2019-09-01
4 9 2019-02-01
You can call bind_rows first with parameter .id = "date_extracted" to turn your list into a data frame, and then calculate age in months.
library(tidyverse)
library(lubridate)
# Stack all extracts into one table; the list names (extraction dates, written
# day-month-year) become the `date_extracted` column.
tib <- bind_rows(tib_list, .id = "date_extracted") %>%
  mutate(
    date_extracted = dmy(date_extracted),
    # Parse DOB only when it is still text; re-parsing an existing Date with
    # dmy() would misread its year-month-day text form.
    DOB = if (is.Date(DOB)) DOB else dmy(DOB),
    # Whole months between birth and extraction. Unlike month() - month(),
    # this stays correct when the two dates fall in different years.
    age_months = interval(DOB, date_extracted) %/% months(1)
  )
#### OUTPUT ####
# A tibble: 8 x 4
date_extracted ID DOB age_months
<date> <dbl> <date> <dbl>
1 2019-09-01 3 2019-07-01 2
2 2019-09-01 5 2019-06-01 3
3 2019-09-01 7 2019-05-01 4
4 2019-09-01 8 2019-09-01 0
5 2019-10-01 2 2019-10-01 0
6 2019-10-01 5 2019-06-01 4
7 2019-10-01 8 2019-09-01 1
8 2019-10-01 9 2019-02-01 8
This can be solved with lapply as well but we can also use Map in this case to iterate over list and their names after adding all the dataframes in a list. In base R,
# For every data frame `x` and its name `y` (the extraction date, written
# day-month-year) add the age in months at extraction time.
Map(function(x, y) {
  # Both the list names and DOB are "dd-mm-yyyy" text (DOB may be a factor);
  # without an explicit format as.Date() reads them as year-month-day and
  # produces dates such as 0001-07-20.
  extracted <- as.Date(y, format = "%d-%m-%Y")
  x$DOB <- as.Date(as.character(x$DOB), format = "%d-%m-%Y")
  # Count the year difference too, so ages stay correct across year boundaries.
  year_gap <- as.integer(format(extracted, "%Y")) -
    as.integer(format(x$DOB, "%Y"))
  month_gap <- as.integer(format(extracted, "%m")) -
    as.integer(format(x$DOB, "%m"))
  transform(x, age = 12L * year_gap + month_gap)
}, list_df, names(list_df))
#$`01-09-2019`
# ID DOB age
#1 3 2019-07-01 2
#2 5 2019-06-01 3
#3 7 2019-05-01 4
#4 8 2019-09-01 0
#$`01-10-2019`
# ID DOB age
#1 2 2019-10-01 0
#2 5 2019-06-01 4
#3 8 2019-09-01 1
#4 9 2019-02-01 8
We can also do the same in tidyverse
library(dplyr)
library(lubridate)
# Add `age` (whole months between DOB and the extraction date) to every data
# frame; the extraction date is the element's name in the list.
purrr::imap(
  list_df,
  function(data, extracted) {
    mutate(
      data,
      # Parse both day-month-year strings explicitly (DOB may be a factor) and
      # count whole months, so differences spanning a new year are correct.
      age = interval(dmy(as.character(DOB)), dmy(extracted)) %/% months(1)
    )
  }
)
data
# Example input: one data frame per extraction date (the list names), with DOB
# stored as a factor of day-month-year strings.
list_df <- list(
  `01-09-2019` = structure(
    list(
      ID = c(3L, 5L, 7L, 8L),
      DOB = factor(
        c("01-07-2019", "01-06-2019", "01-05-2019", "01-09-2019"),
        levels = c("01-05-2019", "01-06-2019", "01-07-2019", "01-09-2019")
      )
    ),
    class = "data.frame",
    row.names = c(NA, -4L)
  ),
  `01-10-2019` = structure(
    list(
      ID = c(2L, 5L, 8L, 9L),
      DOB = factor(
        c("01-10-2019", "01-06-2019", "01-09-2019", "01-02-2019"),
        levels = c("01-02-2019", "01-06-2019", "01-09-2019", "01-10-2019")
      )
    ),
    class = "data.frame",
    row.names = c(NA, -4L)
  )
)
It's bad practice to use dates and numbers as data frame names; consider prefixing the date with an "x", as shown below in this base R solution:
# Collect the monthly extracts under syntactic names ("x" + dd_mm_yyyy).
df_list <- list(x01_09_2019 = `01-09-2019`, x01_10_2019 = `01-10-2019`)
# For every extract: recover the report date from its list name, prepend it as
# the first column, and add Age as the difference report_date - DOB
# (a difftime in days).
df_list <- Map(function(x, nm) {
  # Strip only the leading "x" prefix before parsing the date.
  report_date <- as.Date(sub("^x", "", nm), format = "%d_%m_%Y")
  x <- cbind(report_date = report_date, x)
  x$Age <- x$report_date - x$DOB
  x
}, df_list, names(df_list))
Data:
# Example extracts, named after their extraction date; DOB is a Date column
# given as days since 1970-01-01.
`01-09-2019` <- structure(
  list(
    ID = c(3, 5, 7, 8),
    DOB = as.Date(c(18078, 18048, 18017, 18140), origin = "1970-01-01")
  ),
  row.names = c(NA, -4L),
  class = "data.frame"
)
`01-10-2019` <- structure(
  list(
    ID = c(2, 5, 8, 9),
    DOB = as.Date(c(18170, 18048, 18140, 17928), origin = "1970-01-01")
  ),
  row.names = c(NA, -4L),
  class = "data.frame"
)
I need to calculate the number of buyers in a store at each hour of the day. I have reproduced the data from another similar problem, but that did not seem to answer the problem I am looking at. I do not want to calculate the length of stay in the store but want to calculate the occupancy of the store, by counting all buyers in the store, at each hour of the day. I need to do this only with tidyverse and lubridate.
# Example data: one row per buyer with two timestamps in the session time zone
# (tzone = ""). The columns end up named ID, Adm and Disc.
df <- structure(
  list(
    ID = c(101, 102, 103, 104, 105, 106, 107),
    Adm = .POSIXct(
      c(
        1326309720, 1326309900, 1328990700, 1328997240,
        1329000840, 1329004440, 1329004680
      ),
      tz = ""
    ),
    Disc = .POSIXct(
      c(
        1326313800, 1326317340, 1326317460, 1326324660,
        1326328260, 1326335460, 1326335460
      ),
      tz = ""
    )
  ),
  row.names = c(NA, -7L),
  class = "data.frame"
)
Assuming Adm and Disc are an action they perform in the shop.
Using the count on year month day hour here makes it possible to scale this to whatever year you want.
# Example data: one row per buyer with the Adm and Disc timestamps in the
# session time zone (tzone = "").
df <- structure(
  list(
    ID = c(101, 102, 103, 104, 105, 106, 107),
    Adm = .POSIXct(
      c(
        1326309720, 1326309900, 1328990700, 1328997240,
        1329000840, 1329004440, 1329004680
      ),
      tz = ""
    ),
    Disc = .POSIXct(
      c(
        1326313800, 1326317340, 1326317460, 1326324660,
        1326328260, 1326335460, 1326335460
      ),
      tz = ""
    )
  ),
  row.names = c(NA, -7L),
  class = "data.frame"
)
library(tidyverse)
library(lubridate)
#>
#> Attachement du package : 'lubridate'
#> The following object is masked from 'package:base':
#>
#> date
# Stack the Adm and Disc timestamps into one `Time` column; `Type` records
# which of the two columns each value came from.
by_hours <- gather(df, key = "Type", value = "Time", Adm, Disc)
by_hours
#> ID Type Time
#> 1 101 Adm 2012-01-11 20:22:00
#> 2 102 Adm 2012-01-11 20:25:00
#> 3 103 Adm 2012-02-11 21:05:00
#> 4 104 Adm 2012-02-11 22:54:00
#> 5 105 Adm 2012-02-11 23:54:00
#> 6 106 Adm 2012-02-12 00:54:00
#> 7 107 Adm 2012-02-12 00:58:00
#> 8 101 Disc 2012-01-11 21:30:00
#> 9 102 Disc 2012-01-11 22:29:00
#> 10 103 Disc 2012-01-11 22:31:00
#> 11 104 Disc 2012-01-12 00:31:00
#> 12 105 Disc 2012-01-12 01:31:00
#> 13 106 Disc 2012-01-12 03:31:00
#> 14 107 Disc 2012-01-12 03:31:00
# Number of recorded events per calendar hour.
# `Time` is already a date-time, so its components are read directly;
# re-parsing it with ymd_hms() would round-trip through text, which can fail
# for midnight values and reinterprets the local clock time as UTC.
by_hours %>%
  count(
    year = year(Time),
    month = month(Time),
    day = day(Time),
    hour = hour(Time)
  )
#> # A tibble: 10 x 5
#> year month day hour n
#> <dbl> <dbl> <int> <int> <int>
#> 1 2012 1 11 20 2
#> 2 2012 1 11 21 1
#> 3 2012 1 11 22 2
#> 4 2012 1 12 0 1
#> 5 2012 1 12 1 1
#> 6 2012 1 12 3 2
#> 7 2012 2 11 21 1
#> 8 2012 2 11 22 1
#> 9 2012 2 11 23 1
#> 10 2012 2 12 0 2
Created on 2018-07-17 by the reprex package (v0.2.0).