Related
Having a dataframe like this:
data.frame(id = c(1,2,3,4), time_stamp_1 = c("Nov 2016-Current", "May 2013-Current", "Oct 2015-Current", "May 2014-Current"), time_stamp_2 = c("Mar 2015-Nov 2016", "May 2008-May 2013", "Aug 2005-Current", "Oct 2014-Jan 2015"))
How is it possible to add new columns that hold the time difference in months for every row, substituting "Sept 2022" wherever the value is "Current"?
Example output:
data.frame(id = c(1,2,3,4), time_stamp_1 = c("Nov 2016-Current", "May 2013-Current", "Oct 2015-Current", "May 2014-Current"), time_stamp_2 = c("Mar 2015-Nov 2016", "May 2008-May 2013", "Aug 2005-Current", "Oct 2014-Jan 2015"), time_stamp_1_duration = c(41,43,24,53), time_stamp_2_duration = c(32,12,45,32))
The duration values above are for illustration only; they are not the real results.
This should do the trick. First replace every "Current" (and any "Sept 2022") end label with "Sep 2022", using the abbreviation "Sep" that R recognizes; then use tidyr::separate and zoo::as.yearmon() to convert to year-month format; then calculate the intervals (multiplied by 12 to express them in months, as the OP asked):
library(dplyr) # mutate(), across(), select() live here, not in tidyr
library(tidyr)
library(zoo)

df <- data.frame(
  id = c(1, 2, 3, 4),
  time_stamp_1 = c("Nov 2016-Current", "May 2013-Current", "Oct 2015-Current", "May 2014-Current"),
  time_stamp_2 = c("Mar 2015-Nov 2016", "May 2008-May 2013", "Aug 2005-Current", "Oct 2014-Jan 2015")
)

# convert current and Sept to "Sep 2022"
df[2:3] <- lapply(df[2:3], function(x) gsub("-Current|-Sept 2022", "-Sep 2022", x))

df %>%
  # remove = FALSE keeps the original label columns, so no join back is needed
  separate(time_stamp_1, into = c("my1a", "my1b"), sep = "-", remove = FALSE) %>%
  separate(time_stamp_2, into = c("my2a", "my2b"), sep = "-", remove = FALSE) %>%
  mutate(across(c(my1a, my1b, my2a, my2b), ~ as.yearmon(.x, format = "%b %Y"))) %>%
  # yearmon differences are fractional years; round() removes the
  # floating-point noise so the result is an exact whole number of months
  mutate(
    interval_1 = round((my1b - my1a) * 12),
    interval_2 = round((my2b - my2a) * 12)
  ) %>%
  select(all_of(names(df)), interval_1, interval_2)
Output:
id time_stamp_1 time_stamp_2 interval_1 interval_2
1 1 Nov 2016-Sep 2022 Mar 2015-Nov 2016 70 20
2 2 May 2013-Sep 2022 May 2008-May 2013 112 60
3 3 Oct 2015-Sep 2022 Aug 2005-Sep 2022 83 205
4 4 May 2014-Sep 2022 Oct 2014-Jan 2015 100 3
As G. Grothendieck mentions in the comments, we could wrap this in a function:
# thanks to G. Grothendieck
# Whole months spanned by "Mon YYYY-Mon YYYY" labels.
#
# x: character vector of "start-end" labels; an end of "Current" or
#    "Sept 2022" is treated as "Sep 2022".
# Returns a numeric vector, one month count per element of x.
ts2mos <- function(x) {
  x <- gsub("-Current|-Sept 2022", "-Sep 2022", x)
  # Parsing stops after the leading "Mon YYYY", so the full label yields the start
  start <- as.yearmon(x, "%b %Y")
  # Drop everything up to the last "-" to isolate the end label
  end <- as.yearmon(sub(".*-", "", x), "%b %Y")
  # yearmon differences are fractional years; round() strips floating-point
  # noise so the result compares equal to the expected whole number
  round(12 * (end - start))
}
df %>% mutate(interval_1 = ts2mos(time_stamp_1),
interval_2 = ts2mos(time_stamp_2))
Using tidyverse
library(dplyr)
library(lubridate)
library(stringr)
# For every column whose name starts with "time_stamp", add a companion
# "<column>_duration" column holding the span length in whole months.
df1 %>%
mutate(across(starts_with('time_stamp'), ~ {
# Treat an open-ended "Current" as 'Sep 2022'
tmp <- str_replace(.x, "Current", 'Sep 2022') %>%
# Rewrite "Mon YYYY-Mon YYYY" as "YYYY-Mon-01/YYYY-Mon-01" (start/end form)
str_replace("(\\w+) (\\d+)-(\\w+) (\\d+)", "\\2-\\1-01/\\4-\\3-01") %>%
# Hand the "start/end" string to lubridate's interval()
interval
# Integer-divide the interval by a one-month period
tmp %/% months(1)}, .names = "{.col}_duration"))
-output
id time_stamp_1 time_stamp_2 time_stamp_1_duration time_stamp_2_duration
1 1 Nov 2016-Current Mar 2015-Nov 2016 70 20
2 2 May 2013-Current May 2008-May 2013 112 60
3 3 Oct 2015-Current Aug 2005-Current 83 205
4 4 May 2014-Current Oct 2014-Jan 2015 100 3
A tidyverse approach
library(dplyr)
library(lubridate)
library(stringr)
# Add a "<column>_duration" column (whole months) for every time_stamp column.
# "Current" is replaced by "Sep 2022" only inside the calculation, so the
# original labels are never modified and a genuine "Sep 2022" in the data is
# not turned into "Current" afterwards.
df %>%
  mutate(across(
    starts_with("time_stamp"),
    ~ {
      # Column 1 holds the start label, column 2 the end label
      bounds <- str_split_fixed(str_replace(.x, "Current", "Sep 2022"), "-", 2)
      interval(my(bounds[, 1]), my(bounds[, 2])) %/% months(1)
    },
    .names = "{.col}_duration"
  ))
id time_stamp_1 time_stamp_2 time_stamp_1_duration
1 1 Nov 2016-Current Mar 2015-Nov 2016 70
2 2 May 2013-Current May 2008-May 2013 112
3 3 Oct 2015-Current Aug 2005-Current 83
4 4 May 2014-Current Oct 2014-Jan 2015 100
time_stamp_2_duration
1 20
2 60
3 205
4 3
library(stringr)
#' Count the whole calendar months between two "Mon YYYY" labels.
#'
#' Works on calendar fields directly instead of dividing a day count by 30:
#' the day-based estimate drifts (Nov 2016 to Sep 2022 is 2130 days, which
#' rounds to 71 rather than 70) and also depends on the local time zone.
#'
#' @param x Character vector of length two, c(start, end), e.g. the pieces of
#'   "Nov 2016-Current" after splitting on "-". A label equal to "Current"
#'   is treated as "Sep 2022".
#' @return A single number: 12 * (year difference) + (month difference).
timespan_to_duration <- function(x) {
  x[!is.na(x) & x == "Current"] <- "Sep 2022"
  pieces <- strsplit(trimws(x), "\\s+")
  month_name <- vapply(pieces, `[`, character(1), 1L)
  year <- as.integer(vapply(pieces, `[`, character(1), 2L))
  # Match on the first three letters, ignoring case, so that both "Sep" and
  # "September" resolve regardless of the session locale
  month <- match(tolower(substr(month_name, 1L, 3L)), tolower(month.abb))
  12 * (year[2] - year[1]) + (month[2] - month[1])
}
df <- data.frame(id = c(1,2,3,4),
time_stamp_1 = c("Nov 2016-Current", "May 2013-Current", "Oct 2015-Current", "May 2014-Current"),
time_stamp_2 = c("Mar 2015-Nov 2016", "May 2008-May 2013", "Aug 2005-Current", "Oct 2014-Jan 2015"))
# Split each "start-end" label on the dash and convert every pair to months
df$time_stamp_1_duration <- unlist(
  lapply(str_split(df$time_stamp_1, '-'), timespan_to_duration)
)
df$time_stamp_2_duration <- unlist(
  lapply(str_split(df$time_stamp_2, '-'), timespan_to_duration)
)
df
One possible solution uses the function tstrsplit from the data.table package. Note that I am also using the built-in constant month.abb.
# Add duration1/duration2: whole months between the two ends of each label
# in columns 2 and 3 of df.
df[c("duration1", "duration2")] <- lapply(df[2:3], function(x) {
  # Split "Mon YYYY-Mon YYYY" on whitespace or dash into four named pieces
  x <- data.table::tstrsplit(sub("Current", "Sep 2022", x),
    split = "\\s|-",
    type.convert = TRUE,
    names = c("mo1", "yr1", "mo2", "yr2"))
  # Turn month abbreviations into month numbers 1..12
  x[c("mo1", "mo2")] <- lapply(x[c("mo1", "mo2")], match, month.abb)
  # Plain calendar arithmetic; also correct when both ends fall in the same
  # year, where clamping the year gap at zero and adding 12 overcounts by 12
  (x$yr2 - x$yr1) * 12 + (x$mo2 - x$mo1)
})
id time_stamp_1 time_stamp_2 duration1 duration2
1 1 Nov 2016-Current Mar 2015-Nov 2016 70 20
2 2 May 2013-Current May 2008-May 2013 112 60
3 3 Oct 2015-Current Aug 2005-Current 83 205
4 4 May 2014-Current Oct 2014-Jan 2015 100 3
I have columns like these:
year period period2 Sales
2015 201504 April 2015 10000
2015 201505 May 2015 11000
2018 201803 March 2018 12000
I want to change the type of period or period2 column as a date, to use later in time series analysis
Data:
tibble::tibble(
year = c(2015,2015,2018),
period = c(201504, 201505,201803 ),
period2 = c("April 2015", "May 2015", "March 2018"),
Sales = c(10000,11000,12000)
)
Using lubridate package you can transform them into date variables:
df <- tibble::tibble(
year = c(2015,2015,2018),
period = c(201504, 201505,201803 ),
period2 = c("April 2015", "May 2015", "March 2018"),
Sales = c(10000,11000,12000)
)
library(dplyr)
df %>%
mutate(period = lubridate::ym(period),
period2 = lubridate::my(period2))
I have a data that looks like this:
Sample data can be built using this code:
df<-structure(list(ID = c(1, 1, 1, 2, 2, 3, 3, 3), Date = c("Day 1",
"Day 7", "Day 29", "Day_8", "Day9", "Day7", "Day.1", "Day 21"
), Score = c("A", "B", "E", "D", "F", "G", "A", "B"), Pass = c("Y",
"Y", "N", "Y", "N", "N", "Y", "Y")), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"))
How can I write piped code to do the filtering and selection? The filter I want is the earliest date, and the columns to select are (ID, Date, Score). If it is doable, I would also like to clean the data a little, as it is all over the place right now. The final data might look like this:
Could anyone give me some guidance on this. if possible, both base and tidyverse?
my thought is:
df1 <- df %>% filter() %>% select(-Pass)
PartII:
If date is something like ad date, how should I get the max(Date)?
The new data set can be built using
df2<- structure(list(Subject = c("39-903", "39-903", "39-903",
"39-903", "39-903", "39-903", "39-903", "39-903", "39-905",
"39-906", "39-907", "304-902", "301-902", "301-903", "301-904"
), DT = c("30 Apr 2019", "25 Jun 2019", "23 Jul 2019", "24 Oct 2019",
"19 Dec 2019", "27 Jan 2020", "05 Apr 2020", "29 Apr 2020", "",
"03 Dec 2018", "12 Jul 2019", "29 Apr 2020", "30 Dec 2019", "13 Jan 2020",
"8 Jun 2020")), class = "data.frame", row.names = c(NA, -15L))
I tried
df2<- df2%>% group_by(Subject)%>% mutate(Date=dmy("DT")) %>% filter (Date==max(Date)
and got warning Warning messages: 1: All formats failed to parse. No formats found.. Mine did not work.
You can also try:
library(dplyr)
library(tidyr)
#Code
# Keep, for every ID, the row(s) with the smallest day number.
newdf <- df %>%
  group_by(ID) %>%
  mutate(
    # Normalise "Day.1" to "Day_1" so the labels look alike
    Date = gsub("\\.", "_", Date),
    # Strip every non-digit and convert; base R only, because parse_number()
    # is not provided by the dplyr/tidyr packages loaded here
    Val = as.numeric(gsub("\\D+", "", Date))
  ) %>%
  filter(Val == min(Val)) %>%
  select(c(ID, Date, Score))
Output:
# A tibble: 3 x 3
# Groups: ID [3]
ID Date Score
<dbl> <chr> <chr>
1 1 Day 1 A
2 2 Day_8 D
3 3 Day_1 A
Update: More versatile solution:
#Code 2
newdf <- df %>% group_by(ID) %>%
mutate(Val=as.numeric(gsub("[^0-9-]", "", Date))) %>%
filter(Val==min(Val)) %>%
select(c(ID,Date,Score))
Output:
# A tibble: 3 x 3
# Groups: ID [3]
ID Date Score
<dbl> <chr> <chr>
1 1 Day 1 A
2 2 Day_8 D
3 3 Day.1 A
You can try with this:
library(dplyr)
df %>%
mutate(Date = as.numeric(stringr::str_extract(Date, "\\d+"))) %>%
group_by(ID) %>%
slice_min(Date) %>%
ungroup() %>%
select(-Pass)
#> # A tibble: 3 x 3
#> ID Date Score
#> <dbl> <dbl> <chr>
#> 1 1 1 A
#> 2 2 8 D
#> 3 3 1 A
For Date I believe it's better if you keep just the number instead of Day X.
We can use base R
df1 <- transform(df, Date = as.numeric(gsub("\\D+", "", Date)))
df1[with(df1, ave(Date, ID, FUN = min) == Date),]
# ID Date Score Pass
#1 1 1 A Y
#4 2 8 D Y
#7 3 1 A Y
if we need to keep the NA elements as well, can use an OR (|) condition
library(dplyr)
library(lubridate)
df2%>%
group_by(Subject)%>%
mutate(Date=dmy(DT)) %>%
filter (Date==max(Date) |is.na(Date))
I have a dataset, df1, I would like to convert all the values from the 24 hour clock to UTC.
Date Name
1/2/2020 16:46 A
1/2/2020 16:51 B
I Would like
Date Name
1/2/2020 4:46:47 PM A
1/2/2020 4:51:44 PM B
I have tried:
df$Date<- format(df$Date, "%m/%d/%Y %I:%M:%S %p")
dput:
structure(list(Date = structure(1:2, .Label = c("1/2/2020 16:46",
"1/2/2020 16:51"), class = "factor"), Name = structure(1:2, .Label = c("A",
"B"), class = "factor")), class = "data.frame", row.names = c(NA,
-2L))
You can first convert the data to POSIXct format and then use format to get data in the required format.
df$Date <- format(as.POSIXct(df$Date, format = "%m/%d/%Y %H:%M"),
"%m/%d/%Y %I:%M:%S %p")
#Can also use mdy_hm from lubridate
#df$Date <- format(lubridate::mdy_hm(df$Date), "%m/%d/%Y %I:%M:%S %p")
df
# Date Name
#1 01/02/2020 04:46:00 PM A
#2 01/02/2020 04:51:00 PM B
Assuming you want to actually convert a string in one format to a string in another format rather than having it as a (more useful) actual date/time, you can use a little arithmetic and string chopping along with mapply:
# Rewrite "m/d/yyyy HH:MM" strings as "m/d/yyyy h:MM AM/PM" by string surgery.
# Each element of splits is c(date, hour, minute).
splits <- strsplit(as.character(df$Date), " |:")
Hours <- as.numeric(vapply(splits, `[`, character(1), 2))
# Hours 0-11 are AM, 12-23 are PM
AMPM <- c(" AM", " PM")[Hours %/% 12 + 1]
# Map 0..23 onto the 12-hour dial: 0 -> 12, 1..12 unchanged, 13..23 -> 1..11
# (midnight must read 12, not 0)
Hours <- (Hours - 1) %% 12 + 1
df$Date <- mapply(function(x, y, z) paste0(x[1], " ", y, ":", x[3], z), splits, Hours, AMPM)
df
#> Date Name
#> 1 1/2/2020 4:46 PM A
#> 2 1/2/2020 4:51 PM B
Created on 2020-02-26 by the reprex package (v0.3.0)
Assuming the same assumptions as the previous answer by Allan, here is another way of converting from 24 hour to 12 hour.
library(tidyverse)
library(lubridate)
df <- tibble(
date = c(ymd_hms("2020/01/02 16:46:00", "2020/01/02 16:51:00", tz = "UTC")),
name = c("A", "B")
)
# Add newdatetime: the timestamp rendered on a 12-hour clock. The result is
# assigned back to df so the new column is still there when df is printed.
df <- df %>%
  mutate(
    date_hour = hour(date),
    # Noon (hour 12) already belongs to PM
    am_pm = if_else(date_hour >= 12, "PM", "AM"),
    # 0 -> 12, 1..12 unchanged, 13..23 -> 1..11
    date_hour = (date_hour - 1) %% 12 + 1,
    # Zero-pad minutes so 16:05 prints as "4:05", not "4:5"
    newdatetime = paste0(
      date(date), " ", date_hour, ":", sprintf("%02d", minute(date)), " ", am_pm
    )
  ) %>%
  select(-c(date_hour, am_pm))
df
# A tibble: 2 x 3
date name newdatetime
<dttm> <chr> <chr>
1 2020-01-02 16:46:00 A 2020-01-02 4:46 PM
2 2020-01-02 16:51:00 B 2020-01-02 4:51 PM
Hope this helps!
Recently I stumbled over a problem. Unfortunately, my date variable has not been recorded uniformly.
I got a similar data frame like the one shown below
Variable1 <- c(10,20,30,40,50)
Variable2 <- c("a", "b", "c", "d", "d")
Date <- c("today 10:45", "yesterday 3:10", "28 october 2018 5:32", "28 october 2018 8:32", "27 october 2018 5:32")
df <- data.frame(Variable1, Variable2, Date)
df
For my use I need to extract only the date of it. Therefore, I would like to create a new variable based on "Date".
The Date variable should only contain the date. The hour is irrelevant for my purpose and can be ignored.
My goal is to get the following data frame:
Variable1 <- c(10,20,30,40,50)
Variable2 <- c("a", "b", "c", "d", "d")
Date <- c("31 october 2018", "30 october 2018", "28 october 2018", "28 october 2018", "27 october 2018")
df2 <- data.frame(Variable1, Variable2, Date)
df2
Preferably the values for Date should also be in the correct format (date).
Thank you already in advance.
# Build NewDate as a proper Date column from the mixed free-text Date column.
# Explicit entries such as "28 october 2018 5:32" are parsed first (the
# trailing time is ignored); "today ..." / "yesterday ..." do not match the
# format and come back as NA, to be filled in below.
df$NewDate <- as.Date(as.character(df$Date), format = "%d %B %Y")
df$NewDate[grepl("today", df$Date)] <- Sys.Date() # Convert today to date
df$NewDate[grepl("yesterday", df$Date)] <- Sys.Date() - 1 # Convert yesterday to date
df
Variable1 Variable2 Date NewDate
1 10 a today 10:45 2018-10-31
2 20 b yesterday 3:10 2018-10-30
3 30 c 28 october 2018 5:32 2018-10-28
4 40 d 28 october 2018 8:32 2018-10-28
5 50 d 27 october 2018 5:32 2018-10-27
tolower( # not strictly necessary, but for consistency
gsub("yesterday", format(Sys.Date()-1, "%d %B %Y"), # convert *day to dates
gsub("today", format(Sys.Date(), "%d %B %Y"),
gsub("\\s*[0-9:]*$", "", # remove the times
c("today 10:45", "yesterday 3:10", "28 october 2018 5:32", "28 october 2018 8:32", "27 october 2018 5:32")))))
# [1] "31 october 2018" "30 october 2018" "28 october 2018" "28 october 2018" "27 october 2018"
Another solution, using indices.
Date <- c("today 10:45", "yesterday 3:10", "28 october 2018 5:32", "28 october 2018 8:32", "27 october 2018 5:32")
# Swap the relative words for ISO dates, giving e.g. "2018-10-31 10:45"
Date <- sub("today", format(Sys.Date()), Date)
Date <- sub("yesterday", format(Sys.Date() - 1), Date)
# Entries that still contain letters carry a month name and need an explicit
# format. A logical mask is used instead of grep() indices because
# Date[-i] selects nothing at all when i is empty.
has_month_name <- grepl("[[:alpha:]]", Date)
Date[has_month_name] <- format(
  as.POSIXct(Date[has_month_name], format = "%d %B %Y %H:%M"),
  format = "%d %B %Y"
)
Date[!has_month_name] <- format(as.POSIXct(Date[!has_month_name]), format = "%d %B %Y")
Date
#[1] "31 October 2018" "30 October 2018" "28 October 2018"
#[4] "28 October 2018" "27 October 2018"
Then I noticed the solution by user r2evans, that converts everything to lowercase. So, if necessary, end with
Date <- tolower(Date)