This is just head() of my dataset; there are millions of rows. This is how it looks:
#before
dt$date
# "2010-05-12" "2010-05-28" "2010-06-29" "2010-06-30" "2010-07-02" "2010-07-02"
I want to convert these to the last day of the month, but I have to accommodate the fact that months can end on the 30th or the 31st (or the 28th/29th for February). How would I change them accordingly?
#after
[c("2010-05-31" "2010-05-31" "2010-06-30" "2010-06-30" "2010-07-31" "2010-07-31")]
Cheers
Using the lubridate package:
require(lubridate)
require(data.table)
dt <- data.table(date = as.Date(c("2010-05-12", "2010-05-28", "2010-06-29", "2010-06-30", "2010-07-02", "2010-07-02")))
day(dt$date) <- days_in_month(dt$date)
output:
> dt
date
1: 2010-05-31
2: 2010-05-31
3: 2010-06-30
4: 2010-06-30
5: 2010-07-31
6: 2010-07-31
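As an aside, a base R sketch of the same idea, for when you'd rather not depend on lubridate: jump to the first of the month, overshoot into the next month, and step back one day.
# base R alternative: vectorised last-day-of-month
d <- as.Date(c("2010-05-12", "2010-06-30"))
first_of_month <- as.Date(format(d, "%Y-%m-01"))
# adding 31 days always lands somewhere in the following month, so the
# first of that month minus one day is the end of the original month
as.Date(format(first_of_month + 31, "%Y-%m-01")) - 1
# [1] "2010-05-31" "2010-06-30"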
A base solution (works on data.table objects) with several steps:
# Find the range: date_range => Date vector
date_range <- range(df$date)
# Generate a sequence, having every date in the range:
# date_lkp => Date vector
date_lkp <- seq.Date(date_range[1], date_range[2], by = "days")
# Truncate the date to months: mth_date => Date Vector
mth_date <- as.Date(paste0(substr(date_lkp, 1, 8), "01"), "%Y-%m-%d")
# Store every date in the sequence as well as the date for each
# end of month: date_tab => data.frame
date_tab <- data.frame(date_lkp = date_lkp,
                       eom_date = ave(date_lkp, mth_date, FUN = max))
# Perform a lookup for each date in the original data to retrieve
# the last day of each month: new_date => Date vector
df$eom_date <- date_tab$eom_date[match(df$date, date_tab$date_lkp)]
# Data:
date <- c("2010-05-12",
"2010-05-28",
"2010-06-29",
"2010-06-30",
"2010-07-02",
"2010-07-02")
library(data.table)
df <- data.table(date = date, stringsAsFactors = FALSE)
df$date <- as.Date(df$date, "%Y-%m-%d")
I have two data frames, x and y. The data frame x has a range of dates while data frame y has individual dates. I want to get the sum of the individual date values for the time ranges in data frame x. Thus id "a" would have the sum of all the values from 2019/1/1 through 2019/3/1.
id <- c("a","b","c")
start_date <- as.Date(c("2019/1/1", "2019/2/1", "2019/3/1"))
end_date <- as.Date(c("2019/3/1", "2019/4/1", "2019/5/1"))
x <- data.frame(id, start_date, end_date)
dates <- seq(as.Date("2019/1/1"),as.Date("2019/5/1"),1)
values <- runif(121, min=0, max=7)
y <- data.frame(dates, values)
Desired output
id start_date end_date sum
a 2019/1/1 2019/3/1 221.8892
One base R option is using apply:
x$sum <- apply(x, 1, function(v) sum(subset(y, dates >= v["start_date"] & dates <= v["end_date"])$values))
such that
> x
id start_date end_date sum
1 a 2019-01-01 2019-03-01 196.0311
2 b 2019-02-01 2019-04-01 185.6970
3 c 2019-03-01 2019-05-01 173.6429
Data
set.seed(1234)
id <- c("a","b","c")
start_date <- as.Date(c("2019/1/1", "2019/2/1", "2019/3/1"))
end_date <- as.Date(c("2019/3/1", "2019/4/1", "2019/5/1"))
x <- data.frame(id, start_date, end_date)
dates <- seq(as.Date("2019/1/1"),as.Date("2019/5/1"),1)
values <- runif(121, min=0, max=7)
y <- data.frame(dates, values)
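One caveat: apply() coerces each row of x to a character vector, and the comparison above only works because R converts the strings back to Date. A mapply() variant (a sketch of the same lookup) keeps the Date class throughout:
# iterate over the start/end pairs without losing the Date class
x$sum <- mapply(function(s, e) sum(y$values[y$dates >= s & y$dates <= e]),
                x$start_date, x$end_date)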
There are many ways of doing this. One possibility would be:
library(data.table)
x <- setDT(x)
# create a complete series for each id
x <- x[, .(dates = seq(start_date, end_date, 1)), by=id]
# merge the data
m <- merge(x, y, by="dates")
# get the sums
m[, .(sum = sum(values)), by=id]
id sum
1: a 196.0311
2: b 185.6970
3: c 173.6429
You can call set.seed() before you create the random values to replicate the numbers exactly:
set.seed(1234)
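With data.table 1.9.8+ you can also skip the expand-and-merge step entirely by using a non-equi join; a sketch on the original (unexpanded) x and y:
library(data.table)
setDT(x); setDT(y)
# for each row of x, sum the y values whose date falls in [start_date, end_date]
x[, sum := y[x, on = .(dates >= start_date, dates <= end_date),
             sum(values), by = .EACHI]$V1]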
I have a dataframe called 'trial'. I have combined the Date and Time in the data frame to get a field which has the timestamp as a POSIXct. I want to set this combined date-time (the timestamp) as the index for my data frame 'trial'. How can I do so? I have seen similar questions on this, with no success.
The code is as follows:
trial <- read.csv("2018_05_04_h093500.csv", header=TRUE, skip = 16, sep=",")
trial$Date <- with(trial, as.POSIXct(paste(as.Date(Date, format="%Y/%m/%d"), Time)))
dtPOSIXct <- as.POSIXct(trial$Date )
dtTime <- as.numeric(dtPOSIXct - trunc(dtPOSIXct, "days"))
class(dtTime) <- "POSIXct"
If you're talking about rownames, which are the R equivalent of an index in pandas: they can't be POSIXct datetimes, they have to be characters.
# sample data
x <- data.frame('a' = 1:3, 'b' = c('a', 'b', 'c'))
dates <- c('2018-01-01 15:51:33', '2018-01-04 11:42:31', '2018-01-07 22:04:41')
rownames(x) <- as.POSIXct(dates, format = '%Y-%m-%d %H:%M:%S')
print(x)
#                     a b
# 2018-01-01 15:51:33 1 a
# 2018-01-04 11:42:31 2 b
# 2018-01-07 22:04:41 3 c
print(class(rownames(x)[1]))
# [1] "character"
That said, you can still coerce them to POSIXct (or any other class, obviously) at evaluation time, at the cost of some overhead and code clutter:
# print rows of x whose rownames, coerced to POSIXct, fall after d
d <- as.POSIXct('2018-01-03 00:00:00', format = '%Y-%m-%d %H:%M:%S')
x[as.POSIXct(rownames(x), format = '%Y-%m-%d %H:%M:%S') > d, ]
# a b
#2018-01-04 11:42:31 2 b
#2018-01-07 22:04:41 3 c
However, perhaps an easier approach would be to just have an arbitrary column effectively act as a datetime index:
x$date <- as.POSIXct(dates, format = '%Y-%m-%d %H:%M:%S')
class(x$date[1])
# [1] "POSIXct" "POSIXt"
I have a dataframe loaded in RStudio with information about numerous events (millions).
Each row is an entry of a single event and apart from other information it includes two attributes with date information. The first one contains the date when the event began and the second when it ended. But the events are not sequential so they might overlap in time.
fecha fecha_fin
7510607 2014-02-13 20:09:59.8270000 2014-02-27 09:55:40.9700000
7510608 2014-02-13 20:10:01.1870000 2014-02-27 09:55:42.5630000
7557931 2014-02-16 05:32:08.6230000 2014-02-16 14:03:19.4970000
What could be the best and most efficient option to find which calendar days had no activity (without any event in process)? Please keep in mind that the duration of the events must be taken into consideration.
I tend to use foverlaps from the data.table package for such cases, e.g.:
library(data.table)
dt <- fread("id,fecha,fecha_fin
7510607,2014-02-01 20:09:59.8270000,2014-02-10 09:55:40.9700000
7510607,2014-02-13 20:09:59.8270000,2014-02-27 09:55:40.9700000
7510608,2014-02-13 20:10:01.1870000,2014-02-27 09:55:42.5630000
7557931,2014-02-16 05:32:08.6230000,2014-02-16 14:03:19.4970000")
setkey(dt, fecha, fecha_fin)
set(dt, j = 1L, value = NULL)
dt <- dt[,lapply(.SD, as.POSIXct, tz = "CET"),.SDcols=1:2]
dt2 <- data.table(
  fecha = as.POSIXct(seq(min(as.Date(dt$fecha)), max(as.Date(dt$fecha_fin)), "1 day"))
)[, fecha_fin := fecha + 60*60*24 - 1]
as.Date(foverlaps(dt2, dt)[is.na(fecha) & is.na(fecha_fin),i.fecha])
# [1] "2014-02-11" "2014-02-12"
Update, with slightly modified code from lukeA:
I hope there is nothing wrong with my benchmarking here...
library(data.table)
library(lubridate)
library(microbenchmark)
# Create dt ---------------------------------------------------------------
size = 99999
# With this size the result is an empty set; check smaller sizes like 999
# to confirm the results are the same for both functions
create_dt <- function() {
set.seed(2016)
dt <- data.table(
ID = 1:size,
fecha = sample(
seq(ymd('2000/01/01'), ymd('2016/11/16'), by="day"),
size, replace = TRUE)
)
dt[, fecha_fin := fecha + sample(1:3, size, replace = TRUE)]
setkey(dt, fecha, fecha_fin)
set(dt, j = 1L, value = NULL)
dt <- dt[,lapply(.SD, as.POSIXct, tz = "CET"),.SDcols=1:2]
}
dt <- create_dt()
# Declare functions -------------------------------------------------------
f_mdz <- function() {
dt_2 <- data.table(
fecha = seq(min(dt$fecha), max(dt$fecha_fin), by = '1 day')
# Function simplified here!!!
)[, fecha_fin := fecha]
# ---------------------------
as.Date(
foverlaps(dt_2, dt)[is.na(fecha) & is.na(fecha_fin),i.fecha])#,
# origin = '1970-01-01')
}
f_lukeA <- function() {
dt2 <- data.table(
fecha = seq(min(dt$fecha), max(dt$fecha_fin), "1 day")
)[,fecha_fin:=fecha+60*60*24-1]
as.Date(
foverlaps(dt2, dt)[is.na(fecha) & is.na(fecha_fin),i.fecha])
}
# Benchmark! --------------------------------------------------------------
microbenchmark(
dt_mdz <- f_mdz(),
dt_lukeA <- f_lukeA(),
times = 100)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# dt_mdz <- f_mdz() 46.96793 55.11631 95.59214 60.33659 191.5536 212.4523 100 a
# dt_lukeA <- f_lukeA() 50.57496 56.42464 105.07356 60.81974 194.0779 211.8037 100 a
identical(dt_mdz, dt_lukeA)
Old answer here:
A point of departure (far from being efficient, e.g. row-wise operations on data.table...) for further investigation could be:
library(data.table)
library(lubridate)
library(magrittr)
dt <- data.table(
ID = c(7510607L, 7510608L, 7557931L),
fecha = ymd(c('2014-02-15', '2014-02-16', '2014-02-11')),
fecha_fin = ymd(c('2014-02-27', '2014-02-27', '2014-02-12'))
)
# ID fecha fecha_fin
# 1: 7510607 2014-02-15 2014-02-27
# 2: 7510608 2014-02-16 2014-02-27
# 3: 7557931 2014-02-11 2014-02-12
# Make the data "long"
long_dt <- dt[, .(days = seq(fecha, fecha_fin, by = '1 day')), by = ID]
# Get the diff with days sequence from min to max date
setdiff(
seq(long_dt[, min(days)], long_dt[, max(days)], by = '1 day'),
long_dt[, sort(unique(days))]
) %>% as.Date(origin = '1970-01-01')
# [1] "2014-02-13" "2014-02-14"
Please note I have changed your data to actually have two days (2014-02-13 and 2014-02-14) without any activity.
A base R solution would be this:
df$fecha <- strptime(df$fecha, "%Y-%m-%d")
df$fecha_fin <- strptime(df$fecha_fin, "%Y-%m-%d")
# expand each event into its sequence of covered days (one entry per row)
dates_list <- lapply(seq_len(nrow(df)), function(x) {
  seq(from = df$fecha[x], to = df$fecha_fin[x], by = "days")
})
interval_events <- unique(do.call("c", dates_list))
interval_complete <- seq(from = min(df$fecha), max(df$fecha_fin), by = "days")
interval_complete[!(interval_complete %in% interval_events)]
#[1] "2014-02-13 CET" "2014-02-14 CET"
Here is a simple one! You just expand the dates and take a union of all the dates.
## Data
dt1=as.Date(c('2014/01/01','2014/01/08','2014/01/05'))
dt2=as.Date(c('2014/01/10','2014/01/14','2014/01/05'))
df=data.frame(id=sample(1:3), dt1=dt1, dt2=dt2)
## Code
date=apply(df, 1, function(x) seq(as.Date(x[2]), as.Date(x[3]), by="day"))
event_dates=as.Date(Reduce(union, date), origin = "1970-01-01")
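To finish the job and get the inactive days, diff the event dates against the full calendar range (a sketch; with this toy data every day is covered, so the result is empty, while smaller intervals would leave gaps):
## Inactive days: full range minus the event dates
all_days=seq(min(event_dates), max(event_dates), by="day")
as.Date(setdiff(all_days, event_dates), origin = "1970-01-01")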
I have a large data file where all the dates have been loaded as characters. I would like to change all the date columns to Date format. Most of the dates have "%y%m%d" format, some have "%Y%m%d" format. There are 25 columns of dates, so changing each one individually is inefficient.
I can do
df$DATE1 <- as.Date(df$DATE1, format ="%y%m%d")
df$DATE2 <- as.Date(df$DATE2, format ="%y%m%d")
etc., but that is very bad coding.
I tried the following code, but it is not working. It assumes all of the dates are in "%y%m%d" format. Using grep("DATE", names(df)) gets all of the date columns:
df[ , grep("DATE", names(df))] <- as.Date(df[ , grep("DATE", names(df))], "%y%m%d")
Try:
cols <- grep("^DATE", names(df))
df[cols] <- lapply(df[cols], as.Date, format = "%y%m%d")
Example:
df <- data.frame(DATE1 = c('910812', '900928'), DATE2 = c('890813', '890910'))
# Apply the above and you get:
# > df
# DATE1 DATE2
# 1 1991-08-12 1989-08-13
# 2 1990-09-28 1989-09-10
# > class(df[, 1])
# [1] "Date"
I am subtracting dates in xts, i.e.
library(xts)
# make data
x <- data.frame(x = 1:4,
BDate = c("1/1/2000 12:00","2/1/2000 12:00","3/1/2000 12:00","4/1/2000 12:00"),
CDate = c("2/1/2000 12:00","3/1/2000 12:00","4/1/2000 12:00","9/1/2000 12:00"),
ADate = c("3/1/2000","4/1/2000","5/1/2000","10/1/2000"),
stringsAsFactors = FALSE)
x$ADate <- as.POSIXct(x$ADate, format = "%d/%m/%Y")
# object we will use
xxts <- xts(x[, 1:3], order.by= x[, 4] )
#### The subtractions
# answer in days
transform(xxts, lag = as.POSIXct(BDate, format = "%d/%m/%Y %H:%M") - index(xxts))
# answer in hours
transform(xxts, lag = as.POSIXct(CDate, format = "%d/%m/%Y %H:%M") - index(xxts))
Question: how can I standardise the result so that I always get the answer in hours? Not by multiplying the days by 24, as I will not know beforehand whether the subtraction will round to days or hours...
Unless I can somehow check whether the result is in days, perhaps using grep and regex, and then multiply within an if clause.
I have tried to work through this and went for the grep/regex approach, but this doesn't even keep the negative sign...
p <- transform(xxts, lag = as.POSIXct(BDate, format = "%d/%m/%Y %H:%M") - index(xxts))
library(stringr)
ind <- grep("days", p$lag)
p$lag[ind] <- as.numeric( str_extract_all(p$lag[ind], "\\(?[0-9,.]+\\)?")) * 24
p$lag
#2000-01-03 2000-01-04 2000-01-05 2000-01-10
# 36 36 36 132
I am convinced there is a more elegant solution...
OK, difftime works:
transform(xxts, lag = difftime(as.POSIXct(BDate, format = "%d/%m/%Y %H:%M"), index(xxts), units = "hours"))
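And if you want plain numerics instead of a difftime column, wrap the difftime in as.numeric() (sketched here on the original data.frame x):
# the time differences as a plain numeric vector of hours
as.numeric(difftime(as.POSIXct(x$BDate, format = "%d/%m/%Y %H:%M"),
                    x$ADate, units = "hours"))
# [1]  -36  -36  -36 -132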