Please help me with this.
I have daily observations (a data frame) covering a 32-year period (around 11,659 rows in total; some rows are missing).
I want to calculate the average of each column for every day of the year across the 32 years (i.e. the mean of every Jan-01, the mean of every Jan-02, and so on).
So the output would have 365 rows, each row being the average of the 32 observations for that calendar day.
Any suggestions? I found a similar case and tried its solution with some modifications, but the output is not correct. In particular, I don't understand the sapply part below.
df <- data.frame(x = c(1:10000), y = c(1:10000))
byapply <- function(x, by, fun, ...)
{
  # Create index list
  if (length(by) == 1)
  {
    nr <- nrow(x)
    split.index <- rep(1:ceiling(nr / by), each = by, length.out = nr)
  } else
  {
    nr <- length(by)
    split.index <- by
  }
  index.list <- split(seq(from = 1, to = nr), split.index)
  # Pass index list to fun using sapply() and return object
  # (this is where I am lost)
  sapply(index.list, function(i)
  {
    do.call(fun, list(x[, i], ...))
  })
}
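To unpack the sapply() part: index.list is a list of integer vectors, one vector per group, and sapply() calls the anonymous function once per group, so do.call(fun, list(x[, i], ...)) is simply fun(x[, i], ...) for that group's indices. Note that x[, i] selects columns, which is likely why the modified version misbehaves here; for day-of-year means you want to select rows and average each column. A minimal sketch of that idea (an illustration, assuming no missing rows, which is exactly why the date-based answers below are safer):
# group row indices by position modulo 365, then take column means per group
groups <- split(seq_len(nrow(df)), (seq_len(nrow(df)) - 1) %% 365 + 1)
res <- t(sapply(groups, function(i) colMeans(df[i, , drop = FALSE])))
head(res)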
Thank you for your time.
How about using the plyr package:
require(plyr) # for aggregating data
series <- data.frame(date = as.Date("1964-01-01") + (1:100000),
                     obs  = runif(100000),
                     obs2 = runif(100000),
                     obs3 = runif(100000))
ddply(series,                        # run on series df
      .(DOY = format(date, "%j")),   # group by day of year (call the column DOY)
      summarise,                     # summarise within each group (day of year)
      daymean  = mean(obs),          # calculate the mean of obs
      daymean2 = mean(obs2),         # calculate the mean of obs2
      daymean3 = mean(obs3)          # calculate the mean of obs3
)
# DOY daymean daymean2 daymean3
#1 001 0.4957763 0.4882559 0.4944281
#2 002 0.5184197 0.4970996 0.4720893
#3 003 0.5192313 0.5185357 0.4878891
#4 004 0.4787227 0.5150596 0.5317068
#5 005 0.4972933 0.5065012 0.4956527
#6 006 0.5112484 0.5276013 0.4785681
#...
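plyr is now retired; a dplyr equivalent over the same series data frame would be along these lines (a sketch, not part of the original answer):
library(dplyr)
series %>%
  group_by(DOY = format(date, "%j")) %>%
  summarise(across(starts_with("obs"), mean, .names = "{.col}_mean"))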
Although there may well be a special function that does exactly what you need, here is a solution using ave:
set.seed(1)
dates = seq(from=as.Date("1970-01-01"), as.Date("2000-01-01"), by="day")
df <- data.frame(val1 = runif(length(dates)),
                 val2 = rchisq(length(dates), 10))
day <- format(dates, "%j") # day of year ("001"-"366")
df <- cbind(df, setNames(as.data.frame(sapply(df, function(x) {
  ave(x, day) # calculate mean by day of year for df$val1 and df$val2
})), paste0(names(df), "_mean")))
head(df[1:365, 3:4], 3)
# val1_mean val2_mean
# 1 0.5317151 10.485001
# 2 0.5555664 10.490968
# 3 0.6428217 10.763027
That is, if I understood your task correctly.
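For completeness, base R's aggregate() produces the per-day summary in one call (a sketch reusing df and day from above):
# one row per day of year, mean of each original column
aggregate(df[c("val1", "val2")], by = list(DOY = day), FUN = mean)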
After grouping by id, I want to count, for each row, the number of rows that fall within a certain time range around it; for instance, say a 1-month window around each datetime entry in the column "cleaned_date".
head(data$cleaned_date)
[1] "2004-10-11 CDT" "2008-09-10 CDT" "2011-10-25 CDT" "2011-12-31 CST"
The dates are in POSIXct format.
For the first entry, I need to count the number of rows between 2004-09-11 and 2004-11-11; for the second entry, the number of rows between 2008-08-10 and 2008-10-10; and so on.
I used roughly the following code:
data %>%
  group_by(id) %>%
  filter(cleaned_date %within% interval(cleaned_date - 24 * 60 * 60 * 30,
                                        cleaned_date + 24 * 60 * 60 * 30)) %>%
  mutate(counts = n())
But it does not seem to work; I get counts as an empty column. Any help would be appreciated, thanks!
A reproducible example can be the following:
The input is
cleaned_date id
1 2008-09-11 A
2 2008-09-10 B
3 2008-09-30 B
4 2011-10-25 A
5 2011-11-14 A
And I want the output to be
cleaned_date id counts
1 2008-09-11 A 1
2 2008-09-10 B 2
3 2008-09-30 B 2
4 2011-10-25 A 2
5 2011-11-14 A 2
For the first entry, I want to count the rows in the window 2008-08-11 to 2008-10-11; row 2 falls in that window, but since we group by "id" and its id differs, it does not count, so counts is 1. For the second entry, the window is 2008-08-10 to 2008-10-10; rows 2 and 3 satisfy it, so counts is 2. For the third entry, the window is 2008-08-30 to 2008-10-30; rows 2 and 3 satisfy it again, and so on.
Note that the actual dataset I would like to operate on has millions of rows, so it might be more efficient to use tidyverse rather than base R.
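In code form, the rule described above is roughly the following (a base-R sketch on the example data, approximating the 1-month window as 30 days; it is quadratic in the number of rows, hence too slow for millions of rows):
input <- data.frame(cleaned_date = as.Date(c("2008-09-11", "2008-09-10", "2008-09-30",
                                             "2011-10-25", "2011-11-14")),
                    id = c("A", "B", "B", "A", "A"))
input$counts <- sapply(seq_len(nrow(input)), function(i) {
  sum(input$id == input$id[i] &
        abs(as.numeric(input$cleaned_date - input$cleaned_date[i])) <= 30)
})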
Perhaps not the most elegant solution.
# input data. Dates as character vector
input <- data.frame(
  cleaned_date = c("2008-09-11", "2008-09-10", "2008-09-30", "2011-10-25", "2011-11-14"),
  id = c("A", "B", "B", "A", "A")
)
# function to create a date window n months around a specified date
window <- function(x, n = 1){
  x <- rep(as.POSIXlt(x), 2)
  x[1]$mon <- x[1]$mon - n  # n months before
  x[2]$mon <- x[2]$mon + n  # n months after
  return(format(seq(from = x[1], to = x[2], by = "day"), format = "%Y-%m-%d"))
}
# find counts for each row
input$counts <- unlist(lapply(1:nrow(input), function(x){
  length(which((input$cleaned_date %in% window(input$cleaned_date[x])) &
                 input$id == input$id[x]))
}))
input
cleaned_date id counts
1 2008-09-11 A 1
2 2008-09-10 B 2
3 2008-09-30 B 2
4 2011-10-25 A 2
5 2011-11-14 A 2
Edit for large datasets:
# dummy dataset with 1,000,000 rows
years <- c(2000:2020)
months <- c(1:12)
days <- c(1:20)
n <- 1000000
dates <- paste(sample(years, size = n, replace = T), sample(months, size = n, replace = T), sample(days, size = n, replace = T), sep = "-")
groups <- sample(c("A","B","C"), size = n, replace = T)
input <- data.frame(
cleaned_date = dates,
id = groups
)
input$cleaned_date <- format(as.POSIXlt(input$cleaned_date), format="%Y-%m-%d")
# optional, sort data by date for small boost in performance
input <- input[order(input$cleaned_date),]
counts <- NULL
#pb <- progress::progress_bar$new(total = length(unique(input$cleaned_date)))
t1 <- Sys.time()
# split up vectorization for each unique date.
for (date in unique(input$cleaned_date)) {
  #pb$tick()
  w <- window(date)
  tmp <- input[which(input$cleaned_date %in% w), ]
  tmp_counts <- unlist(lapply(which(tmp$cleaned_date == date), function(x){
    length(which(tmp$id == tmp$id[x]))
  }))
  counts <- c(counts, tmp_counts)
}
# add counts to dataset
input$counts <- counts
# optional, re-order data to original format
input <- input[order(as.numeric(rownames(input))),]
print(Sys.time() - t1)
Time difference of 3.247204 mins
If you want to go faster, you can run the loop in parallel:
library(foreach)
library(doParallel)
cores=detectCores()
cl <- makeCluster(cores[1]-1)
registerDoParallel(cl)
dates = unique(input$cleaned_date)
t1 <- Sys.time()
counts <- foreach(i = 1:length(dates), .combine = "c") %dopar% {
  w <- window(dates[i])
  tmp <- input[which(input$cleaned_date %in% w), ]
  tmp_counts <- unlist(lapply(which(tmp$cleaned_date == dates[i]), function(x){
    length(which(tmp$id == tmp$id[x]))
  }))
  tmp_counts
}
stopCluster(cl)
input$counts <- counts
input <- input[order(as.numeric(rownames(input))),]
print(Sys.time() - t1)
Time difference of 37.37211 secs
Note, I'm running this on a MacBook Pro with a 2.3 GHz Quad-Core Intel Core i7 and 16 GB of RAM.
It is still hard to determine exactly what you're trying to accomplish, but this will at least get you counts for a specified date range:
df %>%
group_by(id) %>%
filter(cleaned_date >= "2008-08-11" & cleaned_date <= "2008-10-11") %>%
mutate(counts = n())
Will give us:
cleaned_date id counts
<date> <chr> <int>
1 2008-09-11 A 1
2 2008-09-10 B 2
3 2008-09-30 B 2
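If the goal is the rolling 1-month window around each row rather than one fixed range, a per-group variant along these lines should work (a sketch using lubridate's month arithmetic, not part of the original answer):
library(dplyr)
library(lubridate)
df %>%
  group_by(id) %>%
  mutate(counts = sapply(cleaned_date, function(d)
    sum(cleaned_date >= d %m-% months(1) &
          cleaned_date <= d %m+% months(1)))) %>%
  ungroup()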
(This question was closed as a duplicate of "Split date into different columns for year, month and day", which already has answers.)
I have a dataset which looks like:
mother_id,dateOfBirth
1,1962-09-24
2,1991-02-19
3,1978-11-11
I need to extract the constituent elements (day, month, year) from the date of birth and put them in corresponding columns, so it looks like:
mother_id,dateOfBirth,dayOfBirth,monthOfBirth,yearOfBirth
1,1962-09-24,24,09,1962
2,1991-02-19,19,02,1991
3,1978-11-11,11,11,1978
Currently, I have it coded as a loop:
data <- read.csv("/home/tumaini/Desktop/IHI-Projects/Data-Linkage/matching file dss nacp.csv",stringsAsFactors = F)
dss_individuals <- read.csv("/home/tumaini/Desktop/IHI-Projects/Data-Linkage/Data/dssIndividuals.csv", stringsAsFactors = F)
lookup <- data[,c("patientid","extId")]
# remove duplicates
lookup <- lookup[!(duplicated(lookup$patientid)),]
dss_individuals$dateOfBirth <- as.character(dss_individuals$dob)
dss_individuals$dayOfBirth <- 0
dss_individuals$monthOfBirth <- 0
dss_individuals$yearOfBirth <- 0
# Loop starts here
for (i in 1:nrow(dss_individuals)) {
  split_list <- unlist(strsplit(dss_individuals[i, ]$dateOfBirth, '[- ]'))
  dss_individuals[i, ]["dayOfBirth"]   <- split_list[3]
  dss_individuals[i, ]["monthOfBirth"] <- split_list[2]
  dss_individuals[i, ]["yearOfBirth"]  <- split_list[1]
}
This seems to work, but is horrendously slow as I have 400 000 rows. Is there a way I can get this done more efficiently?
I compared the speed of substr, format, and lubridate. It seems that lubridate and format are much faster than substr if the variable is stored as a date; however, substr is fastest if the variable is stored as a character vector. The results of a single run are shown.
x <- sample(seq(as.Date('1000/01/01'), as.Date('2000/01/01'), by = "day"),
            400000, replace = TRUE)
system.time({
  y <- substr(x, 1, 4)
  m <- substr(x, 6, 7)
  d <- substr(x, 9, 10)
})
# user system elapsed
# 3.775 0.004 3.779
system.time({
  y <- format(x, "%Y")
  m <- format(x, "%m")
  d <- format(x, "%d")
})
# user system elapsed
# 1.118 0.000 1.118
library(lubridate)  # for year(), month() and day()
system.time({
  y <- year(x)
  m <- month(x)
  d <- day(x)
})
# user system elapsed
# 0.951 0.000 0.951
x1 <- as.character(x)
system.time({
  y <- substr(x1, 1, 4)
  m <- substr(x1, 6, 7)
  d <- substr(x1, 9, 10)
})
# user system elapsed
# 0.082 0.000 0.082
Not sure if this will solve your speed issues, but here is a nicer way of doing it using dplyr and lubridate. In general, when it comes to manipulating data frames I personally recommend using either data.table or dplyr. data.table is supposed to be faster, but dplyr is more verbose, which I personally prefer as I find it easier to pick up my code after not having read it for months.
library(dplyr)
library(lubridate)
dat <- data.frame(mother_id = c(1, 2, 3),
                  dateOfBirth = ymd(c("1962-09-24", "1991-02-19", "1978-11-11")))
dat %>% mutate(year  = year(dateOfBirth),
               month = month(dateOfBirth),
               day   = day(dateOfBirth))
Or you can use the mutate_each function to save having to write the variable name multiple times (though you get less control over the names of the output variables):
dat %>% mutate_each( funs(year , month , day) , dateOfBirth)
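Note that mutate_each has since been deprecated; on current dplyr the equivalent uses across (a sketch, producing columns named dateOfBirth_year and so on):
dat %>% mutate(across(dateOfBirth, list(year = year, month = month, day = day)))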
Here are some solutions. These solutions each (i) use 1 or 2 lines of code and (ii) return numeric year, month and day columns. In addition, the first two solutions use no packages -- the third uses chron's month.day.year function.
1) POSIXlt Convert to "POSIXlt" class and pick off the parts.
lt <- as.POSIXlt(DF$dateOfBirth, origin = "1970-01-01")
transform(DF, year = lt$year + 1900, month = lt$mon + 1, day = lt$mday)
giving:
mother_id dateOfBirth year month day
1 1 1962-09-24 1962 9 24
2 2 1991-02-19 1991 2 19
3 3 1978-11-11 1978 11 11
2) read.table
cbind(DF, read.table(text = format(DF$dateOfBirth), sep = "-",
col.names = c("year", "month", "day")))
giving:
mother_id dateOfBirth year month day
1 1 1962-09-24 1962 9 24
2 2 1991-02-19 1991 2 19
3 3 1978-11-11 1978 11 11
3) chron::month.day.year
library(chron)
cbind(DF, month.day.year(DF$dateOfBirth))
giving:
mother_id dateOfBirth month day year
1 1 1962-09-24 9 24 1962
2 2 1991-02-19 2 19 1991
3 3 1978-11-11 11 11 1978
Note 1: Often, adding year, month and day columns to the data is not really necessary; they can be generated on the fly when needed using format, substr or as.POSIXlt, so you might critically examine whether you actually need to do this.
Note 2: The input data frame, DF in reproducible form, was assumed to be:
Lines <- "mother_id,dateOfBirth
1,1962-09-24
2,1991-02-19
3,1978-11-11"
DF <- read.csv(text = Lines)
Use format once for each part:
dss_individuals$dateOfBirth  <- as.Date(dss_individuals$dateOfBirth) # ensure Date class first
dss_individuals$dayOfBirth   <- format(dss_individuals$dateOfBirth, "%d")
dss_individuals$monthOfBirth <- format(dss_individuals$dateOfBirth, "%m")
dss_individuals$yearOfBirth  <- format(dss_individuals$dateOfBirth, "%Y")
Check the substr function from base R (or functions from the nice stringr package) for extracting parts of a string. This approach assumes that day, month and year always sit at the same positions and have the same width.
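For example (a sketch, assuming dateOfBirth is stored as "YYYY-MM-DD" character strings):
dss_individuals$yearOfBirth  <- substr(dss_individuals$dateOfBirth, 1, 4)
dss_individuals$monthOfBirth <- substr(dss_individuals$dateOfBirth, 6, 7)
dss_individuals$dayOfBirth   <- substr(dss_individuals$dateOfBirth, 9, 10)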
The strsplit function is vectorized, so using rbind.data.frame to convert the resulting list to a data frame works:
do.call(rbind.data.frame, strsplit(df$dateOfBirth, split = '-'))
Depending on how you combine the split results (e.g. via sapply), you may need to transpose them with t() before use; with do.call(rbind.data.frame, ...) as above, each date already becomes one row.
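A fuller sketch (assuming dateOfBirth is a character column):
parts <- do.call(rbind.data.frame, strsplit(df$dateOfBirth, split = "-"))
names(parts) <- c("yearOfBirth", "monthOfBirth", "dayOfBirth")
df <- cbind(df, parts)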
Suppose I have a series of observations representing date intervals, e.g.
library(dplyr)
library(magrittr)
df <- data_frame(start = as.Date(c('2000-01-01', '2000-01-03', '2000-01-08',
                                   '2000-01-20', '2000-01-22')),
                 end   = as.Date(c('2000-01-02', '2000-01-05', '2000-01-10',
                                   '2000-01-21', '2000-02-10')))
I would like to group these observations such that the start time of observation n occurs within some specified interval following the end date of observation n-1. For instance, if we set that interval to be 5 days, we would see something like:
# start end group
# (date) (date) (dbl)
# 1 2000-01-01 2000-01-02 1
# 2 2000-01-03 2000-01-05 1
# 3 2000-01-08 2000-01-10 1
# 4 2000-01-20 2000-01-21 2
# 5 2000-01-22 2000-02-10 2
(For the sake of simplicity, I'm assuming no overlap in dates, although this isn't necessarily the case in the data). I thought about using igraph to create a weighted edgelist, but that seemed overly complicated. Efficiency is, I believe, important: I'll be running this on roughly 4 million groups of data of about 5-10 rows each.
While my solution below does work, it seems error-prone, slow, and clunky to me. I suspect a package or some vectorization would really improve matters.
group_dates <- function(df, interval){
  # assign first date to first group
  df %<>% arrange(start, end)
  df[1, 'group'] <- 1
  # for each start date, determine if it is within `interval` days of the
  # closest earlier end date
  lapply(df$start[-1], function(cur_start){
    earlier_data <- df[df$end <= cur_start, ]
    diffs <- cur_start - earlier_data$end
    min_interval <- diffs[which.min(diffs)]
    closest_group <- earlier_data$group[which.min(diffs)]
    if (min_interval <= interval){
      df[df$start == cur_start, 'group'] <<- closest_group
    } else {
      df[df$start == cur_start, 'group'] <<- closest_group + 1
    }
  })
  return(df)
}
You can do that relatively easily with dplyr.
The idea is the following:
1) Lag the end date (shift it down by one row).
2) Calculate the difference between each start date and the lagged end date.
3) Add breakpoints: a variable that is TRUE when the difference is more than 5 days and FALSE otherwise.
4) Calculate the cumulative sum of the breakpoints; it increments every time a new breakpoint is found, i.e. every time a new interval should start.
Something like this should work for you:
df %>%
  mutate(lagged_end = lag(end),
         diff = start - lagged_end,
         new_interval = diff > 5,
         new_interval = ifelse(is.na(new_interval), FALSE, new_interval),
         interval_number = cumsum(new_interval) + 1)  # +1 so numbering starts at 1
This should also be quite quick since it's all in dplyr.
This isn't as elegant as Lorenzo Rossi's solution, but offers a slightly different approach using cut.Date and 2 lines of code:
breakpoints <- c(FALSE, sapply(2:nrow(df), function(x) df$start[x] - df$end[x - 1]) > 5)
clusterLabels <- as.numeric(cut.Date(df$start, c(min(df$start), df$start[breakpoints], max(df$start) + 1)))
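Attaching the labels then reproduces the grouping shown in the question:
df$group <- clusterLabels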
I want to calculate fiscal year returns and standard deviations from daily returns for a large number of firms. I am relatively new to R, having previously used SAS to calculate returns etc. However, I'd like to switch to R in the short/medium-term.
I have two files: 1) one containing a firm identifier, dates and daily returns (df.1), and 2) my sample (df.2), over which I'd like to aggregate the returns:
firm date ret
1 01/01/1992 0.024
1 02/01/1992 0.010
. . .
. . .
1 31/12/2014 0.002
2 01/01/1992 0.004
2 02/01/1992 0.012
The first file is very large, about 1M rows.
The second file looks like this:
firm fiscal_year_start fiscal_year_end
1 01/01/1992 31/12/1992
1 01/01/1993 31/12/1993
1 01/01/1994 31/12/1994
I want to calculate fiscal-year returns and the annualised standard deviation. Both .csv files are loaded into R as data frames. I am unsure how best to treat the date variables and how to structure a loop over the daily return file.
Any help would be much appreciated.
EDIT1
I am able to subset the big data frame using this function:
myfunc <- function(x, y, z){ df.1[df.1$date1 >= x & df.1$date1 < y & df.1$firm == z, ] }
firm1 <- df.2$firm[1]
start_date <- df.2$StartDate[1]
end_date <- df.2$EndDate[1]
Test <- myfunc(start_date,end_date, firm1)
For this subset I can then get the fiscal-year return and std:
# return
fiscal_year_ret <- sum(Test$ret)
# annualized variance (use nrow(), not length(): length() counts columns)
daily_var <- var(Test$ret)
annualized_var <- daily_var * nrow(Test)
annualized_st.dev <- sqrt(annualized_var)
My big problem is embedding this in a loop that iterates over the firm identifiers and dates in df.2.
EDIT2
So I have something like this
df.output <- data.frame(returns = as.numeric(),
                        std.deviation = as.numeric(),
                        stringsAsFactors = FALSE)
I would like to populate the above data frame with the results.
myfunc <- function(x, y, z){ df.1[df.1$date1 >= x & df.1$date1 < y & df.1$firm == z, ] }
for (i in 1:nrow(df.2)) {
  firm1 <- df.2$firm[i]
  start_date <- df.2$StartDate[i]
  end_date <- df.2$EndDate[i]
  sub_df <- myfunc(start_date, end_date, firm1)
  # return: write into row i rather than overwriting the whole column
  df.output[i, 'returns'] <- sum(sub_df$ret)
  # variance (nrow() counts the daily observations)
  annualized_var <- var(sub_df$ret) * nrow(sub_df)
  df.output[i, 'std.deviation'] <- sqrt(annualized_var)
}
Something like that.
Here is one way:
library(dplyr)
library(lubridate)
data %>%
  mutate(year =
           date %>%
           dmy %>%                      # dates are day/month/year
           floor_date(unit = "year")) %>%
  group_by(year) %>%
  summarize(
    mean_return = mean(ret),
    sd_return = sd(ret))
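This groups by calendar year. If the fiscal years in df.2 do not always align with calendar years, one option is to join the two files by firm and keep the dates that fall inside each fiscal year (a hedged sketch using the column names from the question; the annualisation follows your daily-variance-times-number-of-days convention):
library(dplyr)
library(lubridate)
df.1 %>%
  mutate(date = dmy(date)) %>%
  inner_join(df.2 %>% mutate(fy_start = dmy(fiscal_year_start),
                             fy_end   = dmy(fiscal_year_end)),
             by = "firm") %>%
  filter(date >= fy_start, date <= fy_end) %>%
  group_by(firm, fy_start, fy_end) %>%
  summarise(fiscal_year_ret   = sum(ret),
            annualised_st.dev = sd(ret) * sqrt(n()),
            .groups = "drop")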
I'm hoping to take a dataset with cross-sectional salary data for employees and create a large uninterrupted time series, imputing values along the way. Suppose I have:
name <- c("carl","carl","bob","rick","rick","rick","rick")
sex <- c(rep("M",7))
salary <- c(18000, 14000, 34000, 11000, 23000, 23000, 25000)
date <- as.Date(c("2007-04-30","2007-07-30","2009-12-09","2006-01-01",
"2008-01-01","2009-12-09", "2010-01-01"))
salaries <- data.frame(name,sex,salary,date)
salaries
name sex salary date
carl M 18000 2007-04-30
carl M 14000 2007-07-30
bob M 34000 2009-12-09
rick M 11000 2006-01-01
rick M 23000 2008-01-01
rick M 23000 2009-12-09
rick M 25000 2010-01-01
As we can see, poor carl got his salary cut by 4k in July. Prior to that he had been earning 18k, and had been for the 3 months before the cut, but my data doesn't reflect this. I'd like to make a nice picture showing this trend, but first I need to change the data to look like this (where * denotes imputed values):
head(salaries)
name sex salary date change
carl M 18000 2007-04-30 0
carl M 18000 2007-05-30* 0
carl M 18000 2007-06-30* 0
carl M 14000 2007-07-30 1
bob M 34000 2009-12-09 0
rick M 11000 2006-01-01 0
rick M 11000 2006-02-01* 0
... .. ....... ...... ....
rick M 11000 2007-12-01* 0
rick M 23000 2008-01-01 1
rick M 23000 2008-02-01* 1
.... ...... ...... ........
rick M 23000 2009-12-09 1
rick M 25000 2010-01-01 2
So I'd like to impute the in-between values and also mark when a change occurs. A guy like bob, who never had a salary change, just stays at 0. But rick, who has had multiple salary changes, gets marked each time, so we know when each change occurred and which number it is. I'm only interested in the month as the unit of analysis, but it would be useful to know how to impute daily values as well.
If you have a single time series, you can use na.locf to replace missing values with the last available value, or approx if you only want to interpolate between values.
To create those individual time series, you can convert the data between your "tall" (normalized) format and a "wide" format with dcast and melt.
To count the number of changes, you can use ddply and cumsum.
library(reshape2)
library(plyr)
library(zoo)
# Convert to wide format
d <- dcast( salaries, date ~ name, value.var = "salary" )
# Add all the dates you want
dates <- seq.Date( from = min(d$date), max(d$date), by="month" )
d <- merge( d, data.frame(date=dates), all=TRUE )
# Fill in the missing values
# If you want the last non-missing value:
#d <- as.data.frame(lapply(d, na.locf, na.rm=FALSE))
# If you only want to interpolate between values:
d <- as.data.frame(lapply(d,
function(x) approx( seq_along(x), x, seq_along(x), method="constant" )$y
))
# Convert back to the tall format
d <- melt(d, id.vars="date", value.name="salary", variable.name="name", na.rm=TRUE)
# Add the number of changes
d <- ddply(
d, "name", transform,
change = cumsum(c(0, diff(salary) != 0))
)
Elaborating on #Vincent's advice:
name <- c("carl","carl","bob","rick","rick","rick","rick")
sex <- c(rep("M",7))
salary <- c(18000, 14000, 34000, 11000, 23000, 23000, 25000)
office <- c('melbourne','sydney','adelaide','perth','perth','melbourne','melbourne')
date <- as.Date(c("2007-04-30","2007-07-30","2009-12-09","2006-01-01",
"2008-01-01","2009-12-09", "2010-01-01"))
salaries <- data.frame(name,sex,salary,date, office)
salaries
library(reshape2)
library(plyr)
library(zoo)
Dealing with a numeric vector using approx:
# Convert to wide format
d <- dcast( salaries, date ~ name, value.var = "salary" )
# Add all the dates you want
dates <- seq.Date( from = min(d$date), max(d$date), by="month" )
d <- merge( d, data.frame(date=dates), all=TRUE )
# Fill in the missing values
# If you want the last non-missing value:
#d <- as.data.frame(lapply(d, na.locf, na.rm=FALSE, fromLast = T))
#If you only want to interpolate between values:
d <- as.data.frame(lapply(d,
function(x) approx( seq_along(x), x, seq_along(x), method="constant" )$y
))
# Convert back to the tall format
d <- melt(d, id.vars="date", value.name="salary", variable.name="name", na.rm=TRUE)
# Add the number of changes
d <- ddply(
d, "name", transform,
change = cumsum(c(0, diff(salary) != 0))
)
Converting a character vector with na.locf:
# Convert to wide format
a <- dcast( salaries, date ~ name, value.var = "office" )
# Add all the dates you want
dates <- seq.Date( from = min(a$date), max(a$date), by="month" )
a <- merge( a, data.frame(date=dates), all=TRUE )
# Fill in the missing values using na.locf
a <- as.data.frame(lapply(a, na.locf, na.rm=FALSE, fromLast = T))
# Convert back to the tall format
a <- melt(a, id.vars="date", value.name="office", variable.name="name", na.rm=TRUE)
Merging the results:
d$date <- as.Date(d$date)
out = merge(a,d, by = c('name','date'))