I am using the NAPM ISM data set from the FRED database. The data is monthly. I would like to create another data frame with daily frequency, where the value on each business day is the last monthly data release. So if the last release was 49.5 on 02/01/16, then every day in February has a value of 49.5.
Code Sample
start_date <- as.Date("1970-01-01")
end_date <- Sys.Date()
US_PMI <- getSymbols("NAPM", auto.assign = FALSE, src ="FRED", from = start_date, to = end_date)
test <- data.frame(date=index(US_PMI), coredata(US_PMI))
I do not know which packages and data are needed to reproduce your example exactly, but you can use a sequence of daily dates, the merge function, and the NA filler (na.locf) in the zoo package to create the daily data frame:
library(zoo)
# Date range
sd = as.Date("1970-01-01")
ed = Sys.Date()
# Create daily and monthly data frame
daily.df = data.frame(date = seq(sd, ed, "days"))
monthly.df = data.frame(date = seq(sd, ed, "months"))
# Add some variable to the monthly data frame
monthly.df$v = rnorm(nrow(monthly.df))
# Merge
df = merge(daily.df, monthly.df, by = "date", all = TRUE)
# Fill up NA's
df = transform(df, v = na.locf(v))
This might not be the fastest way to obtain the data frame but it should work.
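Applied to the NAPM series from the question, a minimal sketch could look like the following, assuming the test data frame built above has columns named date and NAPM (as getSymbols returns them):
library(zoo)
# Daily calendar spanning the monthly releases
daily <- data.frame(date = seq(min(test$date), Sys.Date(), by = "days"))
us_pmi_daily <- merge(daily, test, by = "date", all.x = TRUE)
# Carry the last monthly release forward over the following days
us_pmi_daily$NAPM <- na.locf(us_pmi_daily$NAPM, na.rm = FALSE)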
I have a panel data set of 34 banks from 2007 to 2020. I need to calculate the yearly log return from the daily adjusted closing stock prices of these banks. The data structure is as follows:
The date after importing to R is in POSIXct format while prices are in number format. There are in total 121323 observations.
Can anyone help me with the code to calculate annual returns from daily prices? One of the problems is accounting for the panel data structure: the return calculation for Bank A has to end on its last date, then the process has to repeat for Bank B and end on its last date, and so on. In the data structure above, the return calculation has to end on 07-01-2007 for Bank A. I have tried using quantmod but was unsuccessful. Any help is much appreciated.
Thanks
Here is a solution that works if the data is in the posted format.
library(dplyr)
df1 %>%
  mutate(year = format(date, "%Y")) %>%
  group_by(bank, year) %>%
  summarize(LogReturns = log(last(price)) - log(first(price)))
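Because daily log returns telescope within a year, the same figures can also be obtained by summing the daily log returns (an optional, equivalent variant of the pipeline above, assuming each bank's rows are sorted by date):
df1 %>%
  mutate(year = format(date, "%Y")) %>%
  group_by(bank, year) %>%
  summarize(LogReturns = sum(diff(log(price))))  # sum of daily log returns = log(last) - log(first)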
Test data
library(quantmod)
from <- as.Date("2010-01-01")
to <- Sys.Date()
getSymbols("GOOGL", from = from, to = to)
getSymbols("MSFT", from = from, to = to)
df1 <- data.frame(date = index(GOOGL), price = GOOGL[,4])
df2 <- data.frame(date = index(MSFT), price = MSFT[,4])
names(df1)[2] <- "price"
names(df2)[2] <- "price"
df1$bank <- "GOOGL"
df2$bank <- "MSFT"
df1 <- rbind(df1, df2)
rm(GOOGL, MSFT, df2)
I'm a novice to R and I've written very clunky functions that I apply to geochemical datasets to interpolate data to nearby dates, convert decimal dates, reshape and average the geochemical data by year/month, and spit it all out at the end as a new data frame. However, each function only handles one column at a time, there are anywhere between 2 and 10 columns of data per dataset, and I have over 50 datasets. This requires a lot of copying and pasting, and I know there should be a better way to do it, but I've tried and failed to get anywhere for months.
I have tried reading up on this but haven't been able to implement any loops I've seen suggested elsewhere.
Here is an example of my datasets:
Year SrCa MgCa BaCa
1958.00 8.98 4.29 4.77
1958.08 9.00 4.21 4.56
1958.17 9.02 4.16 4.39
...
Here are the functions I have written:
# Packages used by these functions
library(lubridate)  # decimal_date, date_decimal, year, month, day, ymd
library(reshape2)   # melt, dcast, colsplit
library(tidyr)      # gather

# Interpolates monthly or bimonthly data to dates for the 15th of every month
yrmonth_INTERP <- function(dataset, agecolumn, variable1, var1name){
  X_in <- dataset[[agecolumn]]   # select X (Age) column
  y_in <- dataset[[variable1]]   # select y (data) column
  x_out <- seq.Date(as.Date("1920/01/15"), as.Date("2017/12/15"), by = "months")  # create reference dates
  x_out <- decimal_date(x_out)   # reference dates to decimal dates
  xy_int <- approx(x = X_in, y = y_in, xout = x_out)  # interpolate data
  xy_int <- signif(as.data.frame(xy_int, row.names = NULL), digits = 12)
  xy_int <- na.omit(xy_int)
  Age <- date_decimal(xy_int[[1]])  # convert decimal to date
  Year <- year(Age)
  Month <- month(Age)
  Day <- day(Age)
  var1 <- xy_int[[2]]  # pull out variable
  newdata <- cbind.data.frame(Year, Month, Day, var1)  # create dataframe
  date1 <- paste(newdata$Year, newdata$Month, newdata$Day, sep = "-")  # combine the separate time variables into a date
  date1 <- ymd(date1)  # convert to date
  data_months <- cbind(date1, newdata)  # add date column to previous dataframe
  colnames(data_months) <- c('Age', 'Year', 'Months', 'Day', var1name)  # name columns
  return(data_months)
}
# Turns lots of data points into the average for every month
yrmonth_avg <- function(dataset, agecolumn, variable1, var1name, varsum){
  Age <- date_decimal(dataset[[agecolumn]])  # convert decimal to date
  Year <- year(Age)
  Month <- month(Age)
  Day <- day(Age)
  var1 <- dataset[[variable1]]  # pull out data variable
  newdata <- cbind.data.frame(Year, Month, Day, var1)  # create dataframe of time and data
  datamelt <- melt(newdata, id = c('Year', 'Month', 'Day'))
  datacast <- dcast(datamelt, variable ~ Year + Month, mean)   # wide cast/reshape to get the mean by year and month
  datacast2 <- dcast(datamelt, variable ~ Year + Month, sum)   # wide cast/reshape to get the sum by year and month
  Var1Data <- datacast[-1]   # remove first column
  Var1sum <- datacast2[-1]
  re_data <- gather(Var1Data, key = 'Age', value = var1name)  # reshape mean data to columns
  re_data1 <- gsub("_", "-", re_data$Age)  # pull out info to make date
  re_data2 <- ymd(re_data1, truncated = 1)  # create date
  day(re_data2) <- 15
  newColNames <- c("Year", "Month")
  newCols <- colsplit(re_data1, "-", newColNames)  # keep separated time period columns
  re_sum <- gather(Var1sum, key = 'Age', value = 'Sum')  # return sum data to columns
  data_months <- cbind(re_data2, re_data[[2]], re_sum[[2]], newCols)  # create dataframe
  data_months[[4]] <- as.numeric(data_months[[4]])
  data_months[[5]] <- as.numeric(data_months[[5]])
  colnames(data_months) <- c('Age', var1name, varsum, 'Year', 'Months')
  return(data_months)
}
And what I get out at the end is:
Age SrCa Year Months
1958-01-15 8.989589 1958 1
1958-02-15 9.009619 1958 2
1958-03-15 9.035000 1958 3
...
Can I put a loop of some kind in there to apply the function to all columns in the dataframe, so I don't have to run the function 2-10 times to average all of the geochemical data?
Do I need to break up the different actions of the function to make this possible?
Can I apply this function across a list of the other dataframes?
EDIT: I realised I had extraneous data and two separate functions I should have mentioned; they essentially do the same thing for datasets of different resolution.
If you bring all your data frames together in a list, this should do it:
apply_all <- function(list_of_dfs){                                # apply to all data frames
  return(lapply(list_of_dfs, function(df) apply(df, 2, YRMONTH)))  # apply YRMONTH (your per-column function) to every column of a data frame
}
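Note that apply() hands the inner function a bare column, while the posted yrmonth_avg() expects the whole data frame plus column names. A hedged sketch that loops over the column names instead (run_all_columns is a hypothetical helper; it assumes the first column of each data frame holds the decimal ages, as in the example data):
run_all_columns <- function(df) {
  vars <- names(df)[-1]  # every geochemical column after the age column
  lapply(vars, function(v) {
    yrmonth_avg(df, names(df)[1], v, v, paste0(v, "_sum"))  # the "_sum" name is illustrative
  })
}
# ...and across a list of data frames:
results <- lapply(list_of_dfs, run_all_columns)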
I've got a GPS dataset with about 5600 rows of coordinates from 5 GPS devices ('nodes') over several days and I want to reduce the number of GPS points to just one point per hour. Because the number of points per hour fluctuates, a simple for-loop is not possible.
A simplified structure of the table would be this:
ID node easting northing year month day hour minute time
The column 'time' is class "POSIXlt" "POSIXt".
Trying my first approach, a multiply nested for-loop, I learned about the Second Circle of the R Inferno.
Does anyone have an idea how to reduce multiple rows (per hour) to one row (per hour), separately for each device, in R?
Assuming that the year, month, day, and hour columns carry the same information as the time column, the solution could be as follows:
# Generate data
md <- data.frame(
  node = rep(1:5, each = 2)
  , easting = sample(1:10, size = 20, replace = TRUE)
  , northing = sample(1:10, size = 20, replace = TRUE)
  , year = 2017
  , month = "June"
  , day = 6
  , hour = rep(1:2, each = 2, times = 5)
  , minute = NA
  , time = NA
)
# Solution
library(dplyr)
md %>%
  group_by(node, year, month, day, hour) %>%
  summarize(
    easting = mean(easting),
    northing = mean(northing)
  )
You can create a new column "Unix_hour": the UNIX timestamp divided by 3600 and rounded down.
So, you will have a unique id for each hour.
To do this, use as.numeric to convert a POSIXct date into a Unix timestamp (in seconds), then divide by 3600 and take the floor:
floor(as.numeric(POSIXct_variable) / 3600)
This returns the number of whole hours since the Unix epoch, one id per hour.
Then, you will just group by on this new column "Unix_hour":
aggregate(. ~ Unix_hour, df, mean)
(Change the aggregate function "mean" if you want to aggregate other variables in another way.)
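Put together, a minimal sketch of this approach, assuming the data frame is called df, its POSIXlt/POSIXct column is time, and you want one averaged point per node per hour as in the question:
# Hourly id: whole hours since the Unix epoch
df$Unix_hour <- floor(as.numeric(as.POSIXct(df$time)) / 3600)
# One averaged position per node and hour
hourly <- aggregate(cbind(easting, northing) ~ node + Unix_hour, data = df, FUN = mean)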
You could convert your multiple date/time columns into one, e.g.:
DateTimeUTCmin5 <- ISOdate(year = tmp$Year,
                           month = tmp$Month,
                           day = tmp$Day,
                           hour = tmp$Hour,
                           min = tmp$Min,
                           sec = tmp$Sec,
                           tz = "America/New_York")
Then add an hour floor using floor_date from lubridate:
df$HourFloor = floor_date(df$DateTimeUTCmin5, unit = "hour")
Then decide how you want to summarise the data within each hour: mean, first, max?
Hourstats <- df %>% group_by(HourFloor) %>%
  summarise(meanEast = mean(easting, na.rm = TRUE),
            firstNorth = first(northing)) %>%
  ungroup()
I'm working with some bird GPS tracking data, and I would like to exclude points based on the time stamp.
Some background information- the GPS loggers track each bird for just over 24 hours, starting in the evening, and continuing through the night and the following day. What I would like to do is exclude points taken after 9:30pm on the day AFTER deployment (so removing points from the very end of the track).
As an R novice, I'm struggling because the deployment dates differ for each bird, so I can't simply use subset() for a specific date and time.
An example of my dataframe (df):
BirdID x y Datetime
15K12 492719.9 5634805 2015-06-23 18:25:00
15K12 492491.5 5635018 2015-06-23 18:27:00
15K70 455979.1 5653581 2015-06-24 19:54:00
15K70 456040.9 5653668 2015-06-24 19:59:00
So, pretending these points represent the start of the GPS track for each animal, I would like to remove points after 9:30 pm on June 24 for bird 15K12, and after 9:30 on June 25 for bird 15K70.
Any ideas?
First, check if df$Datetime is a date variable:
class(df$Datetime)
If it's not, you can convert it with ymd_hms from lubridate:
df$Datetime <- ymd_hms(df$Datetime)
Use mutate to create a new variable, newdate, that takes the earliest date in each bird's data and sets the cutoff to 21:30:00 on the following day.
Then filter the Datetime column against the newdate column to keep only the observations recorded earlier than that cutoff.
library(dplyr); library(lubridate)
df %>%
  group_by(BirdID) %>%
  mutate(newdate = as.POSIXct(date(min(Datetime)) + days(1) + hours(21) + minutes(30))) %>%
  filter(Datetime < newdate)
Here is a reproducible example:
library(dplyr); library(lubridate)
set.seed(1)
# Create a data frame (1000 observations)
BirdID <- paste(rep(floor(runif(250, 1, 20)), 4),
                rep("k", 1000), rep(floor(runif(250, 1, 40)), 4), sep = "")
x <- rnorm(1000, mean = 47000, sd = 2000)
y <- rnorm(1000, mean = 5650000, sd = 300000)
Datetime <- as.POSIXct(rnorm(1000, mean = as.numeric(as.POSIXct("2015-06-23 18:25:00")), sd = 99999), tz = "GMT", origin = "1970-01-01")
df <- data.frame(BirdID, x, y, Datetime, stringsAsFactors = FALSE)
# Filter the data frame by the specified date
df_filtered <- df %>%
  group_by(BirdID) %>%
  mutate(newdate = as.POSIXct(date(min(Datetime)) + days(1) + hours(21) + minutes(30))) %>%
  filter(Datetime < newdate)
This should fix any problem.
I have a data frame which consists of a date column and the temperature of 34 different systems, each system in a different column. I need to calculate every system's average hourly temperature. I use this code to calculate the average for 1 system, but to calculate the average for the other 33 systems I have to repeat the code again and again. Is there a better way to find the hourly average in all columns at once?
dat$ut_ms <- dat$ut_ms/1000
dat[ ,1]<- as.POSIXct(dat[,1], origin="1970-01-01")
dat$ut_ms <- strptime(dat$ut_ms, "%Y-%m-%d %H:%M")
dat$ut_ms <- cut(dat$ut_ms, breaks = 'hour')
meanNPWD2401<- aggregate(NPWD2401 ~ ut_ms, dat, mean)
I added a picture of the data for a better understanding of what I want.
You can split your data per hour and iterate:
list1 <- split(dat, cut(strptime(dat$ut_ms, format = '%Y-%m-%d %H:%M'), 'hour'))
lapply(list1, function(x) colMeans(x[, -1]))  # drop the time column before averaging
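If you want one table rather than a list, the per-hour means can be stacked (a small follow-up sketch using the list1 created above, assuming all remaining columns are numeric):
# One row per hour, systems as columns
hourly_means <- do.call(rbind, lapply(list1, function(x) colMeans(x[, -1])))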
When you rearrange the data into a long format, things get much easier:
n.system <- 34
n.time <- 100
temp <- rnorm(n.time * n.system)
temp <- matrix(temp, ncol = n.system)
seconds <- runif(n.time, max = 3 * 3600)
time <- as.POSIXct(seconds, origin = "1970-01-01")
dataset <- data.frame(time, temp)
library(dplyr)
library(tidyr)
dataset %>%
  gather(key = "system", value = "temperature", -time) %>%
  mutate(hour = cut(time, "hour")) %>%
  group_by(system, hour) %>%
  summarise(average = mean(temperature))