Related
i had data frame which contain many row. Here it is:
library(lubridate)
date_ <- date(seq(ymd_h("2020-01-01-00"), ymd_h("2021-03-31-23"), by = "hours"))
hour_ <- hour(seq(ymd_h("2020-01-01-00"), ymd_h("2021-03-31-23"), by = "hours"))
game1 <- sort(round(runif(length(date_), 10, 50), 0), decreasing = TRUE)
game2 <- sort(round(runif(length(date_), 20, 100), 0), decreasing = TRUE)
game3 <- sort(round(runif(length(date_), 30, 150), 0), decreasing = TRUE)
game4 <- sort(round(runif(length(date_), 40, 200), 0), decreasing = TRUE)
game_data <- data.frame(date_, hour_, game1, game2, game3, game4)
I just want to subset game_data to get all the last 6 months data. How do i get it?
Last 6 months of data from the max date in the data ?
You can try -
library(lubridate)
library(dplyr)
result <- subset(game_data, date_ > max(date_) %m-% months(6))
Or with dplyr -
result <- game_data %>% filter(date_ > max(date_) %m-% months(6))
This question already has answers here:
R: How to filter/subset a sequence of dates
(4 answers)
Closed 3 years ago.
I only want to select the data between 2015-1 and 2018-1.
usefuldata1 <-usefuldata[usefuldata$zeit > "2014 - 12",]
usefuldata2 <-usefuldata1[usefuldata1$zeit < "2018-2",]
Bigger than 2014 works. However, smaller than 2018 does not.
I get the same number as before.
My data looks as follows:
x <- sample(letters, size = 30, replace = T)
y <- paste(sample(2010:2020, size = 30, replace = T),
sample(1:12, size = 30, replace = T), sep = "-")
df <- data.frame(name = x, date = y)
There is a very userfull type in R called Date, by experience about 90% of all problems about dates can be solved by that.
see: ?as.Date for help
In your case, the first thing to do is format y as a Date type adding the days.
x <- sample(letters, size = 30, replace = T)
y <- paste(sample(2010:2020, size = 30, replace = T),
sample(1:12, size = 30, replace = T),
"01", sep = "-") # <- this is the day (01) for all.
Then we have to format it as a Date type.
y = as.Date(y, format = "%Y-%m-%d") # year complete (%Y), month (%m) and day (%d)
df <- data.frame(name = x, date = y)
And subset them.
dateIndex = which(y > "2014-12-31" & y < "2018-01-01")
subset_df = df[dateIndex,]
I hope this helps you.
I have a data frame:
MS_NR SS_NR DATE HOUR VALUE
1 13095010 68 1/01/2014 0:00:00 9,8
2 13095010 68 1/01/2014 1:00:00 8,0
3 13095010 68 1/01/2014 2:00:00 NA
4 13095010 68 1/01/2014 3:00:00 7,5
5 13095010 68 1/01/2014 4:00:00 7,0
6 13095010 68 1/01/2014 5:00:00 8,5
are temperature observations of a weather station taken every hour, I want to calculate the daily, weekly, monthly and annual averages of several data frames of different weather stations. How can I do this within a loop, so that the process is not repetitive?
When working with hydro-meteorological data, I usually use xts and hydroTSM packages as they have many functions for data aggregation.
You didn't provide any data so I created one for demonstration purpose
library(xts)
library(hydroTSM)
# Generate random data
set.seed(2018)
date = seq(from = as.Date("2016-01-01"), to = as.Date("2018-12-31"),
by = "days")
temperature = runif(length(date), -15, 35)
dat <- data.frame(date, temperature)
# Convert to xts object for xts & hydroTSM functions
dat_xts <- xts(dat[, -1], order.by = dat$date)
# All daily, monthly & annual series in one plot
hydroplot(dat_xts, pfreq = "dma", var.type = "Temperature")
# Weekly average
dat_weekly <- apply.weekly(dat_xts, FUN = mean)
plot(dat_weekly)
# Monthly average
dat_monthly <- daily2monthly(dat_xts, FUN = mean, na.rm = TRUE)
plot.zoo(dat_monthly, xaxt = "n", xlab = "")
axis.Date(1, at = pretty(index(dat_monthly)),
labels = format(pretty(index(dat_monthly)), format = "%b-%Y"),
las = 1, cex.axis = 1.1)
# Seasonal average: need to specify the months
dat_seasonal <- dm2seasonal(dat_xts, season = "DJF", FUN = mean, na.rm = TRUE)
plot(dat_seasonal)
# Annual average
dat_annual <- daily2annual(dat_xts, FUN = mean, na.rm = TRUE)
plot(dat_annual)
Edit: using OP's data
df <- readr::read_csv2("Temp_2014_Hour.csv")
str(df)
# Convert DATE to Date object & put in a new column
df$date <- as.Date(df$DATE, format = "%d/%m/%Y")
dat <- df[, c("date", "VALUE")]
str(dat)
dat_xts <- xts(dat[, -1], order.by = dat$date)
Created on 2018-02-28 by the reprex package (v0.2.0).
I try this
first using read.table load the file
library(openair)
Temp <- read.table (file, header=TRUE, sep=";",stringsAsFactors = FALSE, dec = ",", na.strings = "NA")
tiempos <- Temp$HOUR
timestamps <- as.POSIXlt(as.POSIXct('1900-1-1', tz='UTC')
+ as.difftime(as.character(tiempos))
time <- format(timestamps, format='%H:%M:%S')
date<-paste(Temp[,3], time, sep=" ")
date
Temp_met <- cbind(date, CovTemp[-c(3,4)])
Temp_met$date <- as.POSIXct(strptime(Met_CovTemp$date,
format = "%d/%m/%Y %H:%M", "GMT"))
## daily mean
Temp_daily <- timeAverage(Met_CovTemp, avg.time = "day")
## weekly mean
Temp_week <- timeAverage(Met_CovTemp, avg.time = "week")
## monthly mean
Temp_month <- timeAverage(Met_CovTemp, avg.time = "month")
## annual mean
Temp_annual <- timeAverage(Met_CovTemp, avg.time = "year")
I have a spreadsheet documenting prices of 40 similar products at various dates. It looks like this.
date_1<-seq(as.Date("2010-01-01"), as.Date("2011-01-01"), length.out = 40)
date_2<-seq(as.Date("2011-01-01"), as.Date("2012-01-01"), length.out = 40)
date_3<-seq(as.Date("2012-01-01"), as.Date("2013-01-01"), length.out = 40)
date_4<-seq(as.Date("2013-01-01"), as.Date("2014-01-01"), length.out = 40)
date_5<-seq(as.Date("2014-01-01"), as.Date("2015-01-01"), length.out = 40)
date_6<-seq(as.Date("2015-01-01"), as.Date("2016-01-01"), length.out = 40)
price_1<-floor(seq(20, 50, length.out = 40))
price_2<-floor(seq(20, 60, length.out = 40))
price_3<-floor(seq(20, 70, length.out = 40))
price_4<-floor(seq(30, 80, length.out = 40))
price_5<-floor(seq(40, 100, length.out = 40))
price_6<-floor(seq(50, 130, length.out = 40))
data.frame(date_1,price_1,date_2,price_2,date_3,price_3,date_4,price_4,date_5,price_5,date_6,price_6)
The problem is, the columns representing dates and prices alternate (convenient for record keeping). How can I transform the above data to a new dataframe consisting solely of prices of these 40 products as rows, with dates as column names? This will generate a lot of NA's because the dates in each column differ but that's OK.
When working with time series data it is often helpful to have it in long form (one row per observation), even if your target output is wide (one row per time series). Here are three possible approaches to get it into long form, then widen:
1. base reshape()
To get long form, base reshape is definitely a powerful option. The following solution improves on the accepted solution because it works for any numbers of products and observations and eliminates an unnecessary step:
df <- data.frame(date_1,price_1,date_2,price_2,date_3,price_3,
date_4,price_4,date_5,price_5,date_6,price_6)
# no need to create an id variable
long_form <- reshape(df, # idvar="id" by default
varying = list(grep('date_',names(df), value=TRUE),
grep('price_',names(df), value=TRUE) ),
v.names=c("date","price"),
direction="long",
sep="_")
And reshape can also widen it. (We'll use spread in another approach below.)
wide_form <- reshape(long_form, drop='time', timevar='date', direction='wide')
2. data.table melt() and dcast() (likely faster on real dataset)
Make sure you have data.table v1.9.6 or later, which allows you to melt multiple columns.
library(data.table)
setDT(df)
melt.data.table(df[, prod_id := .I], # product id = original row number
measure.vars = list(grep('date_',names(df), value=TRUE),
grep('price_',names(df), value=TRUE) ),
variable.name = 'sequence',
value.name = c('date','price'),
id.vars = 'prod_id') -> long_form
In this case you don't use the sequence, so to get wide form is just:
dcast.data.table(long_form[, !'sequence', with=FALSE],
value.var = 'price', # optional (function guesses correctly)
prod_id ~ date) -> wide_form
3. tidyr & dplyr split-apply-combine (easy to understand)
It doesn't require the mental gymnastics that reshape does (at least for me). It is a column-wise variant on the "split-apply-combine" paradigm.
library(dplyr); library(tidyr)
# Create long-form time series data
# Split table into sequenced prices and dates, then combine on product and sequence
full_join(
df %>%
select(starts_with('date_')) %>% #~~~~ Left side = date component ~~~~~~~~
mutate(prod_id = 1:nrow(df)) %>% #~ product id = original row number ~
gather(sequence, date, -prod_id) %>% #~ long form = 1 row per prod per seq ~
mutate(sequence = #~~~ Cols: product_id, sequence, date ~~~
sub('^date_(\\d+)$', '\\1', sequence) ) ,
df %>%
select(starts_with('price_')) %>% #~~~ Right side = price component ~~~~~~~
mutate(prod_id = 1:nrow(df)) %>% #~ ~
gather(sequence, price, -prod_id) %>% #~ same idea ~
mutate(sequence = #~~ Cols: product_id, sequence, price ~~~
sub('^price_(\\d+)$', '\\1', sequence) )
) -> long_form
In this case you don't need the sequence, so to get to wide form it's simply:
long_form %>% select(-sequence) %>% spread(date, price) -> wide_form
as noted by others above.
Here is one way I came up with using dplyr/tidyr packages:
library(tidyr)
library(dplyr)
date_1<-seq(as.Date("2010-01-01"), as.Date("2011-01-01"), length.out = 40)
date_2<-seq(as.Date("2011-01-01"), as.Date("2012-01-01"), length.out = 40)
date_3<-seq(as.Date("2012-01-01"), as.Date("2013-01-01"), length.out = 40)
date_4<-seq(as.Date("2013-01-01"), as.Date("2014-01-01"), length.out = 40)
date_5<-seq(as.Date("2014-01-01"), as.Date("2015-01-01"), length.out = 40)
date_6<-seq(as.Date("2015-01-01"), as.Date("2016-01-01"), length.out = 40)
price_1<-floor(seq(20, 50, length.out = 40))
price_2<-floor(seq(20, 60, length.out = 40))
price_3<-floor(seq(20, 70, length.out = 40))
price_4<-floor(seq(30, 80, length.out = 40))
price_5<-floor(seq(40, 100, length.out = 40))
price_6<-floor(seq(50, 130, length.out = 40))
df <- data.frame(date_1,price_1,date_2,price_2,date_3,price_3,date_4,price_4,date_5,price_5,date_6,price_6)
dates <- df[, grep('date', names(df))]
dates <- dates %>% gather(date_type, date) %>% select(-date_type)
prices <- df[, grep('price', names(df))]
prices <- prices %>% gather(price_type, price) %>% select(-price_type)
df <- cbind(dates, prices)
Then, to spread dates to columns and prices to rows, you can do something like this:
df <- arrange(df, price)
df <- spread(df, date, price)
Using baseR and tidyr you could do:
library(tidyr)
#add an id to identify the products
df$id=1:40
#transform the data to a long format
long_data <- reshape(df,idvar="id",varying=list(paste0("date_",1:6),paste0("price_",1:6)),v.names=c("date","price"),direction="long",sep="_")
long_data <- long_data[,!grepl("time",colnames(long_data))]
#put it back to a wide format
wide_data <- spread(long_data,date,price)
I am trying to get the total precipitation values for every hour from a personal weather station I have using the weatherData package. The problem I have is that the data is collected every five minutes and the values repeat themselves until there is a change in precipitation value. I have tried the 'duplicated' function but I get a large number of data removed when there is no precipitation which makes it hard for me to get a summary of the hourly precipitation.
Please see code below
## Load required libraries
library(weatherData)
library(ggplot2)
library(scales)
library(plyr)
library(reshape2)
library(gridExtra)
library(lubridate)
library(weathermetrics)
library(zoo)
# Get data for PWS using weatherData package
pws <- getWeatherForDate("IPENANGB2", "2014-09-01","2014-09-30", station_type = "id",opt_detailed=T, opt_custom_columns=T, custom_columns=c(1,2,6,7,10))
# Rename columns
colnames(pws)<-c("time","time1","tempc","wdd","wspd","prcp")
## Adding date columns
pws$time<-as.POSIXct(pws$time1,format="%Y-%m-%d %H:%M:%S",tz="Australia/Perth")
pws$year <- as.numeric(format(pws$time,"%Y"))
pws$date <-as.Date(pws$time,format="%Y-%m-%d",tz="Australia/Perth")
pws$year <- as.numeric(as.POSIXlt(pws$date)$year+1900)
pws$month <- as.numeric(as.POSIXlt(pws$date)$mon+1)
pws$monthf <- factor(pws$month,levels=as.character(1:12),labels=c("Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"),ordered=TRUE)
pws$weekday <- as.POSIXlt(pws$date)$wday
pws$weekdayf <- factor(pws$weekday,levels=rev(0:6),labels=rev(c("Mon","Tue","Wed","Thu","Fri","Sat","Sun")),ordered=TRUE)
pws$yearmonth <- as.yearmon(pws$date)
pws$yearmonthf <- factor(pws$yearmonth)
pws$week <- as.numeric(format(as.Date(pws$date),"%W"))
pws$weekf<- factor(pws$week)
pws$jday<-yday(pws$date)
pws$hour <- as.numeric(format(strptime(pws$time, format = "%Y-%m-%d %H:%M"),format = "%H"))
pws$min <- as.numeric(format(strptime(pws$time, format = "%Y-%m-%d %H:%M"),format = "%M"))
# Remove duplicate values
pws.df <- pws[!duplicated(pws$prcp),]
Assuming you want to get hourly averages of tempc, wdd, wspd, prcp:
# used packages
library(weatherData)
library(lubridate)
library(dplyr)
library(stringr)
# read data
pws <- getWeatherForDate("IPENANGB2",
"2014-09-01",
"2014-09-30",
station_type = "id",
opt_detailed = T,
opt_custom_columns = T,
custom_columns = c(1, 2, 6, 7, 10))
# rename columns
colnames(pws) <- c("time", "time1", "tempc", "wdd", "wspd", "prcp")
# cleaning dataset and adding some columns
useful_pws <-
pws %>%
select(2:6) %>%
filter(!str_detect(time1, "<br>")) %>%
mutate(time1 = ymd_hms(time1),
year = year(time1),
month = month(time1),
day = day(time1),
hour = hour(time1)) %>%
tbl_df()
# summarising dataset
useful_pws %>%
select(-time1) %>%
group_by(year, month, day, hour) %>%
summarise(tempc = mean(tempc, na.rm = TRUE),
wdd = mean(wdd, na.rm = TRUE),
wspd = mean(wspd, na.rm = TRUE),
prcp = mean(prcp, na.rm = TRUE))